## Haralyzer Setup

Installs haralyzer and prettytable.

In [15]:
%pip install haralyzer
%pip install prettytable

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [16]:
from haralyzer import HarParser
import json
from statistics import mean, median
from prettytable import PrettyTable

## HAR File Analyzer

In [17]:
# Notebook-wide verbosity switch: the analysis helpers check it before
# printing their "Not Compatible" skip counters.
DEBUG = False # Set to True to see debug output

In [18]:
# Loads a HAR capture from disk and hands back the parser's view of it
def extract_data(har_file_name):
    """Read a HAR file and return the ``har_data`` attribute of a ``HarParser``.

    Args:
        har_file_name: Path of the HAR (JSON) file; it is opened as UTF-8 text.

    Returns:
        Whatever ``HarParser(<decoded JSON>).har_data`` evaluates to.
    """
    with open(har_file_name, 'r', encoding='utf-8') as handle:
        decoded_json = json.load(handle)

    return HarParser(decoded_json).har_data

In [19]:
# Returns the value this notebook reports as "time to first byte"
def get_ttfb(har_data):
    """Return ``onContentLoad`` from the first page's ``pageTimings``.

    NOTE(review): the function is named "ttfb" but the field it reads is
    ``onContentLoad``; confirm against the HAR specification that this is the
    metric the report is supposed to show.

    Args:
        har_data: Mapping with a "pages" sequence whose first item carries a
            "pageTimings" mapping.

    Returns:
        The ``onContentLoad`` value, or ``None`` when any of the expected
        keys/indices is missing or has the wrong type.
    """
    ttfb = None
    skipped = 0
    try:
        ttfb = har_data["pages"][0]['pageTimings']['onContentLoad']
    except (KeyError, IndexError, TypeError):
        # Only structural problems in the HAR count as "not compatible";
        # unrelated errors (and KeyboardInterrupt) now propagate instead of
        # being silently swallowed by a bare except.
        skipped += 1
    if DEBUG: print(f"Not Compatible : {skipped}")
    return ttfb


In [20]:
# Returns the page load time
def get_page_load_time(har_data):
    """Return ``onLoad`` from the first page's ``pageTimings``.

    Args:
        har_data: Mapping with a "pages" sequence whose first item carries a
            "pageTimings" mapping.

    Returns:
        The ``onLoad`` value, or ``None`` when any of the expected
        keys/indices is missing or has the wrong type.
    """
    page_load_time = None
    skipped = 0
    try:
        page_load_time = har_data["pages"][0]['pageTimings']['onLoad']
    except (KeyError, IndexError, TypeError):
        # Only structural problems in the HAR count as "not compatible";
        # unrelated errors (and KeyboardInterrupt) now propagate instead of
        # being silently swallowed by a bare except.
        skipped += 1
    if DEBUG: print(f"Not Compatible : {skipped}")
    return page_load_time

In [21]:
# Returns number of GET requests
def request_stats(har_data):
    """Count the entries whose request method is exactly "GET".

    Args:
        har_data: Mapping with an "entries" iterable; each entry is expected
            to look like ``{"request": {"method": ...}}``.

    Returns:
        Number of entries with ``entry["request"]["method"] == "GET"``.
        Entries lacking those keys are skipped (and tallied in the DEBUG
        message) rather than counted.

    Raises:
        KeyError: If ``har_data`` has no "entries" key.
    """
    num_get_requests = 0
    skipped = 0
    for entry in har_data["entries"]:
        try:
            method = entry["request"]["method"]
        except (KeyError, TypeError):
            # Malformed entry: tolerate it, but do not hide unrelated errors
            # the way the previous bare except did.
            skipped += 1
            continue
        if method == "GET":
            num_get_requests += 1
    if DEBUG: print(f"Not Compatible : {skipped}")
    return num_get_requests


In [22]:
# Returns set of MIME types observed
def get_mime_types(har_data):
    """Collect the distinct ``response.content.mimeType`` values.

    Args:
        har_data: Mapping with an "entries" iterable; each entry is expected
            to look like ``{"response": {"content": {"mimeType": ...}}}``.

    Returns:
        A ``set`` of the MIME type values found. Entries lacking those keys
        (or carrying an unhashable value) are skipped and tallied in the
        DEBUG message.

    Raises:
        KeyError: If ``har_data`` has no "entries" key.
    """
    mime_types = set()
    skipped = 0
    for entry in har_data["entries"]:
        try:
            mime_types.add(entry["response"]["content"]["mimeType"])
        except (KeyError, TypeError):
            # Malformed entry: tolerate it, but do not hide unrelated errors
            # the way the previous bare except did.
            skipped += 1
    if DEBUG: print(f"Not Compatible : {skipped}")
    return mime_types

In [23]:
# Returns the mean, median and total size of images
def analyze_image_data(har_data):
    """Summarise the ``content.size`` of every image response.

    An entry counts as an image when 'image' is one of the '/'-separated
    parts of its ``response.content.mimeType``.

    Args:
        har_data: Mapping with an "entries" iterable; each entry is expected
            to look like
            ``{"response": {"content": {"size": ..., "mimeType": ...}}}``.

    Returns:
        ``(mean_image_size, median_image_size, total_image_size)``.
        When no image entry is found all three are 0 (the statistics
        functions would otherwise raise ``StatisticsError`` on empty data).

    Raises:
        KeyError: If ``har_data`` has no "entries" key.
    """
    image_sizes = []
    skipped = 0
    for entry in har_data["entries"]:
        try:
            content = entry["response"]["content"]
            size = content["size"]
            is_image = 'image' in content["mimeType"].split('/')
        except (KeyError, TypeError, AttributeError):
            # Malformed entry (missing key, non-mapping, non-string MIME
            # type): skip it without masking unrelated errors.
            skipped += 1
            continue
        if is_image:
            image_sizes.append(size)
    if DEBUG: print(f"Not Compatible : {skipped}")

    if not image_sizes:
        # A page without images is valid input, not an error.
        return 0, 0, 0

    return mean(image_sizes), median(image_sizes), sum(image_sizes)

In [24]:
# Returns per-category response counts, per-category sizes and the total size
def response_stats(har_data):
    """Tally responses by content category.

    The categories are 'image', 'html', 'css' and 'javascript'. A response
    belongs to a category when the category name equals one of the
    '/'-separated parts of its MIME type. Any parameters after ';'
    (e.g. ``text/javascript; charset=utf-8``) are stripped first, since they
    would otherwise stop the subtype from matching.

    Args:
        har_data: Mapping with an "entries" iterable; each entry is expected
            to look like
            ``{"response": {"content": {"size": ..., "mimeType": ...}}}``.

    Returns:
        ``(count_map, size_map, total_size)`` where ``count_map`` maps each
        category to its number of responses, ``size_map`` maps each category
        to the summed ``content.size``, and ``total_size`` is the summed
        ``content.size`` of every entry that has a usable size (including
        entries whose MIME type is missing).

    Raises:
        KeyError: If ``har_data`` has no "entries" key.
    """
    keys = ['image', 'html', 'css', 'javascript']
    count_map = dict.fromkeys(keys, 0)
    size_map = dict.fromkeys(keys, 0)

    total_size = 0
    skipped = 0
    for entry in har_data["entries"]:
        try:
            content = entry["response"]["content"]
            size = content["size"]
            # Added before the MIME lookup so that entries without a MIME
            # type still contribute to the overall size.
            total_size += size
            # Drop "; charset=..." style parameters before matching.
            media_type = content["mimeType"].split(';', 1)[0].strip()
            for part in media_type.split('/'):
                if part in count_map:
                    count_map[part] += 1
                    size_map[part] += size
        except (KeyError, TypeError, AttributeError):
            # Malformed entry: skip it without masking unrelated errors.
            skipped += 1
    if DEBUG: print(f"Not Compatible : {skipped}")
    return count_map, size_map, total_size

In [25]:
# HAR files and their respective paths
# Directory holding the captured HAR files; edit this one value to analyse
# a different capture set.
HAR_DIR = 'pcapture/websites/har_files_full'

# Maps each HAR file name to its path under HAR_DIR. Insertion order is the
# order in which the files are analysed and listed in the report table.
har_files = {
    name: f'{HAR_DIR}/{name}'
    for name in ('decaan.har', 'jagran.har', 'mit.har', 'sinu.har', 'usach.har')
}

In [26]:
# One empty result record per HAR file; the analysis loop fills these in.
har_analysis = {file_name: {} for file_name in har_files}


In [27]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Run every analysis helper on each HAR file and store the results in
# har_analysis[file_name].
for file_name, file in har_files.items(): # iterating over all the har files
    har_data = extract_data(file)

    ## TTFB and Page Load Time
    ttfb = get_ttfb(har_data)
    page_load_time = get_page_load_time(har_data)

    ## GET Requests
    num_get_requests = request_stats(har_data)

    ## Mime Types
    mime_types = get_mime_types(har_data)

    ## Response Stats
    count_map, size_map, total_size = response_stats(har_data)

    ## Image Data Analysis
    mean_image_size, median_image_size, total_image_size = analyze_image_data(har_data)

    # setdefault keeps this cell working even if the record for this file was
    # not pre-created; the ratios are guarded so a capture with no GET
    # requests or zero total size does not abort the run with
    # ZeroDivisionError.
    har_analysis.setdefault(file_name, {}).update({
        'ttfb': ttfb,
        'page_load_time': page_load_time,
        'total_get_request': num_get_requests,
        'fraction_get_image': _safe_ratio(count_map['image'], num_get_requests),
        'fraction_get_javascript': _safe_ratio(count_map['javascript'], num_get_requests),
        'fraction_get_htmlcss': _safe_ratio(count_map['html'] + count_map['css'], num_get_requests),
        'total_size': total_size,
        'size_fraction_image': _safe_ratio(total_image_size, total_size),
        'mean_image_size': mean_image_size,
        'median_image_size': median_image_size,
    })

In [28]:
# Render the collected metrics as a text table: one row per HAR file,
# one column per metric.
table = PrettyTable()

# Column headers, in the same order as the values placed in each row
table.field_names = [
    "File Name",
    "TTFB",
    "Page Load Time",
    "Total GET Requests",
    "Fraction GET Image",
    "Fraction GET JavaScript",
    "Fraction GET HTML/CSS",
    "Total Size",
    "Size Fraction Image",
    "Mean Image Size",
    "Median Image Size",
]

# One row per analysed file: its name followed by the stored metrics
for file_name, data in har_analysis.items():
    row = [file_name]
    row.extend(
        data[key]
        for key in (
            'ttfb',
            'page_load_time',
            'total_get_request',
            'fraction_get_image',
            'fraction_get_javascript',
            'fraction_get_htmlcss',
            'total_size',
            'size_fraction_image',
            'mean_image_size',
            'median_image_size',
        )
    )
    table.add_row(row)

# Show the finished table
print(table)

+------------+--------------------+--------------------+--------------------+---------------------+-------------------------+-----------------------+------------+---------------------+--------------------+-------------------+
| File Name  |        TTFB        |   Page Load Time   | Total GET Requests |  Fraction GET Image | Fraction GET JavaScript | Fraction GET HTML/CSS | Total Size | Size Fraction Image |  Mean Image Size   | Median Image Size |
+------------+--------------------+--------------------+--------------------+---------------------+-------------------------+-----------------------+------------+---------------------+--------------------+-------------------+
| decaan.har | 3185.747999988962  | 8241.801000025589  |        252         | 0.42063492063492064 |   0.25396825396825395   |  0.13095238095238096  |  13494632  | 0.21352045761603577 | 27182.830188679247 |      13030.0      |
| jagran.har | 939.9790000170469  | 14006.684000021778 |        481         | 0.3825363825363825