In [None]:
# PoMiASI Project
# This notebook analyzes data exported to a CSV file,
# where each line represents a single HTTP request and its corresponding response.
#
# Scope of analysis:
# - Total page load time
# - Average object download time
# - Number of concurrent downloads over time
# - Time gaps between downloads within a single TCP connection (client_port)
# - Gantt chart visualizing file downloads over time
#
# Input columns (read from the CSV file):
# first_timestamp_ms, last_timestamp_ms, duration_ms, total_bytes, request_uri, client_port, request_user_agent


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns

# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign
# (presumably to avoid missing-glyph boxes with fonts that lack U+2212).
plt.rcParams['axes.unicode_minus'] = False


In [None]:
# Read the exported capture and derive the typed columns used by the analysis cells.
csv_path = "/content/output.csv"

# Fail early with an explicit message when the export is not where we expect it.
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"File not found: {csv_path}")

# Columns the analysis cannot run without; reported in this order when absent.
required = [
    'first_timestamp_ms',
    'last_timestamp_ms',
    'duration_ms',
    'total_bytes',
    'request_uri',
    'client_port',
]
missing = list(filter(lambda name: name not in df.columns, required))
if len(missing) > 0:
    raise ValueError(f"Missing columns in CSV: {', '.join(missing)}")

# Millisecond epoch values -> timezone-aware (UTC) timestamps in new columns.
for _ts_name in ('first_timestamp', 'last_timestamp'):
    df[_ts_name] = pd.to_datetime(df[_ts_name + '_ms'], unit='ms', utc=True)

# Durations are used in means and plots, so force a float dtype.
df['duration_ms'] = df['duration_ms'].astype(float)

df.head()


In [None]:
# Automatically compare browsers when more than one is detected

def get_browser(user_agent):
    """Classify a User-Agent value into a coarse browser family.

    Matching is case-insensitive and based on substrings. Order matters:
    Opera and Edge user agents also contain "Chrome", so they are checked
    before the Chrome test.

    Parameters
    ----------
    user_agent : Any
        The raw User-Agent value. Missing values (NaN/None) are accepted;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        One of "Unknown" (missing value), "Firefox", "Opera", "Edge",
        "Chrome" or "Other".
    """
    if pd.isna(user_agent):
        return "Unknown"

    # str() keeps a non-string cell (e.g. a number parsed by pandas) from
    # raising AttributeError on .lower().
    ua = str(user_agent).lower()

    if "firefox" in ua:
        return "Firefox"
    if "opr/" in ua or "opera" in ua:
        return "Opera"
    # Chromium-based Edge sends "Edg/<version>", legacy Edge "Edge/<version>";
    # both also carry a "Chrome/" token and would otherwise be counted as Chrome.
    if "edg/" in ua or "edge/" in ua:
        return "Edge"
    if "chrome" in ua:
        return "Chrome"
    return "Other"

# Tag every request with a browser family. 'request_user_agent' is not among the
# columns validated when the CSV is loaded, so fall back to "Unknown" instead of
# failing with a KeyError when the export does not contain it.
if 'request_user_agent' in df.columns:
    df['browser'] = df['request_user_agent'].apply(get_browser)
else:
    df['browser'] = "Unknown"
browser_counts = df['browser'].value_counts()

print("Detected browsers:")
display(browser_counts)

if len(browser_counts) > 1:
    # Mean download time per browser, in order of first appearance in the data.
    # Aggregating with pandas and drawing with matplotlib avoids seaborn's
    # deprecated `ci=` argument and the deprecated `palette=` without `hue=`.
    mean_duration = df.groupby('browser', sort=False)['duration_ms'].mean()
    plt.figure(figsize=(8, 5))
    plt.bar(
        mean_duration.index,
        mean_duration.values,
        color=sns.color_palette('Set2', len(mean_duration)),
    )
    plt.title('Average download time per browser')
    plt.xlabel('Browser')
    plt.ylabel('Average duration [ms]')
    plt.tight_layout()
    plt.show()

    # One concurrency curve per browser: sample that browser's capture window on
    # a regular grid and count the requests in flight at each sample.
    plt.figure(figsize=(10, 5))
    for browser_name in df['browser'].unique():
        sub = df[df['browser'] == browser_name]
        # Coarser grid for larger subsets to keep the row-by-sample scan cheap.
        freq = '100ms' if len(sub) > 500 else '10ms'
        ts_all = pd.date_range(sub['first_timestamp'].min(), sub['last_timestamp'].max(), freq=freq)
        # A request counts as active at t when first_timestamp <= t <= last_timestamp.
        active_counts = [((sub['first_timestamp'] <= t) & (sub['last_timestamp'] >= t)).sum() for t in ts_all]
        plt.plot(ts_all, active_counts, label=browser_name)
    plt.legend()
    plt.title('Download concurrency by browser')
    plt.xlabel('Time')
    plt.ylabel('Active downloads')
    plt.tight_layout()
    plt.show()
else:
    # The original message contained a mis-encoded em dash; restored here.
    print(" Only one browser detected — comparison skipped.")


In [None]:
# Headline numbers for the whole capture.
# Page load time spans from the earliest request start to the latest response end.
_load_start = df['first_timestamp'].min()
_load_end = df['last_timestamp'].max()
total_time_ms = (_load_end - _load_start).total_seconds() * 1000

avg_duration_ms = df['duration_ms'].mean()
total_bytes_sum = df['total_bytes'].sum()

_summary_lines = [
    f"Total page load time: {total_time_ms:.1f} ms",
    f"Average object download time: {avg_duration_ms:.1f} ms",
    f"Total data transferred: {total_bytes_sum/1024/1024:.2f} MB",
    f"Objects analyzed: {len(df)}",
]
print("\n".join(_summary_lines))


In [None]:
# One horizontal bar per request (one row of df each), in row order.
# Bars are placed at integer positions and labelled afterwards: passing the URI
# strings directly as y-values puts repeated URIs on the same categorical slot,
# so their bars would be drawn on top of each other.
bar_positions = np.arange(len(df))
# str() so a missing URI (NaN) becomes a printable label instead of an error.
uri_labels = df['request_uri'].astype(str)

# Grow the figure with the number of rows so tick labels stay legible
# (never smaller than the original 10x6).
plt.figure(figsize=(10, max(6, len(df) * 0.15)))
plt.barh(bar_positions, df['duration_ms'], color='steelblue')
plt.yticks(bar_positions, uri_labels)
plt.xlabel('Download time [ms]')
plt.ylabel('Object (URI)')
plt.title('Download time per object')
plt.tight_layout()
plt.show()


In [None]:
# Sample the capture window on a regular grid (coarser when there are more than
# 500 rows) and count how many requests are in flight at each sample.
freq = '10ms'
if len(df) > 500:
    freq = '100ms'
ts_all = pd.date_range(
    start=df['first_timestamp'].min(),
    end=df['last_timestamp'].max(),
    freq=freq,
)

active_counts = []
for _sample in ts_all:
    # Active at the sample instant: already started and not yet finished (inclusive bounds).
    _in_flight = df['first_timestamp'].le(_sample) & df['last_timestamp'].ge(_sample)
    active_counts.append(_in_flight.sum())

plt.figure(figsize=(10, 5))
plt.plot(ts_all, active_counts, color='darkorange')
plt.title('Download concurrency over time')
plt.xlabel('Time')
plt.ylabel('Active downloads')
plt.tight_layout()
plt.show()


In [None]:
# Idle time between consecutive requests that share a client_port: for each port,
# requests are ordered by start time and each start is compared with the end of
# the request before it.
gaps = []
_by_start = df.sort_values('first_timestamp')
for port, group in _by_start.groupby('client_port'):
    ends = group['last_timestamp'].shift(1)
    starts = group['first_timestamp']
    gap = (starts - ends).dt.total_seconds() * 1000
    # Keep strictly positive gaps only. The comparison is False for NaN, so the
    # first request of each port (no predecessor) is excluded as well, as are
    # requests that start before the previous one ended.
    gaps.extend(gap[gap > 0])

plt.figure(figsize=(8, 5))
plt.hist(gaps, bins=50, color='seagreen')
plt.title('Time gaps between downloads (same connection)')
plt.xlabel('Gap length [ms]')
plt.ylabel('Occurrences')
plt.tight_layout()
plt.show()


In [None]:
# Gantt-style view: one horizontal line per request, ordered by start time.
gantt_df = df[['request_uri', 'first_timestamp', 'last_timestamp']].sort_values('first_timestamp').reset_index(drop=True)


def _short_uri_label(uri, max_len=40):
    """Return a compact tick label for a request URI.

    Uses the last path segment, truncated to ``max_len`` characters. Trailing
    slashes are ignored so that "/img/" yields "img"; when nothing is left
    (e.g. the root document "/"), the URI itself is used so the label is never
    empty. Non-string values (such as NaN) are converted with ``str()``.
    """
    text = str(uri)
    last_segment = text.rstrip('/').split('/')[-1]
    return (last_segment or text)[:max_len]


# Height grows with the number of requests so rows do not collapse onto each other.
plt.figure(figsize=(12, max(4, len(gantt_df) * 0.15)))
ax = plt.gca()
for i, row in gantt_df.iterrows():
    # After reset_index the row label i is also the row's vertical position.
    ax.hlines(y=i, xmin=row['first_timestamp'], xmax=row['last_timestamp'], color='royalblue', linewidth=2)

ax.set_yticks(range(len(gantt_df)))
ax.set_yticklabels([_short_uri_label(uri) for uri in gantt_df['request_uri']])
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S.%f'))
plt.title('Gantt chart of downloads')
plt.xlabel('Time')
plt.ylabel('Object (URI)')
plt.tight_layout()
plt.show()
