In [None]:
# PoMiASI Project
# This notebook analyzes data exported to a CSV file,
# where each line represents a single HTTP request and its corresponding response.
#
# Scope of analysis:
# - Total page load time
# - Average object download time
# - Number of concurrent downloads over time
# - Time gaps between downloads within a single TCP connection (client_port)
# - Gantt chart visualizing file downloads over time
#
# Input data from CSV:
# first_timestamp_ms, last_timestamp_ms, duration_ms, total_bytes, request_uri, client_port, request_user_agent
#
# ------------------------------------------------------------
# CHANGELOG – Version 2.0
#
# - Added trimming of the two shortest downloads per browser × attempt.
# - Added detailed download time statistics (mean, median, min, max).
# - Added percentage distribution of concurrent downloads.
# - Added full statistics for time gaps (mean, median, min, max).
# - Added "Active downloads over time" chart (t = 0 at the first packet, limited to the first 25 s).
# - Reworked Gantt chart: grouped by client_port
# - Added cross-browser comparison summary.
# - Introduced support for multiple attempts (runs) via `attempt` column.
# - Removed Seaborn — all plots now use clean Matplotlib.
# ------------------------------------------------------------


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

# Notebook-wide Matplotlib defaults, applied in one call.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'figure.figsize': (10, 5),
})

In [None]:
# ---------------------------------------------------------
# Load the CSV export
# ---------------------------------------------------------

csv_path = "/content/output.csv"

# Fail early with a readable message when the export is missing.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"File not found: {csv_path}")

df = pd.read_csv(csv_path)

# Columns that must be present before any analysis is attempted.
required = [
    'first_timestamp_ms', 'last_timestamp_ms', 'duration_ms',
    'total_bytes', 'request_uri', 'client_port',
]
missing = [name for name in required if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing columns in CSV: {', '.join(missing)}")

# Convert the epoch-millisecond columns to timezone-aware (UTC) datetimes.
for edge in ('first', 'last'):
    df[f'{edge}_timestamp'] = pd.to_datetime(df[f'{edge}_timestamp_ms'], unit='ms', utc=True)

# Durations are used in float arithmetic below.
df['duration_ms'] = df['duration_ms'].astype(float)


In [None]:
# Identify the browser from the User-Agent string
def get_browser(user_agent):
    """Map a User-Agent string to a coarse browser label.

    Returns "Unknown" when the value is missing (per ``pd.isna``). Otherwise the
    lower-cased string is searched for known markers and the first matching label
    among "Firefox", "Opera" and "Chrome" is returned; "Other" if none match.
    """
    if pd.isna(user_agent):
        return "Unknown"

    lowered = user_agent.lower()
    # Order matters: Opera is checked before Chrome, so a UA that contains both
    # "opr/" and "chrome" is labelled Opera.
    rules = (
        ("Firefox", ("firefox",)),
        ("Opera", ("opr/", "opera")),
        ("Chrome", ("chrome",)),
    )
    for label, markers in rules:
        if any(marker in lowered for marker in markers):
            return label
    return "Other"

# Label every request with its browser; without a User-Agent column all rows are "Unknown".
if 'request_user_agent' in df.columns:
    df['browser'] = df['request_user_agent'].apply(get_browser)
else:
    df['browser'] = "Unknown"

# Keep an `attempt` column supplied by the CSV (multi-run exports); only fall
# back to a single attempt when the column is absent. Assigning 1 unconditionally
# would collapse every run into one.
if 'attempt' not in df.columns:
    df['attempt'] = 1


# Whole-file summary: span from the earliest first packet to the latest last packet,
# mean per-object duration, and total payload size.
total_time_ms = (df['last_timestamp'].max() - df['first_timestamp'].min()).total_seconds() * 1000
avg_duration_ms = df['duration_ms'].mean()
total_bytes_sum = df['total_bytes'].sum()

print(f"Total page load time: {total_time_ms:.1f} ms")
print(f"Average object download time: {avg_duration_ms:.1f} ms")
print(f"Total data transferred: {total_bytes_sum/1024/1024:.2f} MB")
print(f"Objects analyzed: {len(df)}")


In [None]:
# ---------------------------------------------------------
# 1. Drop the two shortest downloads
# ---------------------------------------------------------

def drop_two_shortest(group):
    """Return `group` without its two smallest `duration_ms` rows.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one group; must contain a `duration_ms` column.

    Returns
    -------
    pd.DataFrame
        The remaining rows sorted by ascending `duration_ms`. Groups with two
        rows or fewer yield an empty frame that keeps the input's columns and
        dtypes.
    """
    if len(group) <= 2:
        # Slice rather than build a fresh frame: a new DataFrame(columns=...)
        # would turn every column into `object` dtype.
        return group.iloc[0:0]
    return group.sort_values('duration_ms').iloc[2:]

# Trim each browser/attempt group by iterating over the groups explicitly.
# `groupby(...).apply(...)` on a frame that still contains the grouping columns
# is deprecated in recent pandas, and losing 'browser'/'attempt' from the result
# would break the grouping below.
trimmed_parts = [
    drop_two_shortest(group)
    for _, group in df.groupby(['browser', 'attempt'])
]
# pd.concat raises on an empty list, so fall back to an empty slice of df.
df_trimmed = pd.concat(trimmed_parts) if trimmed_parts else df.iloc[0:0]

# ---------------------------------------------------------
# 2. Stats: mean / median / min / max
# ---------------------------------------------------------

stats = (
    df_trimmed
    .groupby(['browser', 'attempt'])['duration_ms']
    .agg(['count', 'mean', 'median', 'min', 'max'])
    .reset_index()
)
print("\nDownload duration stats (trimmed 2 shortest):")
display(stats)


In [None]:
# ---------------------------------------------------------
# 3. Number of concurrent downloads over time (per browser)
# ---------------------------------------------------------

def active_downloads(df_sub, max_time_s=25, freq='10ms'):
    """Sample how many downloads are in progress at regular instants.

    Time zero is the earliest `first_timestamp` in `df_sub`. A download counts
    as active at instant t when `first_timestamp <= t <= last_timestamp`.

    Parameters
    ----------
    df_sub : pd.DataFrame
        Rows with `first_timestamp` and `last_timestamp` datetime columns.
    max_time_s : float, default 25
        Length of the sampled window in seconds, measured from time zero.
    freq : str, default '10ms'
        Sampling step, passed to `pd.date_range`.

    Returns
    -------
    (rel_time, active_counts)
        `rel_time` holds the sample instants in seconds since time zero and
        `active_counts` is a list with the number of active downloads at each
        instant. Both are empty when `df_sub` has no usable timestamps.
    """
    no_samples = (pd.Index([], dtype='float64'), [])
    if len(df_sub) == 0:
        return no_samples

    t0 = df_sub['first_timestamp'].min()
    tend = df_sub['last_timestamp'].max()
    if pd.isna(t0) or pd.isna(tend):
        return no_samples

    # Cap the grid at the requested window *before* building it, so a long
    # capture does not allocate samples that are thrown away.
    window_end = min(tend, t0 + pd.Timedelta(seconds=max_time_s))
    ts_window = pd.date_range(t0, window_end, freq=freq)
    rel_time = (ts_window - t0).total_seconds()

    starts = df_sub['first_timestamp']
    ends = df_sub['last_timestamp']
    active_counts = [((starts <= t) & (ends >= t)).sum() for t in ts_window]
    return rel_time, active_counts

# One step curve per browser, all on a shared axes.
fig, ax = plt.subplots()
for browser_name, browser_rows in df.groupby('browser'):
    rel_seconds, n_active = active_downloads(browser_rows)
    ax.step(rel_seconds, n_active, where='post', label=browser_name)

ax.set_title('Active downloads over time (=1st packet)')
ax.set_xlabel('Time [s] (25s)')
ax.set_ylabel('Active downloads')
ax.legend()
ax.set_xlim(0, 25)
fig.tight_layout()
plt.show()


In [None]:
# ---------------------------------------------------------
# 4. Percentage distribution of the number of concurrent downloads
# ---------------------------------------------------------

def active_distribution(df_sub):
    """Compute how much time was spent at each concurrency level.

    Every row contributes a +1 event at `first_timestamp` and a -1 event at
    `last_timestamp`. The sorted events are swept once, and the time between
    consecutive distinct instants is credited to the concurrency level that was
    in effect during it.

    Parameters
    ----------
    df_sub : pd.DataFrame
        Rows with `first_timestamp` and `last_timestamp` datetime columns.

    Returns
    -------
    pd.DataFrame
        Columns `active` (concurrency level), `dt` (seconds spent at that
        level) and `percent` (share of the summed `dt`). Empty, with the same
        columns, when there are no rows or no interval of positive length.
    """
    events = []
    for start, end in zip(df_sub['first_timestamp'], df_sub['last_timestamp']):
        events.append((start, +1))
        events.append((end, -1))
    events.sort()

    spans = []
    active = 0
    last_t = events[0][0] if events else None
    for t, delta in events:
        if t > last_t:
            spans.append({'active': active, 'dt': (t - last_t).total_seconds()})
        active += delta
        last_t = t

    if not spans:
        # No measurable time elapsed (empty input or only instantaneous
        # requests): return a typed empty frame instead of failing in groupby.
        return pd.DataFrame({
            'active': pd.Series(dtype='int64'),
            'dt': pd.Series(dtype='float64'),
            'percent': pd.Series(dtype='float64'),
        })

    dist = pd.DataFrame(spans).groupby('active')['dt'].sum().reset_index()
    total = dist['dt'].sum()
    dist['percent'] = 100 * dist['dt'] / total if total > 0 else 0
    return dist

# One distribution table per browser/attempt, tagged with its group keys.
dist_parts = [
    active_distribution(group).assign(browser=browser, attempt=attempt)
    for (browser, attempt), group in df.groupby(['browser', 'attempt'])
]

dist_all = pd.concat(dist_parts, ignore_index=True)
print("\nActive download distribution(% of time):")
display(dist_all)


In [None]:
# ---------------------------------------------------------
# 5. Time-gap statistics (between downloads on the same connection)
# ---------------------------------------------------------

# For each request, look up the end of the previous request on the same
# browser/attempt/client_port (in start order). The gap is the idle time between
# that end and this request's start. Only strictly positive gaps are kept; the
# first request of a connection has no predecessor and drops out as NaN.
df_by_start = df.sort_values('first_timestamp')
prev_end = (
    df_by_start
    .groupby(['browser', 'attempt', 'client_port'])['last_timestamp']
    .shift(1)
)
gap_ms = (df_by_start['first_timestamp'] - prev_end).dt.total_seconds() * 1000
is_gap = gap_ms > 0

# Built from the original columns so `gap_ms` stays numeric even when no gap exists.
gaps_df = (
    df_by_start.loc[is_gap, ['browser', 'attempt']]
    .assign(gap_ms=gap_ms[is_gap])
    .reset_index(drop=True)
)

gaps_stats = (
    gaps_df.groupby(['browser', 'attempt'])['gap_ms']
    .agg(['count', 'mean', 'median', 'min', 'max'])
    .reset_index()
)
print("\nTime gaps stats (per browser/attempt):")
display(gaps_stats)

# Histogram of all positive gaps, pooled across browsers and attempts.
plt.figure()
plt.hist(gaps_df['gap_ms'], bins=50)
plt.title('Time gaps between downloads (same connection)')
plt.xlabel('Gap length [ms]')
plt.ylabel('Number of occurrences')
plt.tight_layout()
plt.show()


In [None]:
# ---------------------------------------------------------
# 6. Gantt chart by client_port, bars coloured by order within the connection
# ---------------------------------------------------------

df_gantt = df.copy()
# 1-based position of each request within its connection, in start order.
df_gantt['order_in_port'] = (
    df_gantt.sort_values('first_timestamp')
    .groupby(['browser', 'attempt', 'client_port'])
    .cumcount() + 1
)

bar_height = 8      # thickness of one connection's row, in y units
row_step = 12       # vertical distance between consecutive rows
order_cmap = plt.get_cmap('tab10')  # colours repeat once the palette is exhausted

for (b, a), g in df_gantt.groupby(['browser', 'attempt']):
    ports = sorted(g['client_port'].unique())
    fig, ax = plt.subplots(figsize=(12, max(4, len(ports) * 0.3)))

    tick_positions = []
    for row_idx, port in enumerate(ports):
        y = 10 + row_idx * row_step
        gt = g[g['client_port'] == port].sort_values('first_timestamp')
        # broken_barh expects (start, width) pairs; Matplotlib date numbers are
        # in days, hence seconds / 86400 for the width.
        xranges = [
            (mdates.date2num(start), (end - start).total_seconds() / 86400)
            for start, end in zip(gt['first_timestamp'], gt['last_timestamp'])
        ]
        colors = [order_cmap((k - 1) % order_cmap.N) for k in gt['order_in_port']]
        # One call per connection instead of one per request.
        ax.broken_barh(xranges, (y, bar_height), facecolors=colors)
        tick_positions.append(y + bar_height / 2)

    # Label each row with its client port so rows can be identified.
    ax.set_yticks(tick_positions)
    ax.set_yticklabels([str(p) for p in ports])
    ax.set_title(f'Gantt by connection – {b}, attempt {a}')
    ax.set_xlabel('Time')
    ax.set_ylabel('client_port threads')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S.%f'))
    fig.tight_layout()
    plt.show()


In [None]:
# ---------------------------------------------------------
# 7. Download time per single object
# ---------------------------------------------------------

subset = df_trimmed.sort_values('duration_ms', ascending=False)

# Place bars at integer positions and attach the URIs as tick labels. Passing
# the URI strings as positions would put repeated URIs (e.g. the same object
# fetched by several browsers) on the same slot, so bars would overdraw.
y_positions = list(range(len(subset)))
# Grow the figure with the number of objects, within a bounded height.
fig_height = min(60, max(6, 0.25 * len(subset)))

fig, ax = plt.subplots(figsize=(10, fig_height))
ax.barh(y_positions, subset['duration_ms'])
ax.set_yticks(y_positions)
ax.set_yticklabels(subset['request_uri'].astype(str))
ax.set_xlabel('Download time [ms]')
ax.set_ylabel('Object (URI)')
ax.set_title('Download time per object (trimmed)')
fig.tight_layout()
plt.show()


In [None]:
# ---------------------------------------------------------
# 8. Comparison summary
# ---------------------------------------------------------

# Duration statistics per browser, computed on the trimmed data.
durations_by_browser = df_trimmed.groupby('browser')['duration_ms']
browser_cmp = durations_by_browser.agg(['count', 'mean', 'median', 'min', 'max']).reset_index()
print("\nCross-browser comparison:")
display(browser_cmp)

# Plain Matplotlib bar chart of the mean duration per browser.
fig, ax = plt.subplots()
ax.bar(browser_cmp['browser'], browser_cmp['mean'])
ax.set_xlabel('Browser')
ax.set_ylabel('Average duration [ms]')
ax.set_title('Average download time per browser')
fig.tight_layout()
plt.show()

print("\nWszystko executed fine.")
