In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and
# the old names were later removed, so pick whichever darkgrid name this
# matplotlib ships and fall back to the default style if neither exists.
plt.style.use(next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if name in plt.style.available),
    'default',
))
import numpy as np

# Stem (no directory, no extension) of the capture to analyse; also reused in plot titles.
csv_file = "2_5_fabricio_tiktok30minuse"

# Load ../csv_files/<csv_file>.csv, decoding the file as latin-1.
data = pd.read_csv(f"../csv_files/{csv_file}.csv", encoding="latin-1")

In [None]:
# Protocols excluded from the analysis.
ignored_protocols = ["MDNS", "ICMP", "DHCP", "NTP", "SSDP", "ARP"]

# Keep only rows that have a protocol and whose protocol is not ignored.
keep_mask = data["Protocol"].notna() & ~data["Protocol"].isin(ignored_protocols)

# Take an explicit copy: later cells add/overwrite columns on `data`, and doing
# that on a boolean-filtered slice can raise SettingWithCopyWarning.
data = data[keep_mask].copy()
data["Protocol"].value_counts()

In [None]:
# The most frequent value of the "Destination" column is used as `source`;
# find_host treats it as the address that is NOT the remote host.
# NOTE(review): presumably this is the capturing device's own address - confirm.
destination_counts = data["Destination"].value_counts()
source = destination_counts.idxmax()
source

In [None]:
# Display one packet row (position 12) with all of its columns.
data.iloc[12, :]

In [None]:
# Add a new "Host" column that labels each packet by the provider of its non-source address

def find_host(row, local_ip=None):
    """Classify the remote endpoint of one packet row into a provider label.

    Parameters
    ----------
    row : mapping
        Anything indexable by "Source" and "Destination" (e.g. a DataFrame row).
    local_ip : optional
        Address treated as the local endpoint. When omitted, the notebook-level
        `source` variable is used, so `data.apply(find_host, axis=1)` keeps working.

    Returns
    -------
    str
        One of "Google", "Amazon", "Apple", "Akamai", "Facebook" or "other".
    """
    if local_ip is None:
        # Fall back to the global computed in an earlier cell.
        local_ip = source

    # The remote endpoint is whichever side of the packet is not the local address.
    remote = row["Destination"] if row["Destination"] != local_ip else row["Source"]
    # Host names are case-insensitive, so normalise before substring matching.
    remote = str(remote).lower()

    # Order matters: the first provider with a matching keyword wins.
    provider_keywords = (
        ("Google", ("google", "gmail", "1e100", "app-measurement")),
        ("Amazon", ("amazon", "aws")),
        ("Apple", ("apple", "aaplimg")),  # 'apple' already covers 'apple-dns'
        ("Akamai", ("akamai", "akadns")),
        ("Facebook", ("facebook", "fbcdn", "instagram")),
    )
    for provider, keywords in provider_keywords:
        if any(keyword in remote for keyword in keywords):
            return provider
    return "other"

# Classify every packet row with find_host and store the labels in the "Host" column.
host_labels = data.apply(func=find_host, axis=1)
data["Host"] = host_labels
# Number of packets per provider label.
data["Host"].value_counts()


In [None]:
# Show the type of object that a row-wise apply of find_host returns.
type(data.apply(func=find_host, axis=1))

In [None]:
# For packets labelled 'other', count how often each "Source" value appears.
is_other = data['Host'] == 'other'
data.loc[is_other, 'Source'].value_counts()

Bucket time into 1-second or 5-second chunks

In [None]:
# Width of each time bucket, in seconds.
bucket_size = 5

# UNCOMMENT THE APPROPRIATE CODE FOR THE CSV FORMAT
# if saved as seconds, convert to integer seconds
data['Time'] = data['Time'].round().astype(int)

# if saved as datetime, convert to integer seconds since the first packet
# (positional .iloc[0] because filtering may have removed index label 0;
#  total_seconds() because .dt.seconds discards whole days)
# times = pd.to_datetime(data['Time'])
# data['Time'] = (times - times.iloc[0]).dt.total_seconds().round().astype(int)

# snap every timestamp to the nearest multiple of bucket_size (vectorised)
data['Time'] = (data['Time'] / bucket_size).round().astype(int) * bucket_size

# Build the bucket axis only AFTER bucketing, and extend the stop by one bucket:
# np.arange excludes its stop value, so without the extra step the last bucket
# would be missing from the axis and its packets dropped by the later reindex.
time_series = np.arange(0, data['Time'].max() + bucket_size, bucket_size)

data['Time']

In [None]:
# Split packet lengths by host: each series is aligned with `data` and holds the
# packet's "Length" where the row belongs to that host and 0 everywhere else.
host_order = ['Facebook', 'Google', 'Amazon', 'Apple', 'Akamai', 'other']
fb, goog, amz, aapl, ak, oth = [
    data['Length'].where(data['Host'] == host).fillna(0)
    for host in host_order
]

In [None]:
# plot packet length by time
host_columns = ['Facebook', 'Google', 'Amazon', 'Akamai', 'Apple', 'Other']

# One row per packet: its time bucket plus its length in the matching host column (0 elsewhere).
size_data = pd.DataFrame({'Time': data['Time'], 'Facebook': fb, 'Google': goog, 'Amazon': amz,
                          'Akamai': ak, 'Apple': aapl, 'Other': oth})
# total bytes per time bucket and host
size_data = size_data.groupby('Time', dropna=False, as_index=False).sum()
# add zeros to missing data
size_data = size_data.set_index('Time').reindex(time_series, fill_value=0)

# rcParams only affects figures created afterwards, so set the size BEFORE plt.subplots().
plt.rcParams['figure.figsize'] = [15, 5]

fig, ax = plt.subplots()

ax.stackplot(time_series,
             *(size_data[column] for column in host_columns),
             labels=host_columns)

ax.legend(loc='upper right')
# Report the actual bucket width instead of a hard-coded "5s".
ax.set_title(f"{csv_file} ({bucket_size}s)")
ax.set_xlabel('Second')
ax.set_ylabel('Total packet throughput (bytes)')
#ax.set_yscale('log')

plt.show()

In [None]:
# count number of packets per time bucket by host
host_columns = ['Facebook', 'Google', 'Amazon', 'Akamai', 'Apple', 'Other']

# Keeping only the positive lengths leaves NaN (after index alignment) on rows
# that belong to another host; count() ignores NaN, so each column ends up
# holding the number of packets for that host.
count_data = pd.DataFrame({'Time': data['Time'],
                           'Facebook': fb[fb > 0], 'Google': goog[goog > 0], 'Amazon': amz[amz > 0],
                           'Akamai': ak[ak > 0], 'Apple': aapl[aapl > 0], 'Other': oth[oth > 0]})
count_data = count_data.groupby('Time', dropna=False, as_index=False).count()
# add zeros for buckets without any packets
count_data = count_data.set_index('Time').reindex(time_series, fill_value=0)

# rcParams only affects figures created afterwards, so set the size BEFORE plt.subplots().
plt.rcParams['figure.figsize'] = [15, 5]

fig, ax = plt.subplots()
ax.stackplot(time_series,
             *(count_data[column] for column in host_columns),
             labels=host_columns)

ax.legend(loc='upper right')
# Report the actual bucket width instead of a hard-coded "5s".
ax.set_title(f"{csv_file} ({bucket_size}s)")
ax.set_xlabel('Second')
ax.set_ylabel('Total packet count')
# ax.set_yscale('log')

plt.show()
