In [1]:
from glob import glob
from os import path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# [Dataset](https://drive.google.com/drive/folders/1nfrYxDm7TLzls9pedZbLX5rP4McVDWDe)

In [2]:
# Directory that is searched for the input *.csv files.  The path is relative,
# so it is resolved against the notebook's current working directory.
DATASET_PATH = "SSD2022AS2"


In [3]:
# Collect every CSV that sits directly inside DATASET_PATH.  The pattern has no
# "**" component, so glob's `recursive` flag would have no effect and is omitted.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the load order (and therefore the concatenated row order) reproducible.
csv_files = sorted(glob(path.join(DATASET_PATH, "*.csv")))

# Fail early with a clear message instead of letting the later pd.concat call
# raise "No objects to concatenate" on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATASET_PATH!r}")


In [4]:
# Read each discovered CSV into its own DataFrame, keeping the order of csv_files.
dfs = [pd.read_csv(filepath_or_buffer=csv_path) for csv_path in csv_files]


In [5]:
# Stack the per-file frames vertically and renumber the rows 0..n-1 so that the
# per-file row labels do not repeat in the combined frame.
df = pd.concat(objs=dfs, ignore_index=True)


In [6]:
# Parse the raw timestamp column into datetime values so that time arithmetic
# (the per-session range computed below) is possible.
df["timestamp"] = pd.to_datetime(df["timestamp"])


# Stream Quality

# Next Session Time

In [7]:
# Statistics to compute for each metric within one (client_user_id, session_id)
# group.  The peak-to-peak of `timestamp` is the span between the first and the
# last sample of the group.
# NOTE(review): the second level of the resulting column index is derived from
# the callables given here, so the flattened names may depend on the installed
# NumPy/pandas versions — confirm before swapping these for string aliases.
session_aggregations = {
    "dropped_frames": [np.mean, np.std, np.max],
    "FPS": [np.mean, np.std],
    "bitrate": [np.mean, np.std],
    "RTT": [np.mean, np.std],
    "timestamp": [np.ptp],
}

# Collapse the raw samples to one row per group; the result carries a
# two-level column index of (metric, statistic).
session_groups = df.groupby(by=["client_user_id", "session_id"])
df = session_groups.agg(session_aggregations)


In [8]:
# Flatten the two-level (metric, statistic) column index into single lower-case
# names of the form "<metric>_<statistic>".
df.columns = df.columns.map("_".join).str.lower()


In [9]:
# Replace the (client_user_id, session_id) group index with a plain 0..n-1 index.
# drop=True discards the two keys instead of turning them into columns, so only
# the aggregated statistics remain in the frame.
df = df.reset_index(drop=True)


In [10]:
# Convert the timestamp range of each row from a timedelta to a float number of
# seconds so it can be used as a numeric value.
df["timestamp_ptp"] = df["timestamp_ptp"].dt.total_seconds()


In [11]:
# Remove rows whose values match an earlier row in every column (the first
# occurrence is kept).  The surviving rows keep their existing index labels, so
# the index may contain gaps afterwards.
df = df.drop_duplicates()


In [12]:
# Persist the aggregated table to "output.csv" in the current working directory.
# The index is written as well (pandas' default), as an unnamed first column.
df.to_csv(path_or_buf="output.csv")


# Data Engineering

In [13]:
# Show, per column, whether it contains at least one missing value.
df.isna().any()


dropped_frames_mean    False
dropped_frames_std      True
dropped_frames_amax    False
fps_mean               False
fps_std                 True
bitrate_mean           False
bitrate_std             True
rtt_mean               False
rtt_std                 True
timestamp_ptp          False
dtype: bool

In [14]:
# Fill the missing values in place; SimpleImputer's default strategy replaces
# them with the mean of the respective column.
# NOTE(review): the imputer is fitted on every row, including rows that the
# train/test split further down assigns to the test set, and on the target
# column as well — consider fitting on the training features only.
mean_imputer = SimpleImputer()
df[:] = mean_imputer.fit_transform(X=df)


In [15]:
# Target: the timestamp range of each row in seconds.
y = df["timestamp_ptp"]
# Features: every remaining aggregated statistic.
X = df.drop(columns=["timestamp_ptp"])


In [16]:
# Rescale every feature column to the [0, 1] range in place.  MinMaxScaler
# ignores a target argument, so only the features are passed.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the test rows influence the scaling — consider fitting on
# the training rows only.
feature_scaler = MinMaxScaler()
X[:] = feature_scaler.fit_transform(X)


In [17]:
# Hold out 20 % of the rows for evaluation.  A fixed random_state makes the
# shuffle — and therefore the fitted model and every reported metric —
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Model

In [18]:
# Fit a Lasso regression whose regularisation strength is chosen by
# cross-validation on the training rows, then predict the held-out rows.
model = LassoCV()
model.fit(X=X_train, y=y_train)
y_pred = model.predict(X=X_test)


In [19]:
# Evaluate the predictions on the held-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas the square root works with every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error", mse)
print("Root Mean Squared Error:", rmse)
print("R^2 Score", r2)


Mean Absolute Error: 3776.865357538145
Mean Squared Error 39503092.98480691
Root Mean Squared Error: 6285.148604830829
R^2 Score 0.04206912436011945


# Total Number of Bad Sessions