# RF for LIVE_NFLX QoE Prediction

## Load the useful librairies

In [1]:
# Load the libraries
from os import listdir
from os.path import isfile, join
import sys
from pathlib import Path
import numpy as np
import random as rd
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import pickle
import scipy.io



## Load and transform the "mat.files"  data

In [2]:
root_folder =  "Mat_Files"
mat_files = [f for f in listdir(root_folder) if "mat" in f and isfile(join(root_folder, f))]
mat_files

['Chimera1102347_VMAFViterbiQualityBasedAdaptor_Trace_2.mat',
 'AirShow_VMAFViterbiQualityBasedAdaptor_Trace_4.mat',
 'MeridianConversation_OracleVMAFViterbiQualityBasedAdaptor_Trace_2.mat',
 'Sparks_HuangBufferBasedAdaptor_Trace_5.mat',
 'Soccer_HuangBufferBasedAdaptor_Trace_4.mat',
 'CosmosLaundromat_HuangBufferBasedAdaptor_Trace_6.mat',
 'Sparks_OracleVMAFViterbiQualityBasedAdaptor_Trace_0.mat',
 'GTA_OracleVMAFViterbiQualityBasedAdaptor_Trace_2.mat',
 'ElFuenteDance_SimpleThroughputBasedAdaptor_Trace_3.mat',
 'AirShow_HuangBufferBasedAdaptor_Trace_5.mat',
 'Chimera1102347_OracleVMAFViterbiQualityBasedAdaptor_Trace_5.mat',
 'Soccer_HuangBufferBasedAdaptor_Trace_3.mat',
 'TearsOfSteelStatic_OracleVMAFViterbiQualityBasedAdaptor_Trace_0.mat',
 'ElFuenteDance_SimpleThroughputBasedAdaptor_Trace_0.mat',
 'Chimera1102353_VMAFViterbiQualityBasedAdaptor_Trace_1.mat',
 'TearsOfSteelStatic_VMAFViterbiQualityBasedAdaptor_Trace_3.mat',
 'GTA_HuangBufferBasedAdaptor_Trace_5.mat',
 'ElFuenteMask_V

In [3]:
name_vars = list(scipy.io.loadmat(root_folder + "/" + mat_files[0]).keys())[4:39]
data = {}
for var in name_vars:
    print(var)
    data[var] = pd.DataFrame()
    for f in mat_files:
        mat = scipy.io.loadmat(root_folder + "/" + f)
        tmp_var = pd.DataFrame(mat[var])
        data[var] = pd.concat([data[var], tmp_var])

STRRED
N_playback_frames
VMAF
SSIM
buffer_evolution_sec
height
playback_duration_sec
playout_bitrate
continuous_zscored_mos
scene_cuts_detected
PSNR
MSSIM
per_segment_encoding_QP
rebuffer_duration_sec
video_duration_sec
width
per_segment_encoding_height
selected_streams
distorted_mp4_video
adaptation_algorithm
rebuffer_number
content_spatial_information
content_temporal_information
frame_rate
per_segment_encoding_width
cropping_parameters
throughput_trace_name
content_name_acronym
scene_cuts
N_rebuffer_frames
is_rebuffered_bool
throughput_trace_kbps
reference_yuv_video
N_total_frames
retrospective_zscored_mos


In [4]:
scal_vars = ["playback_duration_sec", "rebuffer_duration_sec", "video_duration_sec", "N_playback_frames", "height", "width",
             "rebuffer_number", "content_spatial_information", "content_temporal_information", "frame_rate",
             "N_rebuffer_frames", "N_total_frames", "throughput_trace_name"]
X_scal = pd.DataFrame()
for var in scal_vars:
    tmp_var = data[var]
    X_scal = pd.concat([X_scal, tmp_var], axis = 1)
    
X_scal.columns = scal_vars
X_scal.to_csv("data/scal_vars.csv")
X_scal

Unnamed: 0,playback_duration_sec,rebuffer_duration_sec,video_duration_sec,N_playback_frames,height,width,rebuffer_number,content_spatial_information,content_temporal_information,frame_rate,N_rebuffer_frames,N_total_frames,throughput_trace_name
0,27.533333,0.033333,27.566667,826,1080,1920,1,46.559426,6.704285,30.000000,1,827,Train_vestby_oslo
0,27.133333,4.300000,31.433333,814,1080,1920,3,24.465410,9.669711,30.000000,129,943,Bus_ljansbakken_oslo
0,26.633333,0.000000,26.633333,799,1080,1920,0,36.516497,3.111322,30.000000,0,799,Train_vestby_oslo
0,27.933333,0.000000,27.933333,838,1080,1920,0,51.239882,12.835748,30.000000,0,838,Ferry_nesoddtangen_oslo
0,25.000000,4.320000,29.320000,625,1080,1920,3,92.946820,30.131946,25.000000,108,733,Bus_ljansbakken_oslo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,26.625000,0.000000,26.625000,639,1080,1920,0,43.562215,11.106232,24.000000,0,639,Tram_jernbanetorget_ljabru
0,27.133333,0.000000,27.133333,814,1080,1920,0,24.465410,9.669711,30.000000,0,814,Car_snaroya_smestad
0,27.000000,0.000000,27.000000,810,1080,1920,0,45.257302,6.470532,30.000000,0,810,Tram_ljabru_jernbanetorget
0,27.000000,0.000000,27.000000,648,1080,1920,0,34.617578,21.403401,24.000000,0,648,Tram_jernbanetorget_ljabru


In [36]:
cont_vars = ["buffer_evolution_sec", "MSSIM", "per_segment_encoding_height", "per_segment_encoding_QP", "playout_bitrate",
             "PSNR", "scene_cuts", "selected_streams", "SSIM", "STRRED", "throughput_trace_kbps", "VMAF",
             "per_segment_encoding_width", "scene_cuts", "scene_cuts_detected", "continuous_zscored_mos", "is_rebuffered_bool"]
for var in cont_vars:
    tmp = data[var]
    tmp.to_csv("data/" + var + ".csv")

In [6]:
#root_folder = "/QoE_Data/LIVE_NFLX/data/"
root_folder = "data/"
csv_files = [f for f in listdir(root_folder) if "csv" in f and isfile(join(root_folder, f))]
csv_files

['per_segment_encoding_height.csv',
 'scene_cuts.csv',
 'is_rebuffered_bool.csv',
 'VMAF.csv',
 'continuous_zscored_mos.csv',
 'selected_streams.csv',
 'PSNR.csv',
 'SSIM.csv',
 'scene_cuts_detected.csv',
 'scal_vars.csv',
 'per_segment_encoding_QP.csv',
 'playout_bitrate.csv',
 'buffer_evolution_sec.csv',
 'throughput_trace_kbps.csv',
 'MSSIM.csv',
 'per_segment_encoding_width.csv',
 'STRRED.csv']

In [7]:
# Build the dataframe of features statistics
df_new = pd.DataFrame()
for f in csv_files:
    if 'continuous' in f:# or 'scal_vars' in f:
        continue
    if 'scal_vars' in f:
        print(f)
        tmp_data = pd.read_csv(root_folder + f).iloc[: , 1:]
        df_new["test_var"] = tmp_data["throughput_trace_name"]
        ["Train_vestby_oslo","Tram_jernbanetorget_ljabru"]
    else:
        print(f)
        tmp_data = pd.read_csv(root_folder + f).iloc[: , 1:]
        df_new[f+"_min"] = tmp_data.min(axis=1)
        df_new[f+"_max"] = tmp_data.max(axis=1)
        df_new[f+"_sum"] = tmp_data.sum(axis=1)
        df_new[f+"_mean"] = tmp_data.mean(axis=1)
        df_new[f+"_median"] = tmp_data.median(axis=1)
        df_new[f+"_std"] = tmp_data.std(axis=1)
        df_new[f+"_1stQ"] = tmp_data.quantile(0.25, axis=1)
        df_new[f+"_3rdQ"] = tmp_data.quantile(0.75, axis=1)
        df_new[f+"_skew"] = tmp_data.skew(axis=1)
        df_new[f+"_kur"] = tmp_data.kurtosis(axis=1)
        df_new[f+"_count"] = tmp_data.shape[1] - tmp_data.isnull().sum(axis=1)
    
df_new.head(10)

per_segment_encoding_height.csv
scene_cuts.csv
is_rebuffered_bool.csv
VMAF.csv
selected_streams.csv
PSNR.csv
SSIM.csv
scene_cuts_detected.csv
scal_vars.csv
per_segment_encoding_QP.csv
playout_bitrate.csv
buffer_evolution_sec.csv
throughput_trace_kbps.csv
MSSIM.csv
per_segment_encoding_width.csv
STRRED.csv


Unnamed: 0,per_segment_encoding_height.csv_min,per_segment_encoding_height.csv_max,per_segment_encoding_height.csv_sum,per_segment_encoding_height.csv_mean,per_segment_encoding_height.csv_median,per_segment_encoding_height.csv_std,per_segment_encoding_height.csv_1stQ,per_segment_encoding_height.csv_3rdQ,per_segment_encoding_height.csv_skew,per_segment_encoding_height.csv_kur,...,STRRED.csv_max,STRRED.csv_sum,STRRED.csv_mean,STRRED.csv_median,STRRED.csv_std,STRRED.csv_1stQ,STRRED.csv_3rdQ,STRRED.csv_skew,STRRED.csv_kur,STRRED.csv_count
0,360.0,1080.0,10440.0,652.5,720.0,226.495033,495.0,720.0,0.36569,-0.052893,...,345.080129,46296.463832,56.048988,20.606288,67.807192,13.473389,73.651502,1.553019,1.413226,826
1,360.0,720.0,8100.0,540.0,540.0,180.0,360.0,720.0,0.0,-2.230769,...,178.020214,29553.196034,36.306138,27.051776,31.392378,10.311113,56.00585,1.266429,1.63642,814
2,720.0,1080.0,15480.0,967.5,1080.0,172.336879,720.0,1080.0,-0.895257,-1.390609,...,53.046983,6740.454856,8.436114,5.372933,7.735403,3.494339,9.952786,2.195295,5.512179,799
3,360.0,1080.0,13140.0,821.25,900.0,286.260371,540.0,1080.0,-0.414217,-1.550333,...,414.646534,84706.630824,101.081898,87.856483,88.527189,18.707355,144.089248,0.978122,0.345673,838
4,216.0,720.0,5346.0,381.857143,360.0,119.017683,360.0,360.0,1.945719,5.072595,...,4988.706087,413306.944583,661.291111,316.528598,1052.775797,238.017406,494.201721,3.041851,7.953462,625
5,270.0,720.0,5580.0,372.0,360.0,101.291658,360.0,360.0,3.17227,11.767222,...,1580.726527,239969.119295,370.322715,250.774885,313.244878,140.508243,514.103691,1.196269,0.552229,648
6,360.0,1080.0,13500.0,843.75,720.0,234.403498,720.0,1080.0,-0.367916,-0.846684,...,486.525293,74952.459676,89.442076,52.678424,88.080972,18.951725,143.925879,1.375703,1.720191,838
7,360.0,720.0,7200.0,480.0,540.0,111.098412,360.0,540.0,0.311574,-0.403846,...,2878.433902,346804.384511,443.483868,162.795678,554.550252,107.39193,464.951388,1.755489,2.270178,782
8,360.0,720.0,6480.0,432.0,360.0,132.621912,360.0,450.0,1.631987,1.320051,...,980.669771,104405.634254,164.937811,95.961426,193.934569,53.768635,184.680747,2.425078,5.775314,633
9,360.0,1080.0,13140.0,876.0,1080.0,310.777826,630.0,1080.0,-1.013194,-0.902676,...,178.020214,15246.744923,18.730645,3.900427,29.041117,2.056327,24.158844,2.049725,4.524785,814


In [8]:
tmp_data = pd.read_csv(root_folder + 'scal_vars.csv').iloc[: , 1:13]
tmp_data

Unnamed: 0,playback_duration_sec,rebuffer_duration_sec,video_duration_sec,N_playback_frames,height,width,rebuffer_number,content_spatial_information,content_temporal_information,frame_rate,N_rebuffer_frames,N_total_frames
0,27.533333,0.033333,27.566667,826,1080,1920,1,46.559426,6.704285,30.000000,1,827
1,27.133333,4.300000,31.433333,814,1080,1920,3,24.465410,9.669711,30.000000,129,943
2,26.633333,0.000000,26.633333,799,1080,1920,0,36.516497,3.111322,30.000000,0,799
3,27.933333,0.000000,27.933333,838,1080,1920,0,51.239882,12.835748,30.000000,0,838
4,25.000000,4.320000,29.320000,625,1080,1920,3,92.946820,30.131946,25.000000,108,733
...,...,...,...,...,...,...,...,...,...,...,...,...
415,26.625000,0.000000,26.625000,639,1080,1920,0,43.562215,11.106232,24.000000,0,639
416,27.133333,0.000000,27.133333,814,1080,1920,0,24.465410,9.669711,30.000000,0,814
417,27.000000,0.000000,27.000000,810,1080,1920,0,45.257302,6.470532,30.000000,0,810
418,27.000000,0.000000,27.000000,648,1080,1920,0,34.617578,21.403401,24.000000,0,648


In [9]:
print(df_new.shape)
df_new = pd.concat([df_new, tmp_data], axis=1)
print(df_new.shape)

(420, 166)
(420, 178)


In [10]:
# Build the response variable
MOS_df = pd.read_csv(root_folder + "continuous_zscored_mos.csv")
df_new["mean"] = MOS_df.mean(axis = 1)
df_new["std"] = MOS_df.std(axis = 1)
df_new.shape

(420, 180)

In [11]:
# count the effective number of MOS score per video (on test set)
fill_mos = pd.DataFrame([MOS_df.shape[1] - MOS_df.isnull().sum(axis=1)]).T
fill_mos["test_var"] = pd.read_csv(root_folder + "scal_vars.csv")["throughput_trace_name"]
fill_mos = np.array(fill_mos[fill_mos["test_var"].isin(["Train_vestby_oslo","Tram_jernbanetorget_ljabru"])][0])
fill_mos

array([ 828,  800,  783,  812,  649,  640,  798,  828,  649,  827,  634,
        839,  798,  626,  643,  816,  649,  816,  649,  844,  816,  828,
        675,  649,  783,  798,  816,  640,  652,  634,  825,  811,  839,
        662,  839,  626,  634,  816,  665,  665,  665,  640,  645,  783,
        640,  626,  800,  839,  816,  806,  665, 1005,  783,  839,  649,
        649,  659,  798,  811,  811,  828,  783,  811,  652,  812,  800,
        626,  808,  665,  783,  811,  812,  640,  626,  649,  626,  815,
        838,  795,  665,  800,  649,  815,  828,  839,  795,  783,  815,
        822,  643,  869,  795,  839,  649,  643,  815,  634,  812,  830,
        827,  815,  812,  811,  800,  829,  626,  827,  812,  800,  815,
        643,  640,  783,  795,  652,  811,  626,  640,  649,  665])

# Random Forest

In [12]:
# Train/Test split by throughput_trace_name
df_train = df_new[~df_new["test_var"].isin(["Train_vestby_oslo","Tram_jernbanetorget_ljabru"])]
X_train = df_train.drop(["mean", "std", "test_var"], axis = 1) 
y_train = df_train["mean"]
y_train2 = df_train["std"]

In [13]:
X_train.shape

(300, 177)

In [14]:
df_test = df_new[df_new["test_var"].isin(["Train_vestby_oslo","Tram_jernbanetorget_ljabru"])]
X_test = df_test.drop(["mean", "std", "test_var"], axis = 1) 
y_test = df_test["mean"]
y_test2 = df_test["std"]

In [15]:
X_test.shape

(120, 177)

In [16]:
########################################
# Feature Importance
"""
Function to Fit model based on optimal values of depth and number of estimators and use it
to compute feature importance for all the features.
"""
def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train):
    from sklearn.ensemble import RandomForestRegressor
    rf_opt = RandomForestRegressor(max_depth = depth, n_estimators = n_tree, max_leaf_nodes=max_leaf,
                                   random_state=42, bootstrap=False)
    rf_opt.fit(X_train, y_train)
    feature_importance = pd.DataFrame(rf_opt.feature_importances_)
    feature_importance.index = X_train.columns
    feature_importance = feature_importance.sort_values(by=list(feature_importance.columns),axis=0,ascending=False)

    return feature_importance

# Fewest features
"""
Function to Fit model based on optimal values of depth and number of estimators and feature importance
to find the fewest possible features to exceed the previously attained score with all selected features
"""
def get_fewest_features(depth, n_tree, max_leaf, importance):
    sorted_feature_names = importance.index
    # print('sorted_feature_names: ', sorted_feature_names)
    features = []
    for f in range(1,len(sorted_feature_names)+1):
        features.append(sorted_feature_names[0:f])
        # print('features:', features)
    return features

def get_scores(depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(max_depth=depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, n_jobs=4,
                                   random_state=42, bootstrap=False)

    model.fit(X_train[feats], y_train)
    y_pred = model.predict(X_test[feats])
    
    accurac = model.score(X_test[feats], y_test)

    return accurac

In [17]:
def analyze_models(depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, res_file):
    
    with open(res_file, "w") as res_file:
        print('depth;tree;n_feat;Accuracy;feats', file=res_file)
        for depth in depths:
            print(depth)
            for n_tree in n_trees:
                # get feature orders to use
                importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train)
                m_feats = get_fewest_features(depth, n_tree, max_leaf, importance)
                accuracs = []
                for feats in m_feats:
                    # Get the scores with the given (depth, n_tree, feat)
                    accuracs = get_scores(depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test)
                    print(str(depth)+';'+str(n_tree)+';'+str(len(feats))+';'+str(accuracs)+';'+str(list(feats)), file=res_file)
    print("Analysis Complete. Check output file.")
    return []

In [18]:
results_file = "Models_featImp.csv"
analyze_models(range(3,10,1), range(1,4,1), X_train, y_train, X_test, y_test, 500, results_file)

3
4
5
6
7
8
9
Analysis Complete. Check output file.


[]

In [19]:
results = pd.read_csv(results_file,sep=';')
results = results.sort_values(by=['Accuracy'], ascending=False)
print(results.head(10))
print("******")

      depth  tree  n_feat  Accuracy  \
3380      9     2      18  0.364035   
2849      8     2      18  0.331218   
3200      9     1      15  0.329749   
3202      9     1      17  0.323816   
3029      8     3      21  0.318080   
3205      9     1      20  0.317421   
2671      8     1      17  0.317369   
1257      5     2      19  0.316324   
3561      9     3      22  0.315136   
1608      6     1      16  0.313715   

                                                  feats  
3380  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
2849  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
3200  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
3202  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
3029  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
3205  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
2671  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
1257  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1stQ'...  
3561  ['playout_bitrate.csv_3rdQ', 'STRRED.csv_1

In [25]:
from sklearn.ensemble import RandomForestRegressor
importance = get_feature_importance(depth = 9, n_tree = 2, max_leaf = 500, X_train = X_train, y_train = y_train)
feats = get_fewest_features(9, 2, 500, importance)[16]
model = RandomForestRegressor(max_depth=9, n_estimators = 2, max_leaf_nodes=500, n_jobs=4,
                                   random_state=42, bootstrap=False)
model.fit(X_train[feats], y_train)
y_pred = model.predict(X_test[feats])

In [26]:
# RMSE
np.sqrt(np.mean((y_test - y_pred) ** 2))

0.41663267447337926

In [27]:
results_file2 = "Models2_featImp.csv"
analyze_models(range(3,10,1), range(1,5,1), X_train, y_train2, X_test, y_test2, 500, results_file2)

3
4
5
6
7
8
9
Analysis Complete. Check output file.


[]

In [28]:
results2 = pd.read_csv(results_file2,sep=';')
results2 = results2.sort_values(by=['Accuracy'], ascending=False)
print(results2.head(10))
print("******")

      depth  tree  n_feat  Accuracy  \
4256      9     1       9  0.200174   
3547      8     1       8  0.040385   
4257      9     1      10  0.027502   
2839      7     1       8  0.008026   
3364      7     4       2  0.003343   
2833      7     1       2  0.003343   
3187      7     3       2  0.003343   
2985      7     1     154 -0.004382   
2131      6     1       8 -0.006989   
2885      7     1      54 -0.007730   

                                                  feats  
4256  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
3547  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
4257  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
2839  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
3364  ['PSNR.csv_std', 'throughput_trace_kbps.csv_me...  
2833  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
3187  ['PSNR.csv_std', 'throughput_trace_kbps.csv_me...  
2985  ['throughput_trace_kbps.csv_median', 'PSNR.csv...  
2131  ['throughput_trace_kbps.csv_median', 'PSNR

In [29]:
from sklearn.ensemble import RandomForestRegressor
importance = get_feature_importance(depth = 9, n_tree = 1, max_leaf = 500, X_train = X_train, y_train = y_train)
feats = get_fewest_features(9, 1, 500, importance)[8]
model = RandomForestRegressor(max_depth=9, n_estimators = 1, max_leaf_nodes=500, n_jobs=4,
                                   random_state=42, bootstrap=False)
model.fit(X_train[feats], y_train2)
y_pred2 = model.predict(X_test[feats])

In [30]:
# RMSE
np.sqrt(np.mean((y_test2 - y_pred2) ** 2))

0.15293341728365253

In [38]:
# Build dataframe to export
tmp = pd.DataFrame({'pred1' : y_pred, 'pred2' : y_pred2})
tmp.to_csv('/mnt/raid1/QoE_Data/LIVE_NFLX/data/preds.csv')