In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

## Data Preparation 

In [3]:
" Loading the dataset "
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets\\')
url = datasets_path + 'data_transcoding_mesurment.tsv'
df = pd.read_csv(url, sep='\t')

" Handling some data "
df = df.drop(columns=['id'])
df = df.rename(columns={'i': 'i_frames', 'p': 'p_frames' , 'b':'b_frames'})
df.head()

Unnamed: 0,duration,codec,width,height,bitrate,framerate,i_frames,p_frames,b_frames,frames,...,p_size,b_size,size,o_codec,o_bitrate,o_framerate,o_width,o_height,umem,utime
0,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,1564,...,825054,0,889537,mpeg4,56000,12.0,176,144,22508,0.612
1,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,1564,...,825054,0,889537,mpeg4,56000,12.0,320,240,25164,0.98
2,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,1564,...,825054,0,889537,mpeg4,56000,12.0,480,360,29228,1.216
3,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,1564,...,825054,0,889537,mpeg4,56000,12.0,640,480,34316,1.692
4,130.35667,mpeg4,176,144,54590,12.0,27,1537,0,1564,...,825054,0,889537,mpeg4,56000,12.0,1280,720,58528,3.456


In [4]:
" display the features types "
df.dtypes

duration       float64
codec           object
width            int64
height           int64
bitrate          int64
framerate      float64
i_frames         int64
p_frames         int64
b_frames         int64
frames           int64
i_size           int64
p_size           int64
b_size           int64
size             int64
o_codec         object
o_bitrate        int64
o_framerate    float64
o_width          int64
o_height         int64
umem             int64
utime          float64
dtype: object

In [5]:
" Checking missing values "
df.replace('?', np.nan, inplace=True)
missing_values_table(df)

Your slelected dataframe has 21 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [6]:
data_df = df.drop(columns=['utime'])
target_df = df['utime']

In [7]:
categorical_feature_mask = (data_df.dtypes == object)
categorical_feature_mask

duration       False
codec           True
width          False
height         False
bitrate        False
framerate      False
i_frames       False
p_frames       False
b_frames       False
frames         False
i_size         False
p_size         False
b_size         False
size           False
o_codec         True
o_bitrate      False
o_framerate    False
o_width        False
o_height       False
umem           False
dtype: bool

In [8]:
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
categorical_cols_names

['codec', 'o_codec']

In [9]:
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
numerical_cols_names

['duration',
 'width',
 'height',
 'bitrate',
 'framerate',
 'i_frames',
 'p_frames',
 'b_frames',
 'frames',
 'i_size',
 'p_size',
 'b_size',
 'size',
 'o_bitrate',
 'o_framerate',
 'o_width',
 'o_height',
 'umem']

In [10]:
data_df = pd.concat([data_df[numerical_cols_names].astype(float), data_df[categorical_cols_names]],axis = 1)
data_target_df = pd.concat([data_df, target_df], axis=1) 

In [11]:
" generate the Test SET "
nb_test_instances = 1000 
test_df = data_target_df.sample(n=nb_test_instances)
data_test_df = test_df.drop(columns=['utime'])
target_test_df = test_df['utime']

" generate the Training SET "
train_df = pd.concat([data_target_df,test_df]).drop_duplicates(keep=False)
data_train_df = train_df.drop(columns=['utime'])
target_train_df = train_df['utime']

In [12]:
" Decode Categorical Features "

format_mapper = {
    0 : 'vp8', 1 : 'h264', 2 : 'mpeg4', 3 : 'flv'
}
format_mapper_inv = dict(map(reversed, format_mapper.items()))

In [13]:
data_test_df_copy = data_test_df.copy()

In [14]:
data_test_df_copy['codec']   = data_test_df_copy['codec'].replace(format_mapper_inv)
data_test_df_copy['o_codec'] = data_test_df_copy['o_codec'].replace(format_mapper_inv)
data_test_df_copy.head()

Unnamed: 0,duration,width,height,bitrate,framerate,i_frames,p_frames,b_frames,frames,i_size,p_size,b_size,size,o_bitrate,o_framerate,o_width,o_height,umem,codec,o_codec
34421,74.535,176.0,144.0,55305.0,12.0,15.0,879.0,0.0,894.0,23996.0,491276.0,0.0,515272.0,5000000.0,15.0,1920.0,1080.0,632628.0,2,1
45527,33.09,320.0,240.0,279173.0,13.0,22.0,434.0,0.0,456.0,101794.0,1052937.0,0.0,1154731.0,3000000.0,24.0,1920.0,1080.0,219480.0,1,2
40505,270.655,640.0,480.0,652967.0,30.062963,89.0,8028.0,0.0,8117.0,1912240.0,20178890.0,0.0,22091130.0,5000000.0,12.0,1920.0,1080.0,219480.0,0,2
41476,106.765,1280.0,720.0,849586.0,25.0,66.0,2603.0,0.0,2669.0,4427240.0,6911026.0,0.0,11338266.0,539000.0,24.0,1280.0,720.0,219480.0,1,0
63115,95.462,640.0,480.0,809256.0,29.0,51.0,2811.0,0.0,2862.0,886496.0,8770159.0,0.0,9656655.0,539000.0,29.97,320.0,240.0,221160.0,1,2


In [15]:
data_test_copy = data_test_df_copy.values
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation

In [16]:
# generate neighbors : 
nb_neighbors = 50
list_neigh = generate_all_neighbors(data_test_copy,numerical_cols,categorical_cols,nb_neighbors)

  return np.random.multivariate_normal(inst_num,covn,n)


In [17]:
" store all the neighbors together "
n = np.size(data_test_copy,0)
all_neighbors = list_neigh[0]
for i in range(1,n) :
    all_neighbors = np.concatenate((all_neighbors, list_neigh[i]), axis=0)

###  One hot encoding 

In [18]:
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

In [19]:
" Decode non ordinal features to perform one hot encoding "

df_neigh['codec']   = df_neigh['codec'].replace(format_mapper)
df_neigh['o_codec'] = df_neigh['o_codec'].replace(format_mapper)

In [20]:
" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)
df_neigh.head()

Unnamed: 0,duration,width,height,bitrate,framerate,i_frames,p_frames,b_frames,frames,i_size,...,o_framerate,o_width,o_height,umem,codec_h264,codec_mpeg4,codec_vp8,o_codec_h264,o_codec_mpeg4,o_codec_vp8
0,74.013768,157.526318,138.737369,103854.288899,11.45597,6.312745,548.6175,7.230709,562.160943,-563311.121641,...,14.900848,1868.694211,1053.635418,626452.939673,0,1,0,1,0,0
1,81.725026,245.798607,173.950359,264189.392985,11.965166,17.713976,1058.499699,-3.963474,1072.250195,476302.326254,...,15.15781,1975.005148,1108.144129,643963.25379,0,1,0,1,0,0
2,78.521659,115.625503,114.748385,27105.396436,11.541558,16.229804,740.21781,6.398105,762.845756,-537069.30957,...,15.293035,1980.433461,1109.9944,630421.935308,0,1,0,1,0,0
3,53.883889,190.403875,148.78767,78944.254519,11.528824,4.63791,470.12875,-2.529534,472.237111,53074.363284,...,15.79798,2005.359431,1122.653747,625800.741577,0,1,0,1,0,0
4,17.117683,232.351584,173.225724,216567.175871,13.374434,8.258057,42.686978,6.041069,56.986124,-95787.599488,...,14.733864,1951.238428,1096.404289,643800.850939,0,1,0,1,0,0


In [21]:
" Store the neighbors in a list"

data_neigh = df_neigh.values
n = np.size(data_test_copy,0)
list_neigh = []
j = 0
for i in range(0,n):
    list_neigh.append(data_neigh[j:(j+nb_neighbors),:])
    j += nb_neighbors

####  One hot encoding for the training and the test sets

In [22]:
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train_df.head()

Unnamed: 0,duration,width,height,bitrate,framerate,i_frames,p_frames,b_frames,frames,i_size,...,o_framerate,o_width,o_height,umem,codec_h264,codec_mpeg4,codec_vp8,o_codec_h264,o_codec_mpeg4,o_codec_vp8
0,130.35667,176.0,144.0,54590.0,12.0,27.0,1537.0,0.0,1564.0,64483.0,...,12.0,176.0,144.0,22508.0,0,1,0,0,1,0
1,130.35667,176.0,144.0,54590.0,12.0,27.0,1537.0,0.0,1564.0,64483.0,...,12.0,320.0,240.0,25164.0,0,1,0,0,1,0
2,130.35667,176.0,144.0,54590.0,12.0,27.0,1537.0,0.0,1564.0,64483.0,...,12.0,480.0,360.0,29228.0,0,1,0,0,1,0
3,130.35667,176.0,144.0,54590.0,12.0,27.0,1537.0,0.0,1564.0,64483.0,...,12.0,640.0,480.0,34316.0,0,1,0,0,1,0
4,130.35667,176.0,144.0,54590.0,12.0,27.0,1537.0,0.0,1564.0,64483.0,...,12.0,1280.0,720.0,58528.0,0,1,0,0,1,0


In [23]:
data_train = data_train_df.values
target_train = target_train_df.values

In [24]:
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test_df.head()

Unnamed: 0,duration,width,height,bitrate,framerate,i_frames,p_frames,b_frames,frames,i_size,...,o_framerate,o_width,o_height,umem,codec_h264,codec_mpeg4,codec_vp8,o_codec_h264,o_codec_mpeg4,o_codec_vp8
34421,74.535,176.0,144.0,55305.0,12.0,15.0,879.0,0.0,894.0,23996.0,...,15.0,1920.0,1080.0,632628.0,0,1,0,1,0,0
45527,33.09,320.0,240.0,279173.0,13.0,22.0,434.0,0.0,456.0,101794.0,...,24.0,1920.0,1080.0,219480.0,1,0,0,0,1,0
40505,270.655,640.0,480.0,652967.0,30.062963,89.0,8028.0,0.0,8117.0,1912240.0,...,12.0,1920.0,1080.0,219480.0,0,0,1,0,1,0
41476,106.765,1280.0,720.0,849586.0,25.0,66.0,2603.0,0.0,2669.0,4427240.0,...,24.0,1280.0,720.0,219480.0,1,0,0,0,0,1
63115,95.462,640.0,480.0,809256.0,29.0,51.0,2811.0,0.0,2862.0,886496.0,...,29.97,320.0,240.0,221160.0,1,0,0,0,1,0


In [25]:
data_test = data_test_df.values
target_test = target_test_df.values

In [26]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [27]:
'SAVE THE DATA'

path = './saved_data/'
save_obj(data_train, path + 'data_train')
save_obj(target_train, path + 'target_train')
save_obj(data_test, path  + 'data_test')
save_obj(target_test, path + 'target_test')
save_obj(list_neigh, path + 'list_neighbors')

## Training the models

In [28]:
" Linear Regression : "
regression = LinearRegression()
model_reg = regression.fit(data_train, target_train)
target_pred_reg = model_reg.predict(data_test)

In [29]:
" Random Forest Regressor "
regr = RandomForestRegressor(max_depth=7, random_state=0)
model_rf = regr.fit(data_train, target_train)
target_pred_rf = model_rf.predict(data_test)

In [30]:
" Ada Boost regressor "
adaBoostRegr = AdaBoostRegressor(random_state=0, n_estimators=10)
model_abr = adaBoostRegr.fit(data_train, target_train)
target_pred_abr = model_abr.predict(data_test)

In [31]:
" Sklearn MLP regressor "

mlp = make_pipeline(StandardScaler(),
                    MLPRegressor(hidden_layer_sizes=(50, 50),
                                 tol=1e-2, 
                                 max_iter=1000, 
                                 random_state=0))
model_nt = mlp.fit(data_train, target_train)
target_pred_nt = model_nt.predict(data_test)

## Scores of the black box models 


In [32]:
print(f"{'The score of the linear regression model is ' :<60}{': {}'.format(round(model_reg.score(data_test, target_test),4))}")
print(f"{'The score of the Random Forest Regressor model is ':<60}{': {}'.format(round(model_rf.score(data_test, target_test),4))}")
print(f"{'The score of the AdaBoost Regressor model is ':<60}{': {}'.format(round(model_abr.score(data_test, target_test),4))}")
print(f"{'The score of the Multi-Layer-Perceptron Regressor model is ':<60}{': {}'.format(round(model_nt.score(data_test, target_test),4))}")

The score of the linear regression model is                 : 0.6843
The score of the Random Forest Regressor model is           : 0.9332
The score of the AdaBoost Regressor model is                : 0.7515
The score of the Multi-Layer-Perceptron Regressor model is  : 0.9873


## Execution of Split Based Selection Form Algorithm : 

In [33]:
split_point = len(numerical_cols)
nb_models = 100
(L_Subgroups,P) = SplitBasedSelectionForm (data_test, target_test, nb_models, model_nt, list_neigh,split_point)

In [34]:
'SAVE THE LIST OF THE SUBGROUPS'
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [35]:
att_names = data_test_df.columns
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
25552.0 < umem <= 299561.6
756.0 < o_height <= 1080.0
56000.0 < o_bitrate <= 1044800.0
15.59 < o_framerate <= 29.97
codec_mpeg4 = 1
-------------------------------------------------------------------
subrgoup 1
299561.6 < umem <= 710576.0
3022400.0 < o_bitrate <= 5000000.0
424.8 < height <= 1080.0
29.0 < framerate <= 30.39
-------------------------------------------------------------------
subrgoup 2
230550.4 < umem <= 299561.6
518.4 < o_height <= 756.0
codec_vp8 = 0
-------------------------------------------------------------------
subrgoup 3
25552.0 < umem <= 230550.4
518.4 < o_height <= 756.0
26968.0 < bitrate <= 1818772.0
12.0 < o_framerate <= 15.59
-------------------------------------------------------------------
subrgoup 4
299561.6 < umem <= 710576.0
3022400.0 < o_bitrate <= 5000000.0
144.0 < height <= 424.8
157.74 < duration <= 1768.94
334.0 < p_frames <= 4452.9
-------------------------------------------------------------------
subrgoup 5
25552.0 < umem <= 299561.

codec_vp8 = 1
-------------------------------------------------------------------
subrgoup 93
299561.6 < umem <= 710576.0
3022400.0 < o_bitrate <= 5000000.0
144.0 < height <= 424.8
157.74 < duration <= 1768.94
4452.9 < p_frames <= 33880.0
codec_mpeg4 = 0
codec_vp8 = 0
-------------------------------------------------------------------
subrgoup 94
299561.6 < umem <= 710576.0
56000.0 < o_bitrate <= 644800.0
144.0 < height <= 424.8
12.0 < o_framerate <= 15.59
549815.9 < i_size <= 20478926.0
756.0 < o_height <= 1080.0
-------------------------------------------------------------------
subrgoup 95
299561.6 < umem <= 710576.0
56000.0 < o_bitrate <= 644800.0
144.0 < height <= 424.8
12.0 < o_framerate <= 15.59
549815.9 < i_size <= 20478926.0
144.0 < o_height <= 756.0
-------------------------------------------------------------------
subrgoup 96
25552.0 < umem <= 299561.6
756.0 < o_height <= 1080.0
1044800.0 < o_bitrate <= 5000000.0
12.0 < o_framerate <= 13.8
25.12 < framerate <= 30.39
-------

In [36]:
'SAVE THE SUBGROUPS PATTERNS'
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')