In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining and the supplementary scripts for neighbors generation"

# Directory three levels above the current working directory.
absFilePath = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Let os.path.join supply the separator instead of hard-coding the Windows
# backslash, so the directory also resolves on Linux/macOS. The empty last
# component keeps the trailing separator the original string carried.
newPath = os.path.join(absFilePath, 'SplitSD4X', '')
sys.path.append(newPath)

# The 'supplementary' sub-directory is added to the import path as well.
newPath_supp = os.path.join(newPath, 'supplementary')
sys.path.append(newPath_supp)

from fill_missing_values import *
from missing_values_table import *
from subgroups_discovery import *

from neighbors_generation import *
from neighbors_generation_2 import *
from neighbors_generation_3 import *
from neighbors_generation_4 import *

## Data Preparation 

In [3]:
" Loading the dataset "
# Build the dataset location with os.path.join rather than a hard-coded
# Windows backslash, so the file is found on every operating system. The
# empty last component keeps the trailing separator on datasets_path.
datasets_path = os.path.join(absFilePath, 'Datasets', '')
url = os.path.join(datasets_path, 'data_transcoding_mesurment.tsv')
df = pd.read_csv(url, sep='\t')

# Drop the 'id' column and give the one-letter columns longer names.
df = df.drop(columns=['id'])
df = df.rename(columns={'i': 'i_frames', 'p': 'p_frames', 'b': 'b_frames'})

# Separate the features from the 'utime' target.
data_df = df.drop(columns=['utime'])
target_df = df['utime']

# Columns with object dtype are treated as categorical, all others as numerical.
categorical_feature_mask = (data_df.dtypes == object)
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()

# Reorder the columns: numerical ones (cast to float) first, categorical ones last.
data_df = pd.concat(
    [data_df[numerical_cols_names].astype(float), data_df[categorical_cols_names]],
    axis=1,
)
data_target_df = pd.concat([data_df, target_df], axis=1)

In [4]:
" generate the Test SET "
# Hold out a random test set. The fixed seed makes the split (and everything
# derived from it) reproducible across kernel restarts.
nb_test_instances = 1000
test_df = data_target_df.sample(n=nb_test_instances, random_state=0)
data_test_df = test_df.drop(columns=['utime'])
target_test_df = test_df['utime']

# Training set = every row whose index label was not sampled for the test set.
# Dropping by index removes exactly the sampled rows; the previous
# concat + drop_duplicates(keep=False) also discarded any rows that were
# duplicated by value in the dataset itself.
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['utime'])
target_train_df = train_df['utime']

# Integer code <-> codec name mappings.
format_mapper = {0: 'vp8', 1: 'h264', 2: 'mpeg4', 3: 'flv'}
format_mapper_inv = {name: code for code, name in format_mapper.items()}

# Work on a copy of the test features in which the codec names are replaced
# by their integer codes; its values are the input of the neighbor generators.
data_test_df_copy = data_test_df.copy()
data_test_df_copy['codec'] = data_test_df_copy['codec'].replace(format_mapper_inv)
data_test_df_copy['o_codec'] = data_test_df_copy['o_codec'].replace(format_mapper_inv)

data_test_copy = data_test_df_copy.values

# Positional indices of the numerical columns (first) and categorical columns (last).
numerical_cols = np.arange(0, len(numerical_cols_names))
categorical_cols = np.arange(len(numerical_cols_names), data_df.shape[1])

## Neighbors Generation (*Version 1*)

In [5]:
# Generate the neighbors (version 1).
# NOTE(review): generate_all_neighbors is assumed to return one block of
# nb_neighbors rows per row of data_test_copy - confirm against its definition.
nb_neighbors = 50
list_neigh = generate_all_neighbors(data_test_copy, numerical_cols, categorical_cols, nb_neighbors)

# Stack the per-instance blocks with a single concatenate call; growing the
# array inside a loop re-copies everything accumulated so far at every step.
n = np.size(data_test_copy, 0)
all_neighbors = np.concatenate(list_neigh[:n], axis=0)

df_neigh = pd.DataFrame(data=all_neighbors, columns=numerical_cols_names + categorical_cols_names)
# errors='ignore' leaves the columns unchanged when the integer cast fails.
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int, errors='ignore')

# Turn the integer codec codes back into names so that they are one-hot encoded.
df_neigh['codec'] = df_neigh['codec'].replace(format_mapper)
df_neigh['o_codec'] = df_neigh['o_codec'].replace(format_mapper)

# One hot encoding
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)

# Cut the encoded array back into consecutive blocks of nb_neighbors rows,
# one block per test instance.
data_neigh = df_neigh.values
list_neigh = [data_neigh[i * nb_neighbors:(i + 1) * nb_neighbors, :] for i in range(n)]

  return np.random.multivariate_normal(inst_num,covn,n)


## Neighbors Generation (*Version 2*)

In [6]:
# Number of distinct values of every categorical column in the full dataset.
mat_nb_categ = [np.size(data_df[name].unique()) for name in categorical_cols_names]

# Generate the neighbors (version 2).
# NOTE(review): generate_all_neighbors_2 is assumed to return one block of
# nb_neighbors rows per row of data_test_copy - confirm against its definition.
list_neigh_2 = generate_all_neighbors_2(data_test_copy, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors)

# Stack the per-instance blocks with a single concatenate call; growing the
# array inside a loop re-copies everything accumulated so far at every step.
n = np.size(data_test_copy, 0)
all_neighbors_2 = np.concatenate(list_neigh_2[:n], axis=0)

df_neigh_2 = pd.DataFrame(data=all_neighbors_2, columns=numerical_cols_names + categorical_cols_names)
# errors='ignore' leaves the columns unchanged when the integer cast fails.
df_neigh_2[categorical_cols_names] = df_neigh_2[categorical_cols_names].astype(int, errors='ignore')

# Turn the integer codec codes back into names so that they are one-hot encoded.
df_neigh_2['codec'] = df_neigh_2['codec'].replace(format_mapper)
df_neigh_2['o_codec'] = df_neigh_2['o_codec'].replace(format_mapper)

# One hot encoding
df_neigh_2 = pd.get_dummies(df_neigh_2, prefix_sep='_', drop_first=True)

# Cut the encoded array back into consecutive blocks of nb_neighbors rows,
# one block per test instance.
data_neigh_2 = df_neigh_2.values
list_neigh_2 = [data_neigh_2[i * nb_neighbors:(i + 1) * nb_neighbors, :] for i in range(n)]

## Neighbors Generation (*Version 3*)

In [7]:
# Generate the neighbors (version 3).
# NOTE(review): generate_all_neighbors_3 is assumed to return one block of
# nb_neighbors rows per row of data_test_copy - confirm against its definition.
list_neigh_3 = generate_all_neighbors_3(data_test_copy, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors)

# Stack the per-instance blocks with a single concatenate call; growing the
# array inside a loop re-copies everything accumulated so far at every step.
n = np.size(data_test_copy, 0)
all_neighbors_3 = np.concatenate(list_neigh_3[:n], axis=0)

df_neigh_3 = pd.DataFrame(data=all_neighbors_3, columns=numerical_cols_names + categorical_cols_names)
# errors='ignore' leaves the columns unchanged when the integer cast fails.
df_neigh_3[categorical_cols_names] = df_neigh_3[categorical_cols_names].astype(int, errors='ignore')

# Turn the integer codec codes back into names so that they are one-hot encoded.
df_neigh_3['codec'] = df_neigh_3['codec'].replace(format_mapper)
df_neigh_3['o_codec'] = df_neigh_3['o_codec'].replace(format_mapper)

# One hot encoding
df_neigh_3 = pd.get_dummies(df_neigh_3, prefix_sep='_', drop_first=True)

# Cut the encoded array back into consecutive blocks of nb_neighbors rows,
# one block per test instance.
data_neigh_3 = df_neigh_3.values
list_neigh_3 = [data_neigh_3[i * nb_neighbors:(i + 1) * nb_neighbors, :] for i in range(n)]

  return np.random.multivariate_normal(inst_num,varn,n)


## Neighbors Generation (*Version 4*)

In [8]:
# Generate the neighbors (version 4).
# NOTE(review): the meaning of the `special` argument is not visible in this
# notebook; an empty list is passed. generate_all_neighbors_4 is assumed to
# return one block of nb_neighbors rows per row of data_test_copy - confirm.
special = []
list_neigh_4 = generate_all_neighbors_4(data_test_copy, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors, special)

# Stack the per-instance blocks with a single concatenate call; growing the
# array inside a loop re-copies everything accumulated so far at every step.
n = np.size(data_test_copy, 0)
all_neighbors_4 = np.concatenate(list_neigh_4[:n], axis=0)

# Shift columns 5, 7 and 11 up by one.
# NOTE(review): why these particular column positions need the offset is not
# visible here - confirm against generate_all_neighbors_4.
for shifted_col in (5, 7, 11):
    all_neighbors_4[:, shifted_col] = all_neighbors_4[:, shifted_col] + 1

df_neigh_4 = pd.DataFrame(data=all_neighbors_4, columns=numerical_cols_names + categorical_cols_names)
# errors='ignore' leaves the columns unchanged when the integer cast fails.
df_neigh_4[categorical_cols_names] = df_neigh_4[categorical_cols_names].astype(int, errors='ignore')

# Turn the integer codec codes back into names so that they are one-hot encoded.
df_neigh_4['codec'] = df_neigh_4['codec'].replace(format_mapper)
df_neigh_4['o_codec'] = df_neigh_4['o_codec'].replace(format_mapper)

# One hot encoding
df_neigh_4 = pd.get_dummies(df_neigh_4, prefix_sep='_', drop_first=True)

# Cut the encoded array back into consecutive blocks of nb_neighbors rows,
# one block per test instance.
data_neigh_4 = df_neigh_4.values
list_neigh_4 = [data_neigh_4[i * nb_neighbors:(i + 1) * nb_neighbors, :] for i in range(n)]

#### One-hot encoding of the training and test sets

In [9]:
# One-hot encode the training and test features in a single pass so that both
# matrices are guaranteed to share the same dummy columns in the same order.
# Encoding them separately with drop_first=True produces different (or
# differently dropped) columns whenever a category is absent from one split.
n_train_rows = len(data_train_df)
encoded_df = pd.get_dummies(
    pd.concat([data_train_df, data_test_df]),
    prefix_sep='_',
    drop_first=True,
)

# Split the jointly encoded frame back into its two parts (train rows first).
data_train_df = encoded_df.iloc[:n_train_rows]
data_test_df = encoded_df.iloc[n_train_rows:]

data_train = data_train_df.values
target_train = target_train_df.values

data_test = data_test_df.values
target_test = target_test_df.values

## Training the MLP model

In [10]:
" Sklearn MLP regressor "

# Pipeline: standardize the features, then fit an MLP regressor with two
# hidden layers of 50 units each (seeded with random_state=0).
mlp = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        hidden_layer_sizes=(50, 50),
        tol=1e-2,
        max_iter=1000,
        random_state=0,
    ),
)

# Pipeline.fit returns the fitted pipeline itself, so model_nt and mlp are
# the same object.
mlp.fit(data_train, target_train)
model_nt = mlp

# Predictions of the fitted model on the held-out test set.
target_pred_nt = model_nt.predict(data_test)

## Execution of the Split Based Selection Form algorithm

In [11]:
# split_point is the number of numerical columns; nb_models is forwarded
# unchanged to SplitBasedSelectionForm.
split_point = len(numerical_cols)
nb_models = 100


def _select_subgroups(neighbors):
    """Run SplitBasedSelectionForm on the test set with the given neighbor list."""
    return SplitBasedSelectionForm(data_test, target_test, nb_models, model_nt, neighbors, split_point)


# One run per neighbor-generation version, in the same order as before.
L_Subgroups_1, P_1 = _select_subgroups(list_neigh)
L_Subgroups_2, P_2 = _select_subgroups(list_neigh_2)
L_Subgroups_3, P_3 = _select_subgroups(list_neigh_3)
L_Subgroups_4, P_4 = _select_subgroups(list_neigh_4)

In [12]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    name : str
        Destination path without the ``.pkl`` extension.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet no longer raises FileNotFoundError.
    """
    file_path = name + '.pkl'
    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    pkl_path = name + '.pkl'
    with open(pkl_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [13]:
'SAVE THE DATA'

# Directory prefix shared by all the pickles written below.
path = './saved_data/'

# (object, file stem) pairs, written in this order.
_data_artifacts = (
    (data_train, 'data_train'),
    (target_train, 'target_train'),
    (data_test, 'data_test'),
    (target_test, 'target_test'),
    (list_neigh, 'list_neighbors_1'),
    (list_neigh_2, 'list_neighbors_2'),
    (list_neigh_3, 'list_neighbors_3'),
    (list_neigh_4, 'list_neighbors_4'),
)
for _artifact, _file_stem in _data_artifacts:
    save_obj(_artifact, path + _file_stem)

In [14]:
'SAVE THE LIST OF THE SUBGROUPS'
# Persist the subgroup list obtained for each neighbor-generation version.
_subgroup_artifacts = (
    (L_Subgroups_1, 'list_subgroups_1'),
    (L_Subgroups_2, 'list_subgroups_2'),
    (L_Subgroups_3, 'list_subgroups_3'),
    (L_Subgroups_4, 'list_subgroups_4'),
)
for _subgroups, _file_stem in _subgroup_artifacts:
    save_obj(_subgroups, path + _file_stem)