# Imports

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
import warnings
warnings.filterwarnings("ignore")

# Data



In [101]:
data = pd.read_csv('dataset_completo', header ='infer').iloc[:,1:]

# Classification Model

### Train Test Splitting

In [102]:
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [103]:
data.columns

Index(['titolo', 'durata', 'views', 'n_comments', 'n_like', 'genere', 'publ',
       'max_quality', 'score', 'timedelta', 'class'],
      dtype='object')

In [104]:
Y = data['class'] # Extract the target feature

In [105]:
X = data.drop(columns = ['class','score','publ','titolo']) # Remove from the data useful to the analysis: "score, publ, titolo, class"

In [106]:
le = LabelEncoder()
X['genere'] = le.fit_transform(X['genere']) # Transform the Categorical genere feature in a numerical feature

In [107]:
X_train,X_test,y_train,y_test = train_test_split(X,Y, train_size=.8, random_state=42) # Split dataset in train and test

In [108]:
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train, train_size = .8, random_state = 42) # Split train in train and validation

### Excursus

We want to predict the class of a given video. The classes were defined using a home-made score.

To predict we will try different models.
Let's start with SVM

### SVC

In [109]:
from sklearn.svm import SVC

REMINDER OF THE PARAMETERS


C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None

Try to use SVC without scaling data

In [110]:
clf = SVC(gamma = 'auto', random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.16061185468451242
0.4732394366197183 0.25


Try to use SVC with Standard Scaler

In [111]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8075326343334872
0.8267605633802817 0.803460182616736


Try to use SVC with MinMaxScaler

In [112]:
clf = make_pipeline(MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5932135459307548
0.7422535211267606 0.5745550161812297


Try to use SVC with MinMaxScaler and Standard Scaler (in that order)

In [113]:
clf = make_pipeline(MinMaxScaler(),StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8075326343334872
0.8267605633802817 0.803460182616736


Try to use SVC with Standard Scaler and MinMaxScaler (in that order)

In [114]:
clf = make_pipeline(StandardScaler(),MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5932135459307548
0.7422535211267606 0.5745550161812297


Same Results as One Scaler.


Best Results without Scaling.

In [115]:
svc = SVC()

#def gridsearch
params = {
    'kernel' : ('poly', 'rbf', 'sigmoid'),
    'C' : np.linspace(1, 100, num=5), 
    #'degree' : [3,5,8],
    #'gamma' : ('auto','scale')
    
}

search = GridSearchCV(svc,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1, refit=True, verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': array([  1.  ,  25.75,  50.5 ,  75.25, 100.  ]),
                         'kernel': ('poly', 'rbf', 'sigmoid')},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=10)

In [116]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)

In [117]:
f1_score(y_val, y_pred, average = 'macro')

0.31184183717669406

### RANDOM FOREST

In [118]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [119]:
clf = RFC(random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using Random Forest without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using Random Forest without scaling data:  0.9425383988190441
0.9436619718309859 0.9377152681460934


In [120]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9462186951230818
0.9436619718309859 0.9366736014794268


In [121]:
clf = make_pipeline(StandardScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9425383988190441
0.9436619718309859 0.9377152681460934


In [122]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42, n_jobs = -1))

#def gridsearch
params = {'randomforestclassifier__criterion' : ['gini','entropy'],
          'randomforestclassifier__max_features' : [None],
          'randomforestclassifier__n_estimators' : [50,100,200]
          
    
    
}

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'randomforestclassifier__max_features': [None],
                         'randomforestclassifier__n_estimators': [50, 100,
                                                                  200]},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=10)

In [123]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9464788732394366 0.9490016759130837


In [124]:
f1_score(y_val, y_pred, average = 'macro')

0.9497116153062941

### XGBoost

In [125]:
from xgboost import XGBClassifier

In [126]:
clf = XGBClassifier(random_state=42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost without scaling data:  0.9584295743338952
0.9577464788732394 0.9579764794267223


In [127]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using MinMaxScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using MinMaxScaler:  0.9584295743338952
0.9577464788732394 0.9579764794267223


In [128]:
clf = make_pipeline(StandardScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using StandardScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using StandardScaler:  0.9584295743338952
0.9577464788732394 0.9579764794267223


In [129]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))

#def gridsearch
params = {'xgbclassifier__learning_rate' : [.2,.3,.4],
          'xgbclassifier__n_estimators' : [600,800,1000]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                               

In [130]:
xgb_params=search.best_params_
xgb_params

{'xgbclassifier__learning_rate': 0.3, 'xgbclassifier__n_estimators': 1000}

In [131]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

array([[197,   9,   0,   0],
       [  7, 323,   6,   0],
       [  0,   6, 133,   1],
       [  0,   0,   1,  27]])

In [132]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9619718309859155 0.9622919556171984 0.9619776509111083


### One vs Rest Classifier - ORCO

In [133]:
from sklearn.multiclass import OneVsRestClassifier as ORC

In [134]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)



OneVsRestClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                              ('xgbclassifier',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_bytree=None,
                                                             enable_categorical=False,
                                                             gamma=None,
                                                             gpu_id=None,
                                                             importance_type=None,
                                                             interaction_constraints=None,
                                          

In [135]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9661971830985916 0.9654631877022655 0.9694869948139254


### One vs One Classifier - ORCO2

In [136]:
from sklearn.multiclass import OneVsOneClassifier as OOC

In [137]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)



OneVsOneClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                             ('xgbclassifier',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            enable_categorical=False,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type=None,
                                                            interaction_constraints=None,
                                                      

In [138]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9647887323943662 0.9703507859454461 0.9682739122429602
[CV 5/5; 2/15] START C=1.0, kernel=rbf..........................................
[CV 5/5; 2/15] END C=1.0, kernel=rbf;, score=(train=0.313, test=0.341) total time=   0.5s
[CV 3/5; 5/15] START C=25.75, kernel=rbf........................................
[CV 3/5; 5/15] END C=25.75, kernel=rbf;, score=(train=0.332, test=0.339) total time=   0.5s
[CV 1/5; 7/15] START C=50.5, kernel=poly........................................
[CV 1/5; 7/15] END C=50.5, kernel=poly;, score=(train=0.203, test=0.154) total time=  22.0s
[CV 4/5; 1/6] START randomforestclassifier__criterion=gini, randomforestclassifier__max_features=None, randomforestclassifier__n_estimators=50
[CV 4/5; 1/6] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=None, randomforestclassifier__n_estimators=50;, score=(train=1.000, test=0.933) total time=   0.4s
[CV 1/5; 5/6] START randomforestclassifier__criterion=entropy, randomforestclassifier__ma