# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Data

In [4]:
data = pd.read_csv('dataset_esteso.csv', header ='infer')#.iloc[:,1:]

In [7]:
data

Unnamed: 0.1,Unnamed: 0,titolo,durata,views,n_comments,n_like,genere,subscribers,publ,max_quality,timedelta,score
0,0,Kinect Sports - Tennis da tavolo Gameplay - Ev...,340,13328,12,11,Gaming,110000,2010-11-11,4,0.298205,0.218196
1,1,Pro Skaters at Morrison Hill,210,5664,13,46,Sports,317,2009-07-20,4,0.219331,0.167308
2,2,Yiruma-Shining Smile,205,11269,69,46,Music,431,2007-11-03,4,0.116417,0.093694
3,3,Guitarras de America (Portafolio),294,3823,6,7,Music,435,2009-07-15,3,0.218508,0.137809
4,4,Review of Microsoft Band as a fitness watch,216,5464,4,32,Science & Technology,20200,2014-11-06,6,0.537955,0.397559
...,...,...,...,...,...,...,...,...,...,...,...,...
8937,9122,Rob Ruggiero #7 Bridgewater State Football Hig...,210,4336,3,9,People & Blogs,0,2014-01-07,4,0.488062,0.318213
8938,9123,Rąbanie drewna w oponie,211,271266,42,223,Nonprofits & Activism,7800,2012-03-17,3,0.379219,0.415023
8939,9124,Open Heart four leaf clover nail design for Sa...,383,2852,9,44,Howto & Style,36700,2013-03-11,4,0.438334,0.314836
8940,9125,Campeonato Brasileiro Série C 2013: Treze 1x0 ...,361,4043,6,43,Sports,4140,2013-10-14,4,0.474066,0.349952


## Titles elaboration

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/simone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def clean(string):
    tokenizer = RegexpTokenizer(r'\w+')
    string=tokenizer.tokenize(string)
    for word in range(len(string)):
        string[word] = string[word].lower() 
    #Remove stopwords
    string = [word for word in string if not word in stopwords.words()]
    #STEMMING
    stemmer = PorterStemmer()
    string = [stemmer.stem(word) for word in string]
    return " ".join(string)

In [5]:
#remove all the bad stuff we don't need
titles=list(data['titolo'])
for i,title in tqdm(enumerate(titles)):
    titles[i]=clean(title)

4433it [01:24, 52.34it/s]


In [6]:
#create dictionary with key the word and value the number of times it has been seen
diz={}
for title in titles:
    for word in title.split(' '):
        if word in diz:
            diz[word]+=1
        else:
            diz[word]=1  
print('the total number of different words is: ',len(diz.keys()))
print('the total number of words is: ',np.sum(list(diz.values())))

the total number of different words is:  13015
the total number of words is:  25322


In [7]:
#update value of the dictionary with its prob computed as #(times is observed)/#(total words)
tot=np.sum(list(diz.values()))
for key in diz.keys():
    diz[key]=diz[key]/tot

In [8]:
prob_title=[]
for title in tqdm(titles):
    somma=0
    for word in title.split(' '):
        somma+=(diz[word])
    prob_title.append(somma)

100%|███████████████████████████████████| 4433/4433 [00:00<00:00, 537442.18it/s]


In [9]:
data=data.drop(columns = ['titolo'])
data['titolo']=prob_title
data['titolo']=(data['titolo']-data['titolo'].min())/(data['titolo'].max()-data['titolo'].min())

# Classification Model

### Train Test Splitting

In [10]:
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [11]:
data.columns

Index(['durata', 'views', 'n_comments', 'n_like', 'genere', 'publ',
       'max_quality', 'score', 'timedelta', 'class', 'titolo'],
      dtype='object')

In [12]:
Y = data['class'] # Extract the target feature

In [13]:
X = data.drop(columns = ['class','score','publ']) # Remove from the data useful to the analysis: "score, publ, titolo, class"

In [14]:
le = LabelEncoder()
X['genere'] = le.fit_transform(X['genere']) # Transform the Categorical genere feature in a numerical feature

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,Y, train_size=.8, random_state=42) # Split dataset in train and test

In [16]:
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train, train_size = .8, random_state = 42) # Split train in train and validation

### Excursus

We want to predict the class of a given video. The classes were defined using a home-made score.

To predict we will try different models.
Let's start with SVM

### SVC

In [17]:
from sklearn.svm import SVC

REMINDER OF THE PARAMETERS


C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None

Try to use SVC without scaling data

In [18]:
clf = SVC(gamma = 'auto', random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.16061185468451242
0.4732394366197183 0.25


Try to use SVC with Standard Scaler

In [19]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8001545157307074
0.8183098591549296 0.7949318076745261


Try to use SVC with MinMaxScaler

In [20]:
clf = make_pipeline(MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5908952871236659
0.7394366197183099 0.5721278317152104


Try to use SVC with MinMaxScaler and Standard Scaler (in that order)

In [21]:
clf = make_pipeline(MinMaxScaler(),StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8001545157307074
0.8183098591549296 0.7949318076745261


Try to use SVC with Standard Scaler and MinMaxScaler (in that order)

In [22]:
clf = make_pipeline(StandardScaler(),MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5908952871236659
0.7394366197183099 0.5721278317152104


Same Results as One Scaler.


Best Results without Scaling.

In [23]:
svc = SVC()

#def gridsearch
params = {
    'kernel' : ('poly', 'rbf', 'sigmoid'),
    'C' : np.linspace(1, 100, num=5), 
    #'degree' : [3,5,8],
    #'gamma' : ('auto','scale')
    
}

search = GridSearchCV(svc,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1, refit=True, verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': array([  1.  ,  25.75,  50.5 ,  75.25, 100.  ]),
                         'kernel': ('poly', 'rbf', 'sigmoid')},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=10)

In [24]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)

In [25]:
f1_score(y_val, y_pred, average = 'macro')

0.31184183717669406

### Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [27]:
clf = RFC(random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using Random Forest without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using Random Forest without scaling data:  0.9276672522324696
0.9352112676056338 0.9225135806749885


In [28]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9301919059217758
0.9380281690140845 0.9245737979657882


In [29]:
clf = make_pipeline(StandardScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9266516527250583
0.9338028169014084 0.9208304438280166


In [30]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42, n_jobs = -1))

#def gridsearch
params = {'randomforestclassifier__criterion' : ['gini','entropy'],
          'randomforestclassifier__max_features' : [None],
          'randomforestclassifier__n_estimators' : [50,100,200]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'randomforestclassifier__max_features': [None],
                         'randomforestclassifier__n_estimators': [50, 100,
                                                                  200]},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=10)

In [31]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9478873239436619 0.9497457235321314


In [32]:
f1_score(y_val, y_pred, average = 'macro')

0.9506803687468519

### XGBoost

In [33]:
from xgboost import XGBClassifier

In [34]:
clf = XGBClassifier(random_state=42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost without scaling data:  0.9603909262916284
0.9605633802816902 0.9599341192787796


In [35]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using MinMaxScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using MinMaxScaler:  0.9603909262916284
0.9605633802816902 0.9599341192787796


In [36]:
clf = make_pipeline(StandardScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using StandardScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using StandardScaler:  0.9603909262916284
0.9605633802816902 0.9599341192787796


In [37]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))

#def gridsearch
params = {'xgbclassifier__learning_rate' : [.2,.3,.4],
          'xgbclassifier__n_estimators' : [600,800,1000]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                               

In [38]:
xgb_params=search.best_params_
xgb_params

{'xgbclassifier__learning_rate': 0.3, 'xgbclassifier__n_estimators': 600}

In [39]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

array([[198,   8,   0,   0],
       [  6, 324,   6,   0],
       [  0,   6, 133,   1],
       [  0,   0,   1,  27]])

In [40]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9605633802816902 0.9609757859454462 0.9607088856336363


### One vs Rest Classifier - ORCO

In [41]:
from sklearn.multiclass import OneVsRestClassifier as ORC

In [42]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)



OneVsRestClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                              ('xgbclassifier',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_bytree=None,
                                                             enable_categorical=False,
                                                             gamma=None,
                                                             gpu_id=None,
                                                             importance_type=None,
                                                             interaction_constraints=None,
                                          

In [43]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9690140845070423 0.9674208275543228 0.9714435290974552


### One vs One Classifier - ORCO2

In [44]:
from sklearn.multiclass import OneVsOneClassifier as OOC

In [45]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)



OneVsOneClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                             ('xgbclassifier',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            enable_categorical=False,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type=None,
                                                            interaction_constraints=None,
                                                      

In [46]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9591549295774648 0.9684162621359224 0.9676597958895383


### Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, multi_class = 'multinomial',
                                                         solver = 'newton-cg',
                                                         n_jobs = -1))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.719891418878267
PRECISION: 0.7567042708442585
RECALL: 0.6952525427646787
ACCURACY: 0.7859154929577464


### KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
clf = make_pipeline(StandardScaler(),KNeighborsClassifier(n_jobs = -1) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.6966332868104037
PRECISION: 0.7329750726280289
RECALL: 0.6708911234396671
ACCURACY: 0.7366197183098592


### Decision Tree Classifier

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [52]:
clf = make_pipeline(StandardScaler(),DecisionTreeClassifier(random_state = 42) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.9167199780371165
PRECISION: 0.9035016321077023
RECALL: 0.9320475034674064
ACCURACY: 0.9140845070422535


### Gradient Boosting

In [53]:
from sklearn.ensemble import GradientBoostingClassifier

In [54]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.9713753646704398
PRECISION: 0.9721155792929519
RECALL: 0.9706484049930653
ACCURACY: 0.9633802816901409


In [55]:
clf = make_pipeline(MinMaxScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.9701598836087464
PRECISION: 0.9695111550374709
RECALL: 0.9708434466019418
ACCURACY: 0.9619718309859155


In [56]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))

#def gridsearch
params = {'gradientboostingclassifier__learning_rate' : [.2,.225,.25],#.285
          'gradientboostingclassifier__n_estimators' : [70,75,80],
          #'gradientboostingclassifier__n_iter_no_change':[2,5,10],
          'gradientboostingclassifier__max_depth':[3,4,5]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=1, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.2,
                                                                       0.225,
                                                                       0.25],
                         'gradientboostingclassifier__max_depth': [3, 4, 5],
                         'gradientboostingclassifier__n_estimators': [70, 75,
                                                                      80]},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=1)

In [57]:
gb_params=search.best_params_
gb_params

{'gradientboostingclassifier__learning_rate': 0.25,
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__n_estimators': 75}

In [58]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

array([[200,   6,   0,   0],
       [  9, 321,   6,   0],
       [  0,   6, 134,   0],
       [  0,   0,   0,  28]])

In [59]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.967605633802817 0.9743917591308369 0.9748965046589191


### One vs Rest Classifier (GB) - ORCO

In [60]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)

OneVsRestClassifier(estimator=Pipeline(steps=[('standardscaler',
                                               StandardScaler()),
                                              ('gradientboostingclassifier',
                                               GradientBoostingClassifier(learning_rate=0.25,
                                                                          n_estimators=75,
                                                                          random_state=42))]))

In [61]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9619718309859155 0.9703739019879797 0.9698794055513708


### One vs One Classifier (GB) - ORCO2

In [62]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)

OneVsOneClassifier(estimator=Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('gradientboostingclassifier',
                                              GradientBoostingClassifier(learning_rate=0.25,
                                                                         n_estimators=75,
                                                                         random_state=42))]))

In [63]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9633802816901409 0.9608500924641701 0.9620311123299864
[CV 4/5; 1/15] START C=1.0, kernel=poly.........................................
[CV 4/5; 1/15] END C=1.0, kernel=poly;, score=(train=0.182, test=0.211) total time=  11.7s
[CV 3/5; 12/15] START C=75.25, kernel=sigmoid...................................
[CV 3/5; 12/15] END C=75.25, kernel=sigmoid;, score=(train=0.236, test=0.172) total time=   0.5s
[CV 1/5; 13/15] START C=100.0, kernel=poly......................................
[CV 1/5; 13/15] END C=100.0, kernel=poly;, score=(train=0.209, test=0.154) total time=  46.6s
[CV 2/5; 3/6] START randomforestclassifier__criterion=gini, randomforestclassifier__max_features=None, randomforestclassifier__n_estimators=200
[CV 2/5; 3/6] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=None, randomforestclassifier__n_estimators=200;, score=(train=1.000, test=0.940) total time=   1.7s
[CV 1/5; 2/9] START xgbclassifier__learning_rate=0.2, xgbclassifier__n_estimato