# Imports

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Data

In [24]:
data = pd.read_csv('dataset_esteso.csv', header ='infer').iloc[:,1:]

## Feature engineering

In [25]:
data['log(views)/log(com))']=(np.log(2+data['views']))/np.log((2+data['n_comments']))
data['log(com)/log(views))']=(np.log(2+data['n_comments']))/np.log((2+data['views']))

## Titles elaboration

In [26]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/simone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def clean(string):
    tokenizer = RegexpTokenizer(r'\w+')
    string=tokenizer.tokenize(string)
    for word in range(len(string)):
        string[word] = string[word].lower() 
    #Remove stopwords
    string = [word for word in string if not word in stopwords.words()]
    #STEMMING
    stemmer = PorterStemmer()
    string = [stemmer.stem(word) for word in string]
    return " ".join(string)

In [28]:
#remove all the bad stuff we don't need
titles=list(data['titolo'])
for i,title in tqdm(enumerate(titles)):
    titles[i]=clean(title)

8942it [02:45, 53.88it/s]


In [29]:
#create dictionary with key the word and value the number of times it has been seen
diz={}
for title in titles:
    for word in title.split(' '):
        if word in diz:
            diz[word]+=1
        else:
            diz[word]=1  
print('the total number of different words is: ',len(diz.keys()))
print('the total number of words is: ',np.sum(list(diz.values())))

the total number of different words is:  22237
the total number of words is:  51268


In [30]:
#update value of the dictionary with its prob computed as #(times is observed)/#(total words)
tot=np.sum(list(diz.values()))
for key in diz.keys():
    diz[key]=diz[key]/tot

In [31]:
prob_title=[]
for title in tqdm(titles):
    somma=0
    for word in title.split(' '):
        somma+=(diz[word])
    prob_title.append(somma)

100%|███████████████████████████████████| 8942/8942 [00:00<00:00, 320444.51it/s]


In [32]:
data=data.drop(columns = ['titolo'])
data['titolo']=prob_title
data['titolo']=(data['titolo']-data['titolo'].min())/(data['titolo'].max()-data['titolo'].min())

# Classification Model

### Train Test Splitting

In [33]:
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [34]:
data.columns

Index(['durata', 'views', 'n_comments', 'n_like', 'genere', 'subscribers',
       'publ', 'max_quality', 'timedelta', 'score', 'class',
       'log(views)/log(com))', 'log(com)/log(views))', 'titolo'],
      dtype='object')

In [35]:
Y = data['class'] # Extract the target feature

In [36]:
X = data.drop(columns = ['class','score','publ']) # Remove from the data useful to the analysis: "score, publ, titolo, class"

In [37]:
le = LabelEncoder()
X['genere'] = le.fit_transform(X['genere']) # Transform the Categorical genere feature in a numerical feature

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,Y, train_size=.8, random_state=42) # Split dataset in train and test

In [39]:
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train, train_size = .8, random_state = 42) # Split train in train and validation

### Excursus

We want to predict the class of a given video. The classes were defined using a home-made score.

To predict we will try different models.
Let's start with SVM

### SVC

In [40]:
from sklearn.svm import SVC

REMINDER OF THE PARAMETERS


C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None

Try to use SVC without scaling data

In [41]:
clf = SVC(gamma = 'auto', random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.16518483855872718
0.493361285814116 0.25


Try to use SVC with Standard Scaler

In [42]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.764452285469093
0.822501747030049 0.730862554743103


Try to use SVC with MinMaxScaler

In [43]:
clf = make_pipeline(MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5600878028991642
0.7833682739343116 0.5620668494543095


Try to use SVC with MinMaxScaler and Standard Scaler (in that order)

In [44]:
clf = make_pipeline(MinMaxScaler(),StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.764452285469093
0.822501747030049 0.730862554743103


Try to use SVC with Standard Scaler and MinMaxScaler (in that order)

In [45]:
clf = make_pipeline(StandardScaler(),MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5600878028991642
0.7833682739343116 0.5620668494543095


Same Results as One Scaler.


Best Results without Scaling.

In [46]:
svc = SVC()

#def gridsearch
params = {
    'kernel' : ('poly', 'rbf', 'sigmoid'),
    'C' : np.linspace(1, 100, num=5), 
    #'degree' : [3,5,8],
    #'gamma' : ('auto','scale')
    
}

search = GridSearchCV(svc,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1, refit=True, verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


KeyboardInterrupt: 

In [None]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)

In [None]:
f1_score(y_val, y_pred, average = 'macro')

### Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [48]:
clf = RFC(random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using Random Forest without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using Random Forest without scaling data:  0.9488920740520171
0.9608665269042628 0.9367408175288233


In [49]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9607919557828855
0.9664570230607966 0.9541855825413448


In [50]:
clf = make_pipeline(StandardScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9538189628325174
0.9636617749825297 0.943075563816315


In [51]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42, n_jobs = -1))

#def gridsearch
params = {'randomforestclassifier__criterion' : ['gini','entropy'],
          'randomforestclassifier__max_features' : [None],
          'randomforestclassifier__n_estimators' : [50,100,200]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'randomforestclassifier__max_features': [None],
                         'randomforestclassifier__n_estimators': [50, 100,
                                                                  200]},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=10)

In [52]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.9727463312368972 0.970737915877278


In [53]:
f1_score(y_val, y_pred, average = 'macro')

0.9717853695220419

### XGBoost

In [54]:
from xgboost import XGBClassifier

In [55]:
clf = XGBClassifier(random_state=42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost without scaling data:  0.9795974500885825
0.9790356394129979 0.9761018200890641


In [56]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using MinMaxScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using MinMaxScaler:  0.9795974500885825
0.9790356394129979 0.9761018200890641


In [57]:
clf = make_pipeline(StandardScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using StandardScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

f1-score using XGBoost using StandardScaler:  0.9752241087894786
0.9776380153738644 0.97456846715732


In [58]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))

#def gridsearch
params = {'xgbclassifier__learning_rate' : [.2,.3,.4],
          'xgbclassifier__n_estimators' : [600,800,1000]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




GridSearchCV(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                               

In [59]:
xgb_params=search.best_params_
xgb_params

{'xgbclassifier__learning_rate': 0.4, 'xgbclassifier__n_estimators': 600}

In [60]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

array([[462,  10,   0,   0],
       [  7, 693,   6,   0],
       [  0,   7, 204,   1],
       [  0,   0,   1,  40]])

In [61]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9790356394129979 0.9771025110915817 0.9796287014229188


### One vs Rest Classifier - ORCO

In [62]:
from sklearn.multiclass import OneVsRestClassifier as ORC

In [63]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)



OneVsRestClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                              ('xgbclassifier',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_bytree=None,
                                                             enable_categorical=False,
                                                             gamma=None,
                                                             gpu_id=None,
                                                             importance_type=None,
                                                             interaction_constraints=None,
                                          

In [64]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9825296995108316 0.9763976364323268 0.9795435515885038


### One vs One Classifier - ORCO2

In [65]:
from sklearn.multiclass import OneVsOneClassifier as OOC

In [66]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)



OneVsOneClassifier(estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                             ('xgbclassifier',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            enable_categorical=False,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type=None,
                                                            interaction_constraints=None,
                                                      

In [67]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9790356394129979 0.9700072600257317 0.9743999253500384


### Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, multi_class = 'multinomial',
                                                         solver = 'newton-cg',
                                                         n_jobs = -1))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.7185199445767452
PRECISION: 0.7751135802863467
RECALL: 0.6893042898754981
ACCURACY: 0.8078266946191475


### KNN

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [71]:
clf = make_pipeline(StandardScaler(),KNeighborsClassifier(n_jobs = -1) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.7124996587520716
PRECISION: 0.8073373351203186
RECALL: 0.6651667910081941
ACCURACY: 0.7735849056603774


### Decision Tree Classifier

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
clf = make_pipeline(StandardScaler(),DecisionTreeClassifier(random_state = 42) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.9493410320344047
PRECISION: 0.9482159541509183
RECALL: 0.9504913932399646
ACCURACY: 0.9517819706498952


### Gradient Boosting

In [74]:
from sklearn.ensemble import GradientBoostingClassifier

In [75]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.968781399694038
PRECISION: 0.9739612716177736
RECALL: 0.9639036972255672
ACCURACY: 0.9769392033542977


In [76]:
clf = make_pipeline(MinMaxScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

F1 SCORE:  0.9683287580050023
PRECISION: 0.974132524168669
RECALL: 0.9628473761039462
ACCURACY: 0.976240391334731


In [85]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))

#def gridsearch
params = {'gradientboostingclassifier__learning_rate' : [.25,.3,.35],#.285
          'gradientboostingclassifier__n_estimators' : [70,75,80],
          #'gradientboostingclassifier__n_iter_no_change':[2,5,10],
          'gradientboostingclassifier__max_depth':[3,4,5]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=1, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.25,
                                                                       0.3,
                                                                       0.35],
                         'gradientboostingclassifier__max_depth': [3, 4, 5],
                         'gradientboostingclassifier__n_estimators': [70, 75,
                                                                      80]},
             pre_dispatch='10*n_jobs', return_train_score=True,
             scoring='f1_macro', verbose=1)

In [86]:
gb_params=search.best_params_
gb_params

{'gradientboostingclassifier__learning_rate': 0.25,
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__n_estimators': 75}

In [87]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

array([[462,  10,   0,   0],
       [  7, 694,   5,   0],
       [  0,   6, 205,   1],
       [  0,   0,   2,  39]])

In [88]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9769392033542977 0.9520656838353502 0.9570696032695396


### One vs Rest Classifier (GB) - ORCO

In [89]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)

OneVsRestClassifier(estimator=Pipeline(steps=[('standardscaler',
                                               StandardScaler()),
                                              ('gradientboostingclassifier',
                                               GradientBoostingClassifier(learning_rate=0.25,
                                                                          n_estimators=75,
                                                                          random_state=42))]))

In [90]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9769392033542977 0.9464977838766896 0.9584828856989338


### One vs One Classifier (GB) - ORCO2

In [91]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)

OneVsOneClassifier(estimator=Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('gradientboostingclassifier',
                                              GradientBoostingClassifier(learning_rate=0.25,
                                                                         n_estimators=75,
                                                                         random_state=42))]))

In [92]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

0.9783368273934312 0.9700042591134543 0.9730260490962801
