# Imports

In [147]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split

# Data



In [148]:
# Load the complete dataset and keep every column except the first one.
# NOTE(review): the discarded first column is presumably an index written by a
# previous `to_csv` call — confirm against the CSV.
# NOTE(review): '/content/...' is an absolute, environment-specific path.
data = (
    pd.read_csv('/content/dataset_completo.csv', header='infer')
    .iloc[:, 1:]
)


# Classification Model

### Train Test Splitting

In [149]:
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder


In [150]:
# Display the column labels of the loaded dataset.
data.columns

Index(['titolo', 'durata', 'views', 'n_comments', 'n_like', 'genere', 'publ',
       'max_quality', 'score', 'timedelta', 'class'],
      dtype='object')

In [151]:
# Target vector: the 'class' column is what the classifiers below predict.
Y = data.loc[:, 'class']

In [152]:
# Feature matrix: drop the target ('class') and the columns that are not used
# as predictors ('score', 'publ', 'titolo').
# NOTE(review): the notebook text says the classes were defined from a
# home-made score, so 'score' presumably has to stay out of X to avoid leaking
# the target — confirm.
X = data.drop(['class', 'score', 'publ', 'titolo'], axis=1)

In [153]:
# Replace the categorical 'genere' column with integer codes so that the
# estimators below receive a purely numerical feature matrix.
# NOTE(review): integer codes give the categories an arbitrary order; consider
# one-hot encoding for distance/kernel based models such as SVC.
le = LabelEncoder()
X['genere'] = le.fit_transform(X['genere'])

In [155]:
# Hold out 20% of the samples as the test set; the seed is fixed so the split
# is reproducible.
# NOTE(review): the split is not stratified — consider `stratify=Y` if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=42,
)

In [156]:
# Carve a validation set (20%) out of the training portion, which leaves
# 64% / 16% / 20% of the data for train / validation / test.
# NOTE: this overwrites X_train and y_train, so re-running this cell without
# re-running the previous split shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=42,
)

### Excursus

We want to predict the class of a given video. The classes were defined using a home-made score.

To predict the class we will try different models,
starting with a Support Vector Machine classifier (SVC).

### SVC

In [157]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

REMINDER OF THE DEFAULT PARAMETERS OF `SVC`


C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None

Try to use SVC without scaling data

In [165]:
# Baseline: RBF-kernel SVC fitted directly on the raw (unscaled) features and
# scored on the validation set with the macro-averaged F1.
clf = SVC(gamma = 'auto', random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Print the score with a label so the cell output is self-describing.
print('f1-score using SVC without scaling data: ', f1_score(y_val, y_pred, average = 'macro'))

f1-score using SVC without scaling data:  0.16061185468451242


Try to use SVC with Standard Scaler

In [167]:
# Same SVC, preceded by StandardScaler. The scaler lives inside the pipeline,
# so it is fitted on X_train only and merely applied to X_val.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Label the score with the preprocessing that was actually used.
print('f1-score using SVC with StandardScaler: ', f1_score(y_val, y_pred, average = 'macro'))

f1-score using SVC with StandardScaler:  0.8075326343334872


Try to use SVC with MinMaxScaler

In [168]:
# Same SVC, preceded by MinMaxScaler. The scaler lives inside the pipeline,
# so it is fitted on X_train only and merely applied to X_val.
clf = make_pipeline(MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Label the score with the preprocessing that was actually used.
print('f1-score using SVC with MinMaxScaler: ', f1_score(y_val, y_pred, average = 'macro'))

f1-score using SVC with MinMaxScaler:  0.5932135459307548


Try to use SVC with MinMaxScaler and Standard Scaler (in that order)

In [169]:
# Same SVC, preceded by two chained scalers: MinMaxScaler first, then
# StandardScaler. Both are fitted on X_train only, inside the pipeline.
clf = make_pipeline(MinMaxScaler(),StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Label the score with the preprocessing that was actually used.
print('f1-score using SVC with MinMaxScaler + StandardScaler: ', f1_score(y_val, y_pred, average = 'macro'))

f1-score using SVC with MinMaxScaler + StandardScaler:  0.8075326343334872


Try to use SVC with Standard Scaler and MinMaxScaler (in that order)

In [170]:
# Same SVC, preceded by two chained scalers: StandardScaler first, then
# MinMaxScaler. Both are fitted on X_train only, inside the pipeline.
clf = make_pipeline(StandardScaler(),MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Label the score with the preprocessing that was actually used.
print('f1-score using SVC with StandardScaler + MinMaxScaler: ', f1_score(y_val, y_pred, average = 'macro'))

f1-score using SVC with StandardScaler + MinMaxScaler:  0.5932135459307548


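Chaining two scalers gives exactly the same score as using only the last scaler of the chain (both are per-feature linear rescalings, so the first one is undone by the second).


Best results with StandardScaler (macro F1 ≈ 0.81); MinMaxScaler reaches ≈ 0.59 and the unscaled SVC only ≈ 0.16.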

In [172]:
from sklearn.model_selection import GridSearchCV

In [None]:

svc = SVC()

# The search runs on a StandardScaler -> SVC pipeline rather than on the bare
# SVC: on the validation set the standardised pipeline reached a macro F1 of
# about 0.81 against about 0.16 for the unscaled model. With the scaler inside
# the pipeline, GridSearchCV re-fits it on the training part of each CV fold,
# so the held-out fold never influences the scaling statistics.
svc_pipeline = make_pipeline(StandardScaler(), svc)

# Grid definition. make_pipeline names each step after its lower-cased class
# name, hence the 'svc__' prefix on the SVC hyper-parameters.
params = {
    'svc__kernel' : ('poly', 'rbf', 'sigmoid'),
    'svc__C' : np.linspace(1, 100, num=5),  # 1, 25.75, 50.5, 75.25, 100
    # Axes left out to keep the search at 15 candidates:
    #'svc__degree' : [3,5,8],
    #'svc__gamma' : ('auto','scale')
}

# 3 kernels x 5 values of C, scored with macro F1. refit=True retrains the best
# configuration on the whole of X_train, so best_estimator_ is ready to predict.
search = GridSearchCV(svc_pipeline,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [None]:
# Take the winning configuration of the grid search (already refitted on the
# full training set because refit=True) and predict the validation samples.
best_one = search.best_estimator_
y_pred = best_one.predict(X_val)


In [None]:
# Macro-averaged F1 of the tuned model on the validation set.
f1_score(y_true=y_val, y_pred=y_pred, average='macro')

### RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [None]:
# Baseline random forest fitted on the raw features; `fit` returns the
# estimator itself, so construction and training are chained.
clf = RFC(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_val)
macro_f1 = f1_score(y_val, y_pred, average='macro')
print('f1-score using Random Forest without scaling data: ', macro_f1)

In [None]:
# Random forest preceded by MinMaxScaler (fitted on X_train only, inside the
# pipeline), scored on the validation set with the macro-averaged F1.
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Labelled like the unscaled Random Forest cell so the outputs can be told apart.
print('f1-score using Random Forest with MinMaxScaler: ', f1_score(y_val, y_pred, average = 'macro'))

In [None]:
# Random forest preceded by StandardScaler (fitted on X_train only, inside the
# pipeline), scored on the validation set with the macro-averaged F1.
clf = make_pipeline(StandardScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
# Labelled like the unscaled Random Forest cell so the outputs can be told apart.
print('f1-score using Random Forest with StandardScaler: ', f1_score(y_val, y_pred, average = 'macro'))

In [None]:
# Pipeline tuned by the grid search below. Parallelism is left to GridSearchCV
# (n_jobs=-1): the forest keeps its default n_jobs so that every search worker
# does not start its own full set of threads on top (CPU oversubscription).
# The fitted trees do not depend on n_jobs, so the scores are unaffected.
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))

# Grid definition. make_pipeline names each step after its lower-cased class
# name, hence the 'randomforestclassifier__' prefix.
params = {'randomforestclassifier__criterion' : ['gini','entropy'],
          'randomforestclassifier__max_features' : [None],  # consider all features at each split
          'randomforestclassifier__n_estimators' : [50,100,200]
}

# 2 criteria x 3 forest sizes, scored with macro F1. refit=True retrains the
# best configuration on the whole of X_train.
search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

In [None]:
# Take the winning configuration of the grid search (already refitted on the
# full training set because refit=True) and predict the validation samples.
best_one = search.best_estimator_
y_pred = best_one.predict(X_val)


In [None]:
# Macro-averaged F1 of the tuned model on the validation set.
f1_score(y_true=y_val, y_pred=y_pred, average='macro')