# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Data

In [2]:
data = pd.read_csv('dataset_completo', header ='infer').iloc[:,1:]

## Titles elaboration

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/simone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def clean(string):
    tokenizer = RegexpTokenizer(r'\w+')
    string=tokenizer.tokenize(string)
    for word in range(len(string)):
        string[word] = string[word].lower() 
    #Remove stopwords
    string = [word for word in string if not word in stopwords.words()]
    #STEMMING
    stemmer = PorterStemmer()
    string = [stemmer.stem(word) for word in string]
    return " ".join(string)

In [5]:
#remove all the bad stuff we don't need
titles=list(data['titolo'])
for i,title in tqdm(enumerate(titles)):
    titles[i]=clean(title)

4433it [01:24, 52.34it/s]


In [6]:
#create dictionary with key the word and value the number of times it has been seen
diz={}
for title in titles:
    for word in title.split(' '):
        if word in diz:
            diz[word]+=1
        else:
            diz[word]=1  
print('the total number of different words is: ',len(diz.keys()))
print('the total number of words is: ',np.sum(list(diz.values())))

the total number of different words is:  13015
the total number of words is:  25322


In [7]:
#update value of the dictionary with its prob computed as #(times is observed)/#(total words)
tot=np.sum(list(diz.values()))
for key in diz.keys():
    diz[key]=diz[key]/tot

In [8]:
prob_title=[]
for title in tqdm(titles):
    somma=0
    for word in title.split(' '):
        somma+=(diz[word])
    prob_title.append(somma)

100%|███████████████████████████████████| 4433/4433 [00:00<00:00, 537442.18it/s]


In [9]:
data=data.drop(columns = ['titolo'])
data['titolo']=prob_title
data['titolo']=(data['titolo']-data['titolo'].min())/(data['titolo'].max()-data['titolo'].min())

# Classification Model

### Train Test Splitting

In [10]:
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [11]:
data.columns

Index(['durata', 'views', 'n_comments', 'n_like', 'genere', 'publ',
       'max_quality', 'score', 'timedelta', 'class', 'titolo'],
      dtype='object')

In [12]:
Y = data['class'] # Extract the target feature

In [13]:
X = data.drop(columns = ['class','score','publ']) # Remove from the data useful to the analysis: "score, publ, titolo, class"

In [14]:
le = LabelEncoder()
X['genere'] = le.fit_transform(X['genere']) # Transform the Categorical genere feature in a numerical feature

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,Y, train_size=.8, random_state=42) # Split dataset in train and test

In [16]:
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train, train_size = .8, random_state = 42) # Split train in train and validation

### Excursus

We want to predict the class of a given video. The classes were defined using a home-made score.

To predict we will try different models.
Let's start with SVM

### SVC

In [17]:
from sklearn.svm import SVC

REMINDER OF THE PARAMETERS


C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None

Try to use SVC without scaling data

In [18]:
clf = SVC(gamma = 'auto', random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.16061185468451242
0.4732394366197183 0.25


Try to use SVC with Standard Scaler

In [19]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8001545157307074
0.8183098591549296 0.7949318076745261


Try to use SVC with MinMaxScaler

In [20]:
clf = make_pipeline(MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5908952871236659
0.7394366197183099 0.5721278317152104


Try to use SVC with MinMaxScaler and Standard Scaler (in that order)

In [21]:
clf = make_pipeline(MinMaxScaler(),StandardScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.8001545157307074
0.8183098591549296 0.7949318076745261


Try to use SVC with Standard Scaler and MinMaxScaler (in that order)

In [22]:
clf = make_pipeline(StandardScaler(),MinMaxScaler(), SVC(gamma='auto',random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

0.5908952871236659
0.7394366197183099 0.5721278317152104


Same Results as One Scaler.


Best Results without Scaling.

In [None]:
svc = SVC()

#def gridsearch
params = {
    'kernel' : ('poly', 'rbf', 'sigmoid'),
    'C' : np.linspace(1, 100, num=5), 
    #'degree' : [3,5,8],
    #'gamma' : ('auto','scale')
    
}

search = GridSearchCV(svc,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1, refit=True, verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [None]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)

In [None]:
f1_score(y_val, y_pred, average = 'macro')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [None]:
clf = RFC(random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using Random Forest without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(StandardScaler(), RFC(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(MinMaxScaler(), RFC(random_state = 42, n_jobs = -1))

#def gridsearch
params = {'randomforestclassifier__criterion' : ['gini','entropy'],
          'randomforestclassifier__max_features' : [None],
          'randomforestclassifier__n_estimators' : [50,100,200]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

In [None]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
f1_score(y_val, y_pred, average = 'macro')

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
clf = XGBClassifier(random_state=42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost without scaling data: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using MinMaxScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(StandardScaler(), XGBClassifier(random_state=42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('f1-score using XGBoost using StandardScaler: ',f1_score(y_val, y_pred, average = 'macro'))
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'))

In [None]:
clf = make_pipeline(MinMaxScaler(), XGBClassifier(random_state=42))

#def gridsearch
params = {'xgbclassifier__learning_rate' : [.2,.3,.4],
          'xgbclassifier__n_estimators' : [600,800,1000]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=10, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

In [None]:
xgb_params=search.best_params_
xgb_params

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

In [None]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

### One vs Rest Classifier - ORCO

In [None]:
from sklearn.multiclass import OneVsRestClassifier as ORC

In [None]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

### One vs One Classifier - ORCO2

In [None]:
from sklearn.multiclass import OneVsOneClassifier as OOC

In [None]:
clf=make_pipeline(MinMaxScaler(), XGBClassifier(learning_rate=xgb_params['xgbclassifier__learning_rate'],n_estimators=xgb_params['xgbclassifier__n_estimators'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, multi_class = 'multinomial',
                                                         solver = 'newton-cg',
                                                         n_jobs = -1))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
clf = make_pipeline(StandardScaler(),KNeighborsClassifier(n_jobs = -1) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = make_pipeline(StandardScaler(),DecisionTreeClassifier(random_state = 42) )
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall_fscore_support(y_val, y_pred, average='macro')[0])
print('RECALL:',precision_recall_fscore_support(y_val, y_pred, average='macro')[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

In [None]:
clf = make_pipeline(MinMaxScaler(), GradientBoostingClassifier(random_state = 42))
clf.fit(X_train,y_train)
y_pred = clf.predict(X_val)
precision_recall = precision_recall_fscore_support(y_val, y_pred, average='macro')
print('F1 SCORE: ',f1_score(y_val, y_pred, average = 'macro'))
print('PRECISION:',precision_recall[0])
print('RECALL:',precision_recall[1])
print('ACCURACY:', sum(y_pred == y_val)/len(y_val))

In [None]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 42))

#def gridsearch
params = {'gradientboostingclassifier__learning_rate' : [.2,.225,.25],#.285
          'gradientboostingclassifier__n_estimators' : [70,75,80],
          #'gradientboostingclassifier__n_iter_no_change':[2,5,10],
          'gradientboostingclassifier__max_depth':[3,4,5]
         }

search = GridSearchCV(clf,param_grid=params,scoring='f1_macro',
                                  n_jobs=-1,refit=True,verbose=1, pre_dispatch='10*n_jobs',
                                  return_train_score=True)
search.fit(X_train,y_train)

In [None]:
gb_params=search.best_params_
gb_params

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

In [None]:
best_one=search.best_estimator_
y_pred = best_one.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

### One vs Rest Classifier (GB) - ORCO

In [None]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=ORC(clf)
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))

### One vs One Classifier (GB) - ORCO2

In [None]:
clf=make_pipeline(StandardScaler(), GradientBoostingClassifier(learning_rate=gb_params['gradientboostingclassifier__learning_rate'],n_estimators=gb_params['gradientboostingclassifier__n_estimators'],max_depth=gb_params['gradientboostingclassifier__max_depth'],random_state=42))
classifier=OOC(clf)
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_val)
print(accuracy_score(y_val, y_pred), recall_score(y_val, y_pred, average='macro'), f1_score(y_val, y_pred, average = 'macro'))