# Bag Of Words - Metacritics Sentiment Analysis
10 fold Cross Validation
## [Panda](https://github.com/PANDA-UFSCar) - 2023/2
Autores: João Ricardo Lovato e [Letícia Bossato Marchezi](linkedin.com/in/letmarchezi/)

In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [2]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)
%cd "gdrive/MyDrive/Grupo 1 - Processamento de Linguagem Natural/Data/"

Mounted at /content/gdrive
/content/gdrive/.shortcut-targets-by-id/1ub11KA5pjUO4RCNqv5VFfBB4Tnv21ooN/Grupo 1 - Processamento de Linguagem Natural/Data


In [3]:
import pandas as pd
df = pd.read_csv("preprocessed-metacritics-total.csv")

In [4]:
df.head()

Unnamed: 0,Movie name,Review,Created at,Score,Genre
0,Arrival,"['denis', 'villeneuve', 'shows', 'us', 'all', ...","OCT 3, 2022",1.0,Mistery
1,Arrival,"['amy', 'adams', 'gives', 'a', 'superb', 'perf...","MAR 7, 2022",1.0,Mistery
2,Arrival,"['this', 'movie', 'is', 'not', 'for', 'everyon...","DEC 6, 2019",1.0,Mistery
3,Arrival,"['arrival', 'is', 'one', 'of', 'my', 'favorite...","APR 3, 2020",1.0,Mistery
4,Arrival,"['i', 'do', 'not', 'think', 'this', 'movie', '...","MAR 2, 2020",1.0,Mistery


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6480 entries, 0 to 6479
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie name  6480 non-null   object 
 1   Review      6480 non-null   object 
 2   Created at  6480 non-null   object 
 3   Score       6480 non-null   float64
 4   Genre       6480 non-null   object 
dtypes: float64(1), object(4)
memory usage: 253.2+ KB


In [6]:
df['Review'] = df['Review'].apply(lambda x: ast.literal_eval(x))
df.head()


Unnamed: 0,Movie name,Review,Created at,Score,Genre
0,Arrival,"[denis, villeneuve, shows, us, all, his, talen...","OCT 3, 2022",1.0,Mistery
1,Arrival,"[amy, adams, gives, a, superb, performance, in...","MAR 7, 2022",1.0,Mistery
2,Arrival,"[this, movie, is, not, for, everyone, if, you,...","DEC 6, 2019",1.0,Mistery
3,Arrival,"[arrival, is, one, of, my, favorite, sci, fi, ...","APR 3, 2020",1.0,Mistery
4,Arrival,"[i, do, not, think, this, movie, is, about, th...","MAR 2, 2020",1.0,Mistery


In [7]:
X = df['Review'].apply(lambda x: " ".join(x))
y = df['Score']

In [8]:
def calc_mean_metrics(metrics_list_dic):
  # Initialize dictionaries to store the sum of each metric
  mean_metrics = {'Accuracy': 0, 'Precision': 0, 'Recall': 0, 'F1-Score': 0}

  # Calculate the sum of each metric
  for metrics_dict in metrics_list_dic:
      for metric, value in metrics_dict.items():
          mean_metrics[metric] += value

  # Calculate the mean of each metric
  num_metrics = len(metrics_list_dic)
  mean_metrics = {metric: mean_metrics[metric] / num_metrics for metric in mean_metrics}

  # Print the mean of each metric
  for metric, value in mean_metrics.items():
      print(f"Mean {metric}: {value}")
  return mean_metrics

In [17]:
def custom_cross_val_metrics(classifier, X, y, k=10,verbose=False,standarlization=False):
    """
    Perform k-fold cross-validation and calculate metrics for each fold.

    Parameters:
    - classifier: The classifier to evaluate.
    - X: The feature data (list of arrays).
    - y: The target labels.
    - k: The number of folds for cross-validation.

    Returns:
    - List of dictionaries containing metrics for each fold.
    """
    metrics_list = []
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = np.array(y)[train_index], np.array(y)[test_index]  # Use np.array to index y

        # Bag of Words
        count_vect = CountVectorizer()  # Convercao das strings para uma matriz de contagem dos tokens

        # Tranformacao das string em uma matriz de string-termo (strings e termos das strings)
        # Extracao da contagem de tokens das strings usando o vocabulario anterior
        X_train_bow = count_vect.fit_transform(X_train)
        X_test_bow = count_vect.transform(X_test)
        if(standarlization):

            inst_scaler = preprocessing.StandardScaler(with_mean=False)
            #print("Média do dataset de treino antes do standardization: ",X_train_bow.mean())
            X_train_bow = inst_scaler.fit_transform(X_train_bow)
            #print("Média após:",X_train_bow.mean(),"\n")

            inst_scaler = preprocessing.StandardScaler(with_mean=False)
            #print("Média do dataset de teste antes do standardization: ",X_test_bow.mean())
            X_test_bow = inst_scaler.fit_transform(X_test_bow)
            #print("Média após:",X_test_bow.mean())

        # Fit the classifier on the training data
        classifier.fit(X_train_bow, y_train)

        # Make predictions on the test data
        y_pred = classifier.predict(X_test_bow)
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        metrics_dict = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        }

        metrics_list.append(metrics_dict)
    if (verbose):
        for i, metrics in enumerate(metrics_list):
            print(f"Fold {i+1} Metrics:")
            for metric, value in metrics.items():
                print(f"{metric}: {value}")
            print()
    return metrics_list

In [18]:
#-----------------------RF-------------------------------
print("Random Forest:")
print(f"\nSem standart scaler:")
metrics_list = custom_cross_val_metrics(RandomForestClassifier(), X, y, k=10,verbose=False,standarlization=False)
mean_metric_rf = calc_mean_metrics(metrics_list)

metrics_list = custom_cross_val_metrics(RandomForestClassifier(), X, y, k=10,verbose=False,standarlization=True)
print(f"\nCom standart scaler:")
mean_metric_rf = calc_mean_metrics(metrics_list)
print("-------------------------------------------\n")
#-----------------------KNN-------------------------------
print("KNN:")
metrics_list = custom_cross_val_metrics(KNeighborsClassifier(), X, y, k=10,verbose=False,standarlization=False)
print(f"\nSem: standart scaler:")
mean_metric_knn = calc_mean_metrics(metrics_list)

metrics_list = custom_cross_val_metrics(KNeighborsClassifier(), X, y, k=10,verbose=False,standarlization=True)
print(f"\nCom standart scaler:")
mean_metric_knn = calc_mean_metrics(metrics_list)
print("-------------------------------------------\n")
#-----------------------SVM-------------------------------
print("SVM:")
metrics_list = custom_cross_val_metrics(SVC(), X, y, k=10,verbose=False,standarlization=False)
print(f"\nSem: standart scaler:")
mean_metric_svm = calc_mean_metrics(metrics_list)

metrics_list = custom_cross_val_metrics(SVC(), X, y, k=10,verbose=False,standarlization=True)
print(f"\nCom standart scaler:")
mean_metric_svm = calc_mean_metrics(metrics_list)

Random Forest:

Sem standart scaler:
Mean Accuracy: 0.6211419753086421
Mean Precision: 0.6241428237606838
Mean Recall: 0.621141975308642
Mean F1-Score: 0.6219827177930701

Com standart scaler:
Mean Accuracy: 0.6211419753086421
Mean Precision: 0.6240226024722563
Mean Recall: 0.6211419753086421
Mean F1-Score: 0.6221182820443356
-------------------------------------------

KNN:

Sem: standart scaler:
Mean Accuracy: 0.4330246913580247
Mean Precision: 0.43700191613945183
Mean Recall: 0.4330246913580247
Mean F1-Score: 0.4296219874840469

Com standart scaler:
Mean Accuracy: 0.42330246913580255
Mean Precision: 0.4269590296692555
Mean Recall: 0.42330246913580255
Mean F1-Score: 0.38309769512962877
-------------------------------------------

SVM:

Sem: standart scaler:
Mean Accuracy: 0.6060185185185186
Mean Precision: 0.6080484895446808
Mean Recall: 0.6060185185185186
Mean F1-Score: 0.6064485225717109

Com standart scaler:
Mean Accuracy: 0.4625
Mean Precision: 0.5848290773817348
Mean Recall: 0.4

**Extra: Multinomial NB, especial para word count**

In [19]:
print("-------------------------------------------\n")
#-----------------------NB-------------------------------
print("Multinomial NB:")
from sklearn.naive_bayes import MultinomialNB

metrics_list = custom_cross_val_metrics(MultinomialNB(), X, y, k=10,verbose=False,standarlization=False)
print(f"\nSem: standart scaler:")
mean_metric_nb = calc_mean_metrics(metrics_list)

metrics_list = custom_cross_val_metrics(MultinomialNB(), X, y, k=10,verbose=False,standarlization=True)
print(f"\nCom standart scaler:")
mean_metric_nb = calc_mean_metrics(metrics_list)

print("-------------------------------------------\n")

-------------------------------------------

Multinomial NB:

Sem: standart scaler:
Mean Accuracy: 0.648611111111111
Mean Precision: 0.6643279689327117
Mean Recall: 0.648611111111111
Mean F1-Score: 0.652020288984039

Com standart scaler:
Mean Accuracy: 0.5365740740740741
Mean Precision: 0.5420906683255792
Mean Recall: 0.536574074074074
Mean F1-Score: 0.5380825756593248
-------------------------------------------

