# Text Classification  -  Sentiment Analysis

- Apply Text Vectorization TfIdfVectorization and CounterVectorization
- Apply Calssification models to text vector to predict whether the rating is positive or nagative
- Select the best performing models 

##  Required packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

import itertools


from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

from time import time
from sklearn import metrics

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from IPython.core.display import display, HTML

# import customised plotting functions from plot_mtrics.py script
from plot_metrics import plot_metrics_and_time, plot_radar_metrics, plot_radar_mult
from plot_metrics import plot_radar_some, barplot_metric_mult, lineplot_metrics, stack_barplot

## Load data & Split into training and test data

In [2]:
# load the data
df = pd.read_csv("data_to_vect.csv" , index_col=0)

# display first lines of the data
display(df.head())

# check nan values
print(df.isna().sum())

print(df.info())

# the target is -1 if rating<3, 0 if rating==3 and 1 if rating>3
y = -1*(df["rating"]<3) + (df["rating"]>3)*1  

# explanatory variable : the stemmed comment
X = df["comment_stem"]

# Set the test size at 25% of the overall sample.
# Set stratify = y to preserve labels repartition on training and test data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, stratify = y)

Unnamed: 0,comment,rating,comment_cleaned,comment_lemmatized,comment_stem,stem_len
0,"Facilte, sollicitations dosées...qualité..on p...",4,facilte sollicitations dosées qualité pourrait...,facilte sollicitation doser qualité pouvoir ce...,facilt sollicit dos qualit pouvoir cepend amél...,558
1,Livraison impeccable avec possibilité de prend...,4,livraison impeccable possibilité prendre samed...,livraison impeccable possibilité prendre samed...,livraison impecc possibil prendr samed matin b...,461
2,Achat d'un canapé La Redoute intérieurs le 6/9...,4,achat canapé redoute intérieurs site redoute a...,achat canapé redoute intérieur site redoute ab...,achat canap redout intérieur sit redout abord ...,426
3,je commande depuis longtemps chez la redoute; ...,4,commande depuis longtemps redoute appréciais c...,commande depuis longtemps redoute appréciai ch...,command depuis longtemp redout appréci choix r...,370
4,"Comme d'habitude, aucun souci et rapidité d'ex...",5,habitude aucun souci rapidité expédition colis...,habitude aucun souci rapidité expédition colis...,habitud aucun souc rapid expédit colis bravo d...,366


comment               0
rating                0
comment_cleaned       0
comment_lemmatized    0
comment_stem          0
stem_len              0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 22683 entries, 0 to 23466
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   comment             22683 non-null  object
 1   rating              22683 non-null  int64 
 2   comment_cleaned     22683 non-null  object
 3   comment_lemmatized  22683 non-null  object
 4   comment_stem        22683 non-null  object
 5   stem_len            22683 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.2+ MB
None


## Preprocessing with TF-IDF
- TF-IDF
- RandomUnderSampling

In [3]:
#vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True)
tfvectorizer = TfidfVectorizer()

V_train = tfvectorizer.fit_transform(X_train).toarray()
V_test = tfvectorizer.transform(X_test).toarray()

# Undersampling to tackle classes imbalance
ru = RandomUnderSampler()
V_ru, y_ru = ru.fit_resample(V_train, y_train)

print("Shape of random undersampled train features:", V_ru.shape)
print("Shape of random undersampled train labels:", y_ru.shape)


Shape of random undersampled train features: (2757, 5120)
Shape of random undersampled train labels: (2757,)


## Preprocessing with CountVectorizer
- CountVectorizer
- RandomUnderSampling

In [4]:
#Vectorization
cvectorizer = CountVectorizer()
V_train_cvz = np.asarray(cvectorizer.fit_transform(X_train).todense())
V_test_cvz = np.asarray(cvectorizer.transform(X_test).todense())

# Undersampling to tackle classes imbalance
ru = RandomUnderSampler()
V_ru_cvz, y_ru_cvz = ru.fit_resample(V_train_cvz, y_train)

print("Shape of random undersampled train features:", V_ru_cvz.shape)
print("Shape of random undersampled train labels:", y_ru_cvz.shape)

Shape of random undersampled train features: (2757, 5120)
Shape of random undersampled train labels: (2757,)


##  Models training and Evaluation

### Function to fit and evaluate a model

In [5]:
# function to fit a model and calculate its metrics, time performance, classification report and confusion matrix

def fit_and_evaluate(model_name, model, encoder=0):
    """ function to fit a model and calculate its metrics and time performance
        return a dataFrame of metrics and time performance
        encoder = 0  => fit on X_train, y_train without undersampling 
        encoder = 1 => fit on random undersampled TfIdfVectorize train data
        encoder = 2 ==> fit on random undersampled CountVectorized train data
    """
    # Fit the model to the training data
    t0 = time()
    if encoder == 0:
        model.fit(X_train, y_train) # model fit directly on original X_train, y_train. To be used with pipeline
    elif encoder == 1:
        model.fit(V_ru, y_ru)  # model fit on the random under-sampled tf-idf vectorized train data
    elif encoder == 2:
        model.fit(V_ru_cvz, y_ru_cvz)  # model fit on the under-sampled CountVectorized train data
    else:
        raise ValueError("Admissible values for encoder are 0, 1 or 2")
    t1 = time()
    
    # Prediction on the tf-idf vectorized test data
    if encoder == 0: 
        prediction = model.predict(X_test) 
    elif encoder==1 :
        prediction = model.predict(V_test) 
    else:
        prediction = model.predict(V_test_cvz) 
   
    t2 = time()
    time_train = t1-t0
    time_predict = t2-t1

    # Evaluate model performance
    if encoder == 0: 
        accuracy = model.score(X_test, y_test)
    elif encoder == 1:
        accuracy = model.score(V_test, y_test)
    else:
        accuracy = model.score(V_test_cvz, y_test) 
    
    # The balanced accuracy in binary and multiclass classification problems 
    # to deal with imbalanced datasets.
    # It is defined as the average of recall obtained on each class.
    # The best value is 1 and the worst value is 0 when adjusted=False.
    balanced_accuracy = metrics.balanced_accuracy_score(y_test, prediction)   
    
    #f1_score = metrics.f1_score(y_test, prediction)
    f1_score_macro = metrics.f1_score(y_test, prediction, average="macro")
    f1_score_weighted = metrics.f1_score(y_test, prediction, average="weighted")
    
    #recall = metrics.recall_score(y_test, prediction, pos_label=0)
    recall_macro = metrics.recall_score(y_test, prediction, average="macro")
    recall_weighted = metrics.recall_score(y_test, prediction, average="weighted")
    
    # precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
    # we set pos_label=0 : that is negative rating (rating <=3)
    #precision = metrics.precision_score(y_test, prediction, pos_label=0)
    precision_macro = metrics.precision_score(y_test, prediction, average="macro" )
    precision_weighted = metrics.precision_score(y_test, prediction, average="weighted")

    cl_report = metrics.classification_report(y_test, prediction)
    cf_matrix = pd.crosstab(y_test, prediction, rownames = ["Real"], colnames=["Prediction"])

    # put metrics  into a dictionnary and into a DataFrame
    res = {"model": model_name, 
           "accuracy": accuracy, "balanced_accuracy": balanced_accuracy, 
           #"f1": f1_score, 
           "f1_macro": f1_score_macro, "f1_weighted": f1_score_weighted,
           #"recall_0": recall, 
           "recall_macro": recall_macro, "recall_weighted": recall_weighted,
           #"precision_0" : precision, 
           "precision_macro" : precision_macro, "precision_weighted" : precision_weighted,
            "time_train":time_train, "time_predict": time_predict}

    #DataFrame of model's performance
    df_p = pd.DataFrame(res, index=[0])

    #output model performance, classification report and confusion matrix
    return (df_p, cl_report, cf_matrix)
    

### Function to visualize models performance

In [6]:
# import customised plotting functions from plot_mtrics.py script
from plot_metrics import plot_metrics_and_time, plot_radar_metrics, plot_radar_mult
from plot_metrics import plot_radar_some, barplot_metric_mult, lineplot_metrics, stack_barplot

###  Models to train and evaluate
- Logistitics Regression
- Decision Trees
- Random Forest

- LinearSVC
- SVC
- MultinomialNB
- ComplementNB
- GaussianNB
  
- KNN
- Gradient Boost models

### Classification of text vectorize with TF-IDF

#### fit models and evaluate

In [7]:
# list of models (model name an model instance) to train and evaluate
models_list = [("Logistic_reg", LogisticRegression()),
               ("Random_forest", RandomForestClassifier()),
               ("LinearSVC", svm.LinearSVC()),
               ("SVC(kernel=rbf)", svm.SVC()),
               ("MultinomialNB", MultinomialNB()),
               ("GaussianNB", GaussianNB()),
               ("ComplementNB", ComplementNB()),
               ("KNN", KNeighborsClassifier()),
               ("Gradient Boosting Classifier", GradientBoostingClassifier())]


In [8]:
#train and evaluate the models after tf-idf transformation

list_of_models = []
cf_matrix = []
cl_report = []
df_perf = pd.DataFrame([])

for (model_name, model) in models_list:    
    (df0, clrep, cfmx)  = fit_and_evaluate(model_name, model, encoder=1)
    list_of_models.append(model_name)
    df_perf = pd.concat([df_perf, df0])
    cl_report.append(clrep)
    cf_matrix.append(cfmx)

df_perf = df_perf.reset_index().drop("index", axis = 1)   

# Save metrics into a csv_file
df_temp=df_perf.copy()
df_temp["model"] = df_temp["model"]+"_tfidf"
df_temp.to_csv("tfidf_clm_perf.csv")
del df_temp

#### Visualize and compare models performance

In [None]:
# Comparison of metrics scores
data = df_perf.iloc[:,:-2]  # to skip time_train and time_predict
plot_radar_mult(data)

# Compare metrics and time to train and time to predict
#plot_metrics_and_time(df_perf)


In [None]:
# Line plots of metrics for comparison
lineplot_metrics(data)


In [None]:
# Comparaison of Accuracy, recall_macro, precision_macro, recall_0, precision_0
for metric in ["accuracy", "recall_macro", "precision_macro"]:
    barplot_metric_mult(data, metric)

In [None]:
# Compare time_train and time_predict 
stack_barplot(df_perf, "time_train", "time_predict")

# Compare recall_macro and precision_macro
stack_barplot(df_perf, "recall_macro", "precision_macro")

In [9]:
#function to visualize confusion matrix side by side

def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions
    """
    output = ""
    combined = dict(zip(captions, dfs))
    for caption, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
        output += "\xa0\xa0\xa0"
    display(HTML(output))



In [10]:
# confusion matrix
#Display(confusion matrix)
display_side_by_side(cf_matrix, list_of_models)


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,221,133,12
0,106,174,26
1,339,604,4056

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,230,108,28
0,112,163,31
1,364,565,4070

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,213,128,25
0,117,157,32
1,354,540,4105

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,222,137,7
0,99,192,15
1,330,784,3885

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,229,120,17
0,120,160,26
1,374,585,4040

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,109,174,83
0,92,146,68
1,340,807,3852

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,237,99,30
0,128,136,42
1,348,327,4324

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,218,101,47
0,126,124,56
1,442,355,4202

Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,213,144,9
0,104,181,21
1,324,953,3722


In [12]:
for modele, cfm, cl_rep in zip(list_of_models, cf_matrix, cl_report):
    print("---------------{}----------------".format(modele))
    display(cfm)
    print(cl_rep)

---------------Logistic_reg----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,221,133,12
0,106,174,26
1,339,604,4056


              precision    recall  f1-score   support

          -1       0.33      0.60      0.43       366
           0       0.19      0.57      0.29       306
           1       0.99      0.81      0.89      4999

    accuracy                           0.78      5671
   macro avg       0.50      0.66      0.54      5671
weighted avg       0.91      0.78      0.83      5671

---------------Random_forest----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,230,108,28
0,112,163,31
1,364,565,4070


              precision    recall  f1-score   support

          -1       0.33      0.63      0.43       366
           0       0.19      0.53      0.29       306
           1       0.99      0.81      0.89      4999

    accuracy                           0.79      5671
   macro avg       0.50      0.66      0.54      5671
weighted avg       0.90      0.79      0.83      5671

---------------LinearSVC----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,213,128,25
0,117,157,32
1,354,540,4105


              precision    recall  f1-score   support

          -1       0.31      0.58      0.41       366
           0       0.19      0.51      0.28       306
           1       0.99      0.82      0.90      4999

    accuracy                           0.79      5671
   macro avg       0.50      0.64      0.53      5671
weighted avg       0.90      0.79      0.83      5671

---------------SVC(kernel=rbf)----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,222,137,7
0,99,192,15
1,330,784,3885


              precision    recall  f1-score   support

          -1       0.34      0.61      0.44       366
           0       0.17      0.63      0.27       306
           1       0.99      0.78      0.87      4999

    accuracy                           0.76      5671
   macro avg       0.50      0.67      0.53      5671
weighted avg       0.91      0.76      0.81      5671

---------------MultinomialNB----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,229,120,17
0,120,160,26
1,374,585,4040


              precision    recall  f1-score   support

          -1       0.32      0.63      0.42       366
           0       0.18      0.52      0.27       306
           1       0.99      0.81      0.89      4999

    accuracy                           0.78      5671
   macro avg       0.50      0.65      0.53      5671
weighted avg       0.90      0.78      0.83      5671

---------------GaussianNB----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,109,174,83
0,92,146,68
1,340,807,3852


              precision    recall  f1-score   support

          -1       0.20      0.30      0.24       366
           0       0.13      0.48      0.20       306
           1       0.96      0.77      0.86      4999

    accuracy                           0.72      5671
   macro avg       0.43      0.52      0.43      5671
weighted avg       0.87      0.72      0.78      5671

---------------ComplementNB----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,237,99,30
0,128,136,42
1,348,327,4324


              precision    recall  f1-score   support

          -1       0.33      0.65      0.44       366
           0       0.24      0.44      0.31       306
           1       0.98      0.86      0.92      4999

    accuracy                           0.83      5671
   macro avg       0.52      0.65      0.56      5671
weighted avg       0.90      0.83      0.86      5671

---------------KNN----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,218,101,47
0,126,124,56
1,442,355,4202


              precision    recall  f1-score   support

          -1       0.28      0.60      0.38       366
           0       0.21      0.41      0.28       306
           1       0.98      0.84      0.90      4999

    accuracy                           0.80      5671
   macro avg       0.49      0.61      0.52      5671
weighted avg       0.89      0.80      0.84      5671

---------------Gradient Boosting Classifier----------------


Prediction,-1,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,213,144,9
0,104,181,21
1,324,953,3722


              precision    recall  f1-score   support

          -1       0.33      0.58      0.42       366
           0       0.14      0.59      0.23       306
           1       0.99      0.74      0.85      4999

    accuracy                           0.73      5671
   macro avg       0.49      0.64      0.50      5671
weighted avg       0.90      0.73      0.79      5671



### V.2 - CounterVectorize

#### V.2.1 - Model fit and evaluation

In [None]:
models_list = [("Logistic_reg", LogisticRegression()),
               ("Random_forest", RandomForestClassifier()),
               ("LinearSVC", svm.LinearSVC()),
               ("SVC(kernel=rbf)", svm.SVC()),
               ("MultinomialNB", MultinomialNB()),
               ("GaussianNB", GaussianNB()),
               ("ComplementNB", ComplementNB()),
               ("KNN", KNeighborsClassifier()),
               ("Gradient Boosting Classifier", GradientBoostingClassifier())]

#fit and evaluate  models after CountVectorization transformation

list_of_models_cvz = []
cf_matrix_mx_cvz = []
cl_report_cvz = []
df_perf_cvz = pd.DataFrame([])

for (model_name, model) in models_list:    
    (df0, clrep, cfmx)  = fit_and_evaluate(model_name, model, encoder=2)
    list_of_models_cvz.append(model_name)
    df_perf_cvz = pd.concat([df_perf_cvz, df0])
    cl_report_cvz.append(clrep)
    cf_matrix_mx_cvz.append(cfmx)

df_perf_cvz = df_perf_cvz.reset_index().drop("index", axis = 1)

# Save metrics into a csv_file
df_temp=df_perf.copy()
df_temp["model"] = df_temp["model"] + "_cvz"
df_temp.to_csv("cvz_clm_perf.csv")
del df_temp

In [None]:
df_temp=df_perf.copy()
df_temp["model"] = df_temp["model"] + "_cvz"
df_temp.to_csv("cvz_mcl_perf.csv")
del df_temp

#### V.2.2 - Visualize models performance

In [None]:
# Comparison of metrics scores
data = df_perf.iloc[:,:-2]  # to skip time_train and time_predict
plot_radar_mult(data)

# Compare metrics and time to train and time to predict
#plot_metrics_and_time(df_perf)

# Line plots of metrics for comparison
lineplot_metrics(data)

# Comparaison of Accuracy, recall_macro, precision_macro, recall_0, precision_0
for metric in ["accuracy", "recall_macro", "precision_macro"]:
    barplot_metric_mult(data, metric)

# Compare time_train and time_prodict 
stack_barplot(df_perf, "time_train", "time_predict")

# Compare recall_macro and precision_macro
stack_barplot(df_perf, "recall_macro", "precision_macro")


### Find best hyperparameters with GridSearchCV for better metrics values

In [None]:
from imblearn.pipeline import make_pipeline

# Define piplelines : vectorization ==> Random Undersampling ==> Model
pipe_lsvc = make_pipeline(TfidfVectorizer(), RandomUnderSampler(), svm.LinearSVC())
pipe_lreg = make_pipeline(TfidfVectorizer(), RandomUnderSampler(), LogisticRegression())
pipe_mnnb = make_pipeline(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())

# parameter for the GridSearchCV
params_lsvc = {
            "tfidfvectorizer__min_df": [3, 4, 5],
            "tfidfvectorizer__max_df": [0.7, 0.8, 0.9, 1],
            "tfidfvectorizer__sublinear_tf": [True, False],
            "linearsvc__C": [0.01, 0.5, 1, 50]     
}

params_lreg = {
            "tfidfvectorizer__min_df": [3, 4, 5],
            "tfidfvectorizer__max_df": [0.7, 0.8, 0.9, 1],
            "tfidfvectorizer__sublinear_tf": [True, False],
            "logisticregression__C": [0.0001, 0.001, 0.01, 1, 10],
            "logisticregression__solver": ["lbfgs", "liblinear"]  
}

params_mnnb = {
            "tfidfvectorizer__min_df": [3, 4, 5],
            "tfidfvectorizer__max_df": [0.7, 0.8, 0.9, 1],
            "tfidfvectorizer__sublinear_tf": [True, False],
            "multinomialnb__alpha": [0.5, 0.8, 1]
}

grid_lsvc = GridSearchCV(estimator = pipe_lsvc, param_grid = params_lsvc, scoring = "recall_macro", cv = 5)
grid_lreg = GridSearchCV(estimator = pipe_lreg , param_grid = params_lreg, scoring = "recall_macro", cv = 5)
grid_mnnb = GridSearchCV(estimator = pipe_mnnb , param_grid = params_mnnb, scoring = "recall_macro", cv = 5)

# models to fit and evaluate
models_list = [("linear_svc", grid_lsvc), 
               ("logistic_regression", grid_lreg), 
               ("multinomialnb", grid_mnnb)]

#train and evaluate models in models_list
#train and evaluate the models after tf-idf transformation
list_of_models_pipe = []
cf_matrix_pipe = []
cl_report_pipe = []
df_pipe = pd.DataFrame([])

for (model_name, model) in models_list:    
    (df0, clrep, cfmx)  = fit_and_evaluate(model_name, model, encoder=0)
    list_of_models_pipe.append(model_name)
    df_pipe = pd.concat([df_pipe, df0])
    cl_report_pipe.append(clrep)
    cf_matrix_pipe.append(cfmx)

df_pipe = df_pipe.reset_index().drop("index", axis = 1)   

# Save metrics into a csv_file
df_temp=df_pipe.copy()
df_temp["model"] = df_temp["model"]+"_tfidf_grid_rcm"
df_temp.to_csv("grid_rcm_tfidf_mcl_perf.csv")
del df_temp

#### Models performance

In [None]:
# Comparison of metrics scores
data = df_pipe.iloc[:,:-2]  # to skip time_train and time_predict
plot_radar_mult(data)

# Compare metrics and time to train and time to predict
#plot_metrics_and_time(df_pipe)

# Line plots of metrics for comparison
lineplot_metrics(data)

# train performance
barplot_metric_mult(df_pipe, "time_train")
barplot_metric_mult(df_pipe, "time_predict")

# Comparaison of Accuracy, recall_macro, precision_macro, recall_0, precision_0
for metric in ["accuracy", "recall_macro", "precision_macro"]:
    barplot_metric_mult(data, metric,dtick=0.5)

# Compare time_train and time_prodict 
stack_barplot(df_pipe, "time_train", "time_predict")

# Compare recall_macro and precision_macro
stack_barplot(df_pipe, "recall_macro", "precision_macro")

### Another GridSearch

In [None]:
# GridSearch with scoring unspecified
grid_lsvc = GridSearchCV(estimator = pipe_lsvc, param_grid = params_lsvc, cv = 5)
grid_lreg = GridSearchCV(estimator = pipe_lreg , param_grid = params_lreg, cv = 5)
grid_mnnb = GridSearchCV(estimator = pipe_mnnb , param_grid = params_mnnb, cv = 5)

# models to fit and evaluate
models_list = [("linear_svc", grid_lsvc), 
               ("logisticregression", grid_lreg), 
               ("multinomialnb", grid_mnnb)]

#train and evaluate models in models_list
#train and evaluate the models after tf-idf transformation
list_of_models_pipe = []
cf_matrix_pipe_2 = []
cl_report_pipe_2 = []
df_pipe_2 = pd.DataFrame([])

for (model_name, model) in models_list:    
    (df0, clrep, cfmx)  = fit_and_evaluate(model_name, model, encoder=0)
    list_of_models_pipe.append(model_name)
    df_pipe_2 = pd.concat([df_pipe_2, df0])
    cl_report_pipe_2.append(clrep)
    cf_matrix_pipe_2.append(cfmx)

df_pipe_2 = df_pipe_2.reset_index().drop("index", axis = 1)   

# Save metrics into a csv_file
df_temp=df_pipe_2.copy()
df_temp["model"] = df_temp["model"]+"_tfidf_grid"
df_temp.to_csv("tfidf_grid_clm2_perf.csv")
del df_temp

#### Models performance

In [None]:
# Comparison of metrics scores
data = df_pipe_2.iloc[:,:-2]  # to skip time_train and time_predict
plot_radar_mult(data)

# Compare metrics and time to train and time to predict
#plot_metrics_and_time(df_pipe_2)

# Line plots of metrics for comparison
lineplot_metrics(data)

# train performance
barplot_metric_mult(df_pipe_2, "time_train")
barplot_metric_mult(df_pipe_2, "time_predict")

# Comparaison of Accuracy, recall_macro, precision_macro, recall_0, precision_0
for metric in ["accuracy", "recall_macro", "precision_macro"]:
    barplot_metric_mult(data, metric)

# Compare time_train and time_prodict 
stack_barplot(df_pipe_2, "time_train", "time_predict")

# Compare recall_macro and precision_macro
stack_barplot(df_pipe_2, "recall_macro", "precision_macro")