# Sentiment Modeling

## Imports

In [1]:
import os

import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt 

from scipy.sparse import save_npz, load_npz

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold

from sklearn.metrics import (
    classification_report, confusion_matrix, 
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import (
    MultinomialNB, ComplementNB, GaussianNB
)
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier
)

import sys
sys.path.append("../")
from src.feature_engineering import *
from src.utility import *

## Loading Config

## Loading data

In [2]:
df = load_pickle("../data/interim/IMDB_feature_engineered.pkl")
df.head()

Unnamed: 0,review,sentiment,review_charecters_len,review_word_len,has_html,cleaned_review,tokens,cleaned_review_charecter_len,cleaned_review_word_len,cleaned_review_has_html,positive_tokens,negative_tokens,positive_tokens_len,negative_tokens_len
0,One of the other reviewers has mentioned that ...,1,1377,320,True,one reviewer mentioned watching oz episode hoo...,"[one, reviewer, mentioned, watching, oz, episo...",931,162,False,"[right, right, trust, regard, classic, appeal,...","[struck, brutality, faint, timid, punch, priso...",13,20
1,A wonderful little production. <br /><br />The...,1,793,166,True,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...",557,84,False,"[wonderful, comforting, well, seamless, well, ...",[terribly],11,1
2,I thought this was a wonderful way to spend ti...,1,721,172,True,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",495,83,False,"[wonderful, hot, witty, likable, well, impress...","[plot, simplistic, killer, disappointed, risk,...",11,6
3,Basically there's a family where a little boy ...,0,569,141,True,basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, think, ...",362,62,False,"[like, well]","[zombie, slower, kill, ruin, meaningless, ignore]",2,6
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,1032,236,True,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...",725,123,False,"[love, stunning, vivid, success, stylishly, so...","[loneliness, anxiously]",15,2


In [3]:
# "review_charecters_len","review_word_len",
features = ["cleaned_review_charecter_len","cleaned_review_word_len","positive_tokens_len","negative_tokens_len"]
target = "sentiment"

In [4]:
countVectorized = load_npz("../data/interim/count_vectorized_reviwes.npz")
tfidfVectorized = load_npz("../data/interim/tfidf_vectorized_reviwes.npz")

In [5]:
countVectorized, tfidfVectorized

(<Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 5044360 stored elements and shape (50000, 20000)>,
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 5044360 stored elements and shape (50000, 20000)>)

## SSplitting the data

In [6]:
test_size = 0.2

In [7]:
df_X = df[features] 
y = df[target] 

In [8]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, y, test_size=test_size, stratify=y)

In [9]:
df_X_train.shape, df_X_test.shape, df_y_train.shape, df_y_test.shape

((40000, 4), (10000, 4), (40000,), (10000,))

In [10]:
count_X_train, count_X_test, count_y_train, count_y_test = train_test_split(countVectorized, y, test_size=test_size, stratify=y)

In [11]:
count_X_train.shape, count_X_test.shape, count_y_train.shape, count_y_test.shape

((40000, 20000), (10000, 20000), (40000,), (10000,))

In [12]:
tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidfVectorized, y, test_size=test_size, stratify=y)

In [13]:
tfidf_X_train.shape, tfidf_X_test.shape, tfidf_y_train.shape, tfidf_y_test.shape

((40000, 20000), (10000, 20000), (40000,), (10000,))

In [14]:
folds = 5

## Traning models on Numeric cols in df

In [15]:
df_models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000), 
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [16]:
results = []

for name, model in df_models.items():
    print(f"Training {name}...")
    
    model.fit(df_X_train, df_y_train)
    
    y_train_pred = model.predict(df_X_train)
    y_test_pred = model.predict(df_X_test)
    
    if hasattr(model, "predict_proba"):
        y_train_proba = model.predict_proba(df_X_train)[:, 1]
        y_test_proba = model.predict_proba(df_X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_train_proba = model.decision_function(df_X_train)
        y_test_proba = model.decision_function(df_X_test)
    else:
        y_train_proba = None
        y_test_proba = None

    model_result = {"Model": name}

    for dataset_type, y_true, y_pred, y_proba in [
        ('Train', df_y_train, y_train_pred, y_train_proba),
        ('Test', df_y_test, y_test_pred, y_test_proba)
    ]:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        roc_auc = roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan
        
        model_result[dataset_type + "_" + "Accuracy"] = accuracy
        model_result[dataset_type + "_" + "Precision"] = precision
        model_result[dataset_type + "_" + "Recall"] = recall
        model_result[dataset_type + "_" + "F1-Score"] = f1
        model_result[dataset_type + "_" + "ROC-AUC"] = roc_auc
        
    results.append(model_result)

print("\nEvaluation Complete.")
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(df_results, "../results/metrics/df_results.csv")
save_pickle(df_models, "../models/df_models.pkl")
df_results

Training LogisticRegression...
Training LinearSVC...
Training MultinomialNB...
Training ComplementNB...
Training RandomForestClassifier...
Training ExtraTreesClassifier...

Evaluation Complete.


Unnamed: 0,Model,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
3,ComplementNB,0.729125,0.718579,0.75325,0.735506,0.795237,0.7343,0.724727,0.7556,0.739841,0.797317
2,MultinomialNB,0.729125,0.718579,0.75325,0.735506,0.795237,0.7343,0.724727,0.7556,0.739841,0.797317
1,LinearSVC,0.732125,0.731189,0.73415,0.732666,0.797839,0.7357,0.737075,0.7328,0.734931,0.799777
0,LogisticRegression,0.73205,0.731287,0.7337,0.732491,0.7979,0.7355,0.736779,0.7328,0.734784,0.799848
4,RandomForestClassifier,0.995675,0.995403,0.99595,0.995676,0.99993,0.7065,0.711708,0.6942,0.702845,0.771588
5,ExtraTreesClassifier,0.995725,0.999798,0.99165,0.995708,0.999963,0.6878,0.692339,0.676,0.684072,0.751558


In [17]:
df_cv_models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000), 
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [18]:
scoring_metrics = [
    'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
]

cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_results = []

for name, model in df_cv_models.items():
    print(f"Running {folds}-Fold CV for {name}...")
    
    # Perform cross-validation
    scores = cross_validate(
        estimator=model,
        X=df_X, # Use ALL data for CV
        y=y,
        cv=cv_strategy, # 5 folds
        scoring=scoring_metrics,
        return_train_score=True, # Get train scores for overfitting check
        n_jobs=-1 # Use all cores for parallel processing
    )
    
    # Store the average results
    model_data = {
        "Model": name,
        "Fit_Time_sec": np.mean(scores['fit_time']),
        "Train_Accuracy": np.mean(scores['train_accuracy']),
        "Train_Precision": np.mean(scores['train_precision']),
        "Train_Recall": np.mean(scores['train_recall']),
        "Train_F1-Score": np.mean(scores['train_f1']),
        "Train_ROC-AUC": np.mean(scores['train_roc_auc']),
        "Test_Accuracy": np.mean(scores['test_accuracy']),
        "Test_Precision": np.mean(scores['test_precision']),
        "Test_Recall": np.mean(scores['test_recall']),
        "Test_F1-Score": np.mean(scores['test_f1']),
        "Test_ROC-AUC": np.mean(scores['test_roc_auc']),
    }
    cv_results.append(model_data)

df_cv_results = pd.DataFrame(cv_results)
print("\nCross-Validation Complete.")
df_cv_results = df_cv_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(df_cv_results, "../results/metrics/df_cv_results.csv")
save_pickle(df_cv_models, "../models/df_cv_models.pkl")
df_cv_results

Running 5-Fold CV for LogisticRegression...
Running 5-Fold CV for LinearSVC...
Running 5-Fold CV for MultinomialNB...
Running 5-Fold CV for ComplementNB...
Running 5-Fold CV for RandomForestClassifier...
Running 5-Fold CV for ExtraTreesClassifier...

Cross-Validation Complete.


Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
3,ComplementNB,0.037047,0.730098,0.719887,0.753316,0.736222,0.795684,0.73002,0.719767,0.75336,0.736172,0.795656
2,MultinomialNB,0.041717,0.730098,0.719887,0.753316,0.736222,0.795684,0.73002,0.719767,0.75336,0.736172,0.795656
0,LogisticRegression,0.532806,0.732591,0.732472,0.732849,0.73266,0.798258,0.73272,0.732596,0.733,0.732784,0.798209
1,LinearSVC,0.259542,0.732356,0.732468,0.732116,0.732291,0.798171,0.73258,0.732651,0.73244,0.73253,0.798123
4,RandomForestClassifier,17.891951,0.995296,0.995452,0.995138,0.995295,0.999922,0.69966,0.705607,0.68524,0.695261,0.767708
5,ExtraTreesClassifier,12.96483,0.995358,0.999807,0.990907,0.995337,0.999957,0.6861,0.690795,0.67384,0.682178,0.747613


## Traning models on count vectors

In [19]:
count_models = {
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000),  
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [20]:
results = []

for name, model in count_models.items():
    print(f"Training {name}...")
    
    model.fit(count_X_train, count_y_train)
    
    y_train_pred = model.predict(count_X_train)
    y_test_pred = model.predict(count_X_test)
    
    if hasattr(model, "predict_proba"):
        y_train_proba = model.predict_proba(count_X_train)[:, 1]
        y_test_proba = model.predict_proba(count_X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_train_proba = model.decision_function(count_X_train)
        y_test_proba = model.decision_function(count_X_test)
    else:
        y_train_proba = None
        y_test_proba = None

    model_result = {"Model": name}

    for dataset_type, y_true, y_pred, y_proba in [
        ('Train', count_y_train, y_train_pred, y_train_proba),
        ('Test', count_y_test, y_test_pred, y_test_proba)
    ]:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        roc_auc = roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan
        
        model_result[dataset_type + "_" + "Accuracy"] = accuracy
        model_result[dataset_type + "_" + "Precision"] = precision
        model_result[dataset_type + "_" + "Recall"] = recall
        model_result[dataset_type + "_" + "F1-Score"] = f1
        model_result[dataset_type + "_" + "ROC-AUC"] = roc_auc
        
    results.append(model_result)

print("\nEvaluation Complete.")
count_results = pd.DataFrame(results)
count_results = count_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(count_results, "../results/metrics/count_results.csv")
save_pickle(count_models, "../models/count_models.pkl")
count_results

Training LinearSVC...
Training LogisticRegression...
Training MultinomialNB...
Training ComplementNB...
Training RandomForestClassifier...
Training ExtraTreesClassifier...

Evaluation Complete.


Unnamed: 0,Model,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
1,LogisticRegression,0.9971,0.996752,0.99745,0.997101,0.999929,0.8803,0.878559,0.8826,0.880575,0.947454
5,ExtraTreesClassifier,1.0,1.0,1.0,1.0,1.0,0.8756,0.8753,0.876,0.87565,0.945764
3,ComplementNB,0.881475,0.879483,0.8841,0.881785,0.943762,0.8699,0.865586,0.8758,0.870663,0.93201
2,MultinomialNB,0.881475,0.879483,0.8841,0.881785,0.943761,0.8699,0.865586,0.8758,0.870663,0.932028
0,LinearSVC,1.0,1.0,1.0,1.0,1.0,0.8626,0.86289,0.8622,0.862545,0.93183
4,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0,0.8617,0.857482,0.8676,0.862511,0.932952


In [21]:
count_cv_models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000), 
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [22]:
scoring_metrics = [
    'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
]

cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_results = []

for name, model in count_cv_models.items():
    print(f"Running {folds}-Fold CV for {name}...")
    
    # Perform cross-validation
    scores = cross_validate(
        estimator=model,
        X=countVectorized, # Use ALL data for CV
        y=y,
        cv=cv_strategy, # 5 folds
        scoring=scoring_metrics,
        return_train_score=True, # Get train scores for overfitting check
        n_jobs=-1 # Use all cores for parallel processing
    )
    
    # Store the average results
    model_data = {
        "Model": name,
        "Fit_Time_sec": np.mean(scores['fit_time']),
        "Train_Accuracy": np.mean(scores['train_accuracy']),
        "Train_Precision": np.mean(scores['train_precision']),
        "Train_Recall": np.mean(scores['train_recall']),
        "Train_F1-Score": np.mean(scores['train_f1']),
        "Train_ROC-AUC": np.mean(scores['train_roc_auc']),
        "Test_Accuracy": np.mean(scores['test_accuracy']),
        "Test_Precision": np.mean(scores['test_precision']),
        "Test_Recall": np.mean(scores['test_recall']),
        "Test_F1-Score": np.mean(scores['test_f1']),
        "Test_ROC-AUC": np.mean(scores['test_roc_auc']),
    }
    cv_results.append(model_data)

count_cv_results = pd.DataFrame(cv_results)
print("\nCross-Validation Complete.")
count_cv_results = count_cv_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(count_cv_results, "../results/metrics/count_cv_results.csv")
save_pickle(count_cv_models, "../models/count_cv_models.pkl")
count_cv_results

Running 5-Fold CV for LogisticRegression...
Running 5-Fold CV for LinearSVC...
Running 5-Fold CV for MultinomialNB...
Running 5-Fold CV for ComplementNB...
Running 5-Fold CV for RandomForestClassifier...
Running 5-Fold CV for ExtraTreesClassifier...

Cross-Validation Complete.


Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
0,LogisticRegression,17.192942,0.996071,0.995521,0.996627,0.996073,0.999864,0.88134,0.878941,0.8846,0.881722,0.946948
5,ExtraTreesClassifier,651.651123,1.0,1.0,1.0,1.0,1.0,0.87492,0.876682,0.8726,0.874625,0.94394
3,ComplementNB,0.263666,0.880358,0.877934,0.883564,0.88074,0.942488,0.86554,0.862454,0.8698,0.866087,0.92963
2,MultinomialNB,0.247663,0.880358,0.877934,0.883564,0.88074,0.942489,0.86554,0.862454,0.8698,0.866087,0.92963
1,LinearSVC,198.415673,1.0,1.0,1.0,1.0,1.0,0.86008,0.859869,0.86044,0.860115,0.92978
4,RandomForestClassifier,477.51805,1.0,1.0,1.0,1.0,1.0,0.856,0.854219,0.85856,0.856368,0.931241


# Traning models on count vectors

In [23]:
tfidf_models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000), 
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [27]:
results = []

for name, model in tfidf_models.items():
    print(f"Training {name}...")
    
    model.fit(tfidf_X_train, tfidf_y_train)
    
    y_train_pred = model.predict(tfidf_X_train)
    y_test_pred = model.predict(tfidf_X_test)
    
    if hasattr(model, "predict_proba"):
        y_train_proba = model.predict_proba(tfidf_X_train)[:, 1]
        y_test_proba = model.predict_proba(tfidf_X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_train_proba = model.decision_function(tfidf_X_train)
        y_test_proba = model.decision_function(tfidf_X_test)
    else:
        y_train_proba = None
        y_test_proba = None

    model_result = {"Model": name}

    for dataset_type, y_true, y_pred, y_proba in [
        ('Train', tfidf_y_train, y_train_pred, y_train_proba),
        ('Test', tfidf_y_test, y_test_pred, y_test_proba)
    ]:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        roc_auc = roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan
        
        model_result[dataset_type + "_" + "Accuracy"] = accuracy
        model_result[dataset_type + "_" + "Precision"] = precision
        model_result[dataset_type + "_" + "Recall"] = recall
        model_result[dataset_type + "_" + "F1-Score"] = f1
        model_result[dataset_type + "_" + "ROC-AUC"] = roc_auc
        
    results.append(model_result)

print("\nEvaluation Complete.")
tfidf_results = pd.DataFrame(results)
tfidf_results = tfidf_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(tfidf_results, "../results/metrics/tfidf_results.csv")
save_pickle(tfidf_models, "../models/tfidf_models.pkl")
tfidf_results

Training LogisticRegression...
Training LinearSVC...
Training MultinomialNB...
Training ComplementNB...
Training RandomForestClassifier...
Training ExtraTreesClassifier...

Evaluation Complete.


Unnamed: 0,Model,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
0,LogisticRegression,0.92745,0.919521,0.9369,0.928129,0.979258,0.9031,0.894964,0.9134,0.904088,0.96459
1,LinearSVC,0.9816,0.979825,0.98345,0.981634,0.998145,0.8974,0.89145,0.905,0.898174,0.961497
2,MultinomialNB,0.894175,0.885181,0.90585,0.895396,0.959201,0.8751,0.862556,0.8924,0.877224,0.9468
3,ComplementNB,0.894175,0.885181,0.90585,0.895396,0.959201,0.8751,0.862556,0.8924,0.877224,0.9468
5,ExtraTreesClassifier,1.0,1.0,1.0,1.0,1.0,0.8756,0.874253,0.8774,0.875824,0.944765
4,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0,0.8617,0.861772,0.8616,0.861686,0.936078


In [28]:
tfidf_cv_models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000), 
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000), 
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1), 
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

In [29]:
scoring_metrics = [
    'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
]

cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_results = []

for name, model in tfidf_cv_models.items():
    print(f"Running {folds}-Fold CV for {name}...")
    
    # Perform cross-validation
    scores = cross_validate(
        estimator=model,
        X=tfidfVectorized, # Use ALL data for CV
        y=y,
        cv=cv_strategy, # 5 folds
        scoring=scoring_metrics,
        return_train_score=True, # Get train scores for overfitting check
        n_jobs=-1 # Use all cores for parallel processing
    )
    
    # Store the average results
    model_data = {
        "Model": name,
        "Fit_Time_sec": np.mean(scores['fit_time']),
        "Train_Accuracy": np.mean(scores['train_accuracy']),
        "Train_Precision": np.mean(scores['train_precision']),
        "Train_Recall": np.mean(scores['train_recall']),
        "Train_F1-Score": np.mean(scores['train_f1']),
        "Train_ROC-AUC": np.mean(scores['train_roc_auc']),
        "Test_Accuracy": np.mean(scores['test_accuracy']),
        "Test_Precision": np.mean(scores['test_precision']),
        "Test_Recall": np.mean(scores['test_recall']),
        "Test_F1-Score": np.mean(scores['test_f1']),
        "Test_ROC-AUC": np.mean(scores['test_roc_auc']),
    }
    cv_results.append(model_data)

tfidf_cv_results = pd.DataFrame(cv_results)
print("\nCross-Validation Complete.")
tfidf_cv_results = tfidf_cv_results.sort_values(by='Test_F1-Score', ascending=False)
save_data_csv(tfidf_cv_results, "../results/metrics/tfidf_cv_results.csv")
save_pickle(tfidf_cv_models, "../models/tfidf_cv_models.pkl")
tfidf_cv_results

Running 5-Fold CV for LogisticRegression...
Running 5-Fold CV for LinearSVC...
Running 5-Fold CV for MultinomialNB...
Running 5-Fold CV for ComplementNB...
Running 5-Fold CV for RandomForestClassifier...
Running 5-Fold CV for ExtraTreesClassifier...

Cross-Validation Complete.


Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1-Score,Train_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score,Test_ROC-AUC
0,LogisticRegression,1.9172,0.930924,0.924049,0.939031,0.93148,0.980964,0.8988,0.890782,0.90908,0.899826,0.962854
1,LinearSVC,5.855544,0.979162,0.976867,0.981569,0.979212,0.997744,0.89308,0.888821,0.8986,0.893658,0.959203
2,MultinomialNB,0.262634,0.892602,0.883037,0.905089,0.893927,0.958285,0.87484,0.864539,0.889,0.876574,0.945481
3,ComplementNB,0.241003,0.892602,0.883037,0.905089,0.893927,0.958285,0.87484,0.864539,0.889,0.876574,0.945481
5,ExtraTreesClassifier,663.145445,1.0,1.0,1.0,1.0,1.0,0.86846,0.867887,0.86928,0.868557,0.940811
4,RandomForestClassifier,444.101484,1.0,1.0,1.0,1.0,1.0,0.8556,0.855997,0.85508,0.855525,0.932736
