## Sentiment analysis models (Machine Learning)

### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
import gensim.downloader as api
from datasets import Dataset

### Loading the data from csv file 

In [2]:
data = pd.read_csv("tweet_sentiment.csv")
data["cleaned_text"] = data["cleaned_text"].fillna("")

### Converting the text data into TF-IDF and Bag of Words vectors

In [3]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = tfidf_vectorizer.fit_transform(data['cleaned_text'])

print("TF-IDF shape:", tfidf_vectors.shape)

TF-IDF shape: (4869, 5000)


In [4]:
bow_vectorizer = CountVectorizer(max_features=5000)
bow_vectors = bow_vectorizer.fit_transform(data['cleaned_text'])

print("BoW shape:", bow_vectors.shape)


BoW shape: (4869, 5000)


In [5]:
X_tfidf = tfidf_vectors.toarray()
X_bow = bow_vectors.toarray()

y = data['label']

### Creating training, validation and test sets for model development and evaluation

In [6]:

X_trainval_tfidf, X_test_tfidf, y_trainval_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=2025)
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(X_trainval_tfidf, y_trainval_tfidf, test_size=0.2, stratify=y_trainval_tfidf, random_state=42)


In [7]:
X_trainval_bow, X_test_bow, y_trainval_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.2, stratify=y, random_state=2025)
X_train_bow, X_val_bow, y_train_bow, y_val_bow = train_test_split(X_trainval_bow, y_trainval_bow, test_size=0.2, stratify=y_trainval_bow, random_state=2025)

## Building the models

SVM model for TF-IDF vectors

In [12]:
svm_params = [
    {'C': 1, 'kernel': 'linear', 'gamma': 'scale'},
    {'C': 10, 'kernel': 'rbf', 'gamma': 'scale'},
    {'C': 0.1, 'kernel': 'linear', 'gamma': 'auto'}
]

best_svm1 = None
best_f1_svm1 = 0

for params in svm_params:
    model = SVC(**params)
    model.fit(X_train_tfidf, y_train_tfidf)
    preds = model.predict(X_val_tfidf)
    score = f1_score(y_val_tfidf, preds, average='weighted')
    if score > best_f1_svm1:
        best_f1_svm1 = score
        best_svm1 = model

print("Best SVM Model Validation F1:", best_f1_svm1)
print("SVM Test Results:\n", classification_report(y_test_tfidf, best_svm1.predict(X_test_tfidf)))
print(best_svm1.get_params())

Best SVM Model Validation F1: 0.6409101137431773
SVM Test Results:
               precision    recall  f1-score   support

          -1       0.68      0.57      0.62       291
           0       0.56      0.66      0.60       354
           1       0.69      0.66      0.67       329

    accuracy                           0.63       974
   macro avg       0.64      0.63      0.63       974
weighted avg       0.64      0.63      0.63       974

{'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


SVM model for BOW vectors

In [13]:
best_svm2 = None
best_f1_svm2 = 0

for params in svm_params:
    model = SVC(**params)
    model.fit(X_train_bow, y_train_bow)
    preds = model.predict(X_val_bow)
    score = f1_score(y_val_bow, preds, average='weighted')
    if score > best_f1_svm2:
        best_f1_svm2 = score
        best_svm2 = model

print("Best SVM Model Validation F1:", best_f1_svm2)
print("SVM Test Results:\n", classification_report(y_test_bow, best_svm2.predict(X_test_bow)))
print(best_svm2.get_params())

Best SVM Model Validation F1: 0.6392189044527785
SVM Test Results:
               precision    recall  f1-score   support

          -1       0.61      0.56      0.59       291
           0       0.56      0.63      0.59       354
           1       0.68      0.64      0.66       329

    accuracy                           0.61       974
   macro avg       0.62      0.61      0.61       974
weighted avg       0.62      0.61      0.62       974

{'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Naive Bayes model for TF-IDF vectors

In [8]:
nb_params = [0.1, 0.5, 1.0]
best_nb1 = None
best_f1_nb1 = 0

for alpha in nb_params:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train_tfidf, y_train_tfidf)
    preds = model.predict(X_val_tfidf)
    score = f1_score(y_val_tfidf, preds, average='weighted')
    if score > best_f1_nb1:
        best_f1_nb1 = score
        best_nb1 = model

print("Best NB Validation F1:", best_f1_nb1)
print("Naive Bayes Test Results:\n", classification_report(y_test_tfidf, best_nb1.predict(X_test_tfidf)))
print(best_nb1.get_params())

Best NB Validation F1: 0.6202973310687443
Naive Bayes Test Results:
               precision    recall  f1-score   support

          -1       0.67      0.49      0.57       291
           0       0.55      0.64      0.59       354
           1       0.63      0.67      0.65       329

    accuracy                           0.61       974
   macro avg       0.62      0.60      0.61       974
weighted avg       0.62      0.61      0.61       974

{'alpha': 0.5, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}


Naive Bayes model for BOW vectors

In [9]:
nb_params = [0.1, 0.5, 1.0]
best_nb2 = None
best_f1_nb2 = 0

for alpha in nb_params:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train_bow, y_train_bow)
    preds = model.predict(X_val_bow)
    score = f1_score(y_val_bow, preds, average='weighted')
    if score > best_f1_nb2:
        best_f1_nb2 = score
        best_nb2 = model

print("Best NB Validation F1:", best_f1_nb2)
print("Naive Bayes Test Results:\n", classification_report(y_test_bow, best_nb2.predict(X_test_bow)))
print(best_nb2.get_params())

Best NB Validation F1: 0.6320292029485731
Naive Bayes Test Results:
               precision    recall  f1-score   support

          -1       0.65      0.56      0.60       291
           0       0.56      0.60      0.58       354
           1       0.66      0.69      0.67       329

    accuracy                           0.62       974
   macro avg       0.63      0.62      0.62       974
weighted avg       0.62      0.62      0.62       974

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}


Random Forest for TF-IDF vectors

In [10]:
rf_params = [
    {'n_estimators': 100, 'max_depth': None},
    {'n_estimators': 200, 'max_depth': 10},
    {'n_estimators': 100, 'max_depth': 20}
]

best_rf1 = None
best_f1_rf1 = 0

for params in rf_params:
    model = RandomForestClassifier(random_state=2025, **params)
    model.fit(X_train_tfidf, y_train_tfidf)
    preds = model.predict(X_val_tfidf)
    score = f1_score(y_val_tfidf, preds, average='weighted')
    if score > best_f1_rf1:
        best_f1_rf1 = score
        best_rf1 = model

print("Best RF Validation F1:", best_f1_rf1)
print("Random Forest Test Results:\n", classification_report(y_test_tfidf, best_rf1.predict(X_test_tfidf)))
print(best_rf1.get_params())

Best RF Validation F1: 0.6236110243366157
Random Forest Test Results:
               precision    recall  f1-score   support

          -1       0.75      0.43      0.55       291
           0       0.52      0.79      0.62       354
           1       0.74      0.60      0.67       329

    accuracy                           0.62       974
   macro avg       0.67      0.61      0.61       974
weighted avg       0.66      0.62      0.61       974

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2025, 'verbose': 0, 'warm_start': False}


Random Forest for BOW vectors

In [11]:
rf_params = [
    {'n_estimators': 100, 'max_depth': None},
    {'n_estimators': 200, 'max_depth': 10},
    {'n_estimators': 100, 'max_depth': 20}
]

best_rf2 = None
best_f1_rf2 = 0

for params in rf_params:
    model = RandomForestClassifier(random_state=2025, **params)
    model.fit(X_train_bow, y_train_bow)
    preds = model.predict(X_val_bow)
    score = f1_score(y_val_bow, preds, average='weighted')
    if score > best_f1_rf2:
        best_f1_rf2 = score
        best_rf2 = model

print("Best RF Validation F1:", best_f1_rf2)
print("Random Forest Test Results:\n", classification_report(y_test_bow, best_rf2.predict(X_test_bow)))
print(best_rf2.get_params())

Best RF Validation F1: 0.6437552550750975
Random Forest Test Results:
               precision    recall  f1-score   support

          -1       0.73      0.49      0.59       291
           0       0.52      0.72      0.60       354
           1       0.71      0.63      0.67       329

    accuracy                           0.62       974
   macro avg       0.66      0.61      0.62       974
weighted avg       0.65      0.62      0.62       974

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2025, 'verbose': 0, 'warm_start': False}


### Creating a function to create results for a summary table

In [14]:
def get_metrics(y_true, y_pred, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1 Score': f1_score(y_true, y_pred, average='weighted')
    }

In [15]:
svm_pred1 = best_svm1.predict(X_test_tfidf)
rf_pred1 = best_rf1.predict(X_test_tfidf)
nb_pred1 = best_nb1.predict(X_test_tfidf)

svm_pred2 = best_svm2.predict(X_test_bow)
rf_pred2 = best_rf2.predict(X_test_bow)
nb_pred2 = best_nb2.predict(X_test_bow)

In [16]:
results = [
    get_metrics(y_test_tfidf, svm_pred1, "SVM with TF-IDF"),
    get_metrics(y_test_tfidf, rf_pred1, "Random Forest with TF-IDF"),
    get_metrics(y_test_tfidf, nb_pred1, "Naive Bayes with TF-IDF"),
    get_metrics(y_test_bow, svm_pred2, "SVM with BOW"),
    get_metrics(y_test_bow, rf_pred2, "Random Forest with BOW"),
    get_metrics(y_test_bow, nb_pred2, "Naive Bayes with BOW")
]

In [18]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVM with TF-IDF,0.631417,0.640146,0.631417,0.632379
1,Random Forest with TF-IDF,0.61807,0.661901,0.61807,0.614562
2,Naive Bayes with TF-IDF,0.607803,0.616069,0.607803,0.606162
3,SVM with BOW,0.61499,0.618498,0.61499,0.615568
4,Random Forest with BOW,0.62115,0.649708,0.62115,0.621738
5,Naive Bayes with BOW,0.620123,0.622373,0.620123,0.619829
