In [81]:
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV


Data Loading and Preprocessing

In [3]:
angriness_df = pd.read_csv('data/angriness.csv')
happiness_df = pd.read_csv('data/happiness.csv')
sadness_df = pd.read_csv('data/sadness.csv')

In [5]:
angriness_df['label'] = 2
happiness_df['label'] = 1
sadness_df['label'] = 0

In [6]:
df = pd.concat([angriness_df, happiness_df, sadness_df], ignore_index=True)

In [12]:
df.shape

(2039, 3)

In [9]:
df.head()

Unnamed: 0,content,intensity,label
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angriness,2
1,Not available for busy people☺,angriness,2
2,I do not exist to impress the world. I exist t...,angriness,2
3,Everything is getting expensive except some pe...,angriness,2
4,My phone screen is brighter than my future 🙁,angriness,2


In [4]:
def preprocess_data(text):
    """Function to clean and preprocess the text data."""
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [10]:
df['content'] = df['content'].apply(preprocess_data)

In [13]:
X = df['content']
y = df['label']

Train/Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Engineering and feature selection

In [32]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [88]:
vectorizer.fit(X_train)

with open('app/model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

Model building

Logistic Regression Model

In [27]:
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_vec, y_train)

y_pred_logreg = logreg_model.predict(X_test_vec)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Classification Report (Logistic Regression):\n", classification_report(y_test, y_pred_logreg))
print("Confusion Matrix (Logistic Regression):\n", confusion_matrix(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.7941176470588235
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.86      0.71      0.77       119
           1       0.72      0.81      0.76       137
           2       0.83      0.85      0.84       152

    accuracy                           0.79       408
   macro avg       0.80      0.79      0.79       408
weighted avg       0.80      0.79      0.79       408

Confusion Matrix (Logistic Regression):
 [[ 84  26   9]
 [  8 111  18]
 [  6  17 129]]


Neural Network Model (MLPClassifier)

In [29]:
mlp_model = MLPClassifier(hidden_layer_sizes=(128,), max_iter=1000, random_state=42)
mlp_model.fit(X_train_vec, y_train)

y_pred_mlp = mlp_model.predict(X_test_vec)

print("MLP Classifier Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Classification Report (MLP):\n", classification_report(y_test, y_pred_mlp))
print("Confusion Matrix (MLP):\n", confusion_matrix(y_test, y_pred_mlp))

MLP Classifier Accuracy: 0.7450980392156863
Classification Report (MLP):
               precision    recall  f1-score   support

           0       0.79      0.71      0.74       119
           1       0.68      0.75      0.71       137
           2       0.79      0.77      0.78       152

    accuracy                           0.75       408
   macro avg       0.75      0.74      0.74       408
weighted avg       0.75      0.75      0.75       408

Confusion Matrix (MLP):
 [[ 84  23  12]
 [ 14 103  20]
 [  9  26 117]]


Random Forest Classifier Model

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vec, y_train)

y_pred_rf = rf_model.predict(X_test_vec)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 0.7818627450980392
Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.87      0.68      0.76       119
           1       0.66      0.88      0.75       137
           2       0.88      0.78      0.83       152

    accuracy                           0.78       408
   macro avg       0.80      0.78      0.78       408
weighted avg       0.80      0.78      0.78       408

Confusion Matrix (Random Forest):
 [[ 81  32   6]
 [  7 120  10]
 [  5  29 118]]


XG Boost

In [63]:
xgb_model = XGBClassifier(
    n_estimators=100,          
    max_depth=6,               
    learning_rate=0.1,         
    subsample=0.8,             
    colsample_bytree=0.8,      
    random_state=42,           
    use_label_encoder=False,   
    eval_metric='mlogloss'     
)

xgb_model.fit(X_train_vec, y_train)

y_pred_xgb = xgb_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7549019607843137
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.66      0.74       119
           1       0.68      0.85      0.76       137
           2       0.78      0.74      0.76       152

    accuracy                           0.75       408
   macro avg       0.77      0.75      0.75       408
weighted avg       0.77      0.75      0.75       408

Confusion Matrix:
 [[ 79  26  14]
 [  4 116  17]
 [ 11  28 113]]


Support Vector Machine

In [None]:
svm_model = SVC(
    C=1.0,             
    kernel='rbf',      
    gamma='scale',     
    probability=True,  
    random_state=42    
)

svm_model.fit(X_train_vec, y_train)

y_pred_svm = svm_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

Accuracy: 0.7892156862745098
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.65      0.74       119
           1       0.66      0.89      0.76       137
           2       0.90      0.81      0.85       152

    accuracy                           0.79       408
   macro avg       0.81      0.78      0.79       408
weighted avg       0.81      0.79      0.79       408

Confusion Matrix:
 [[ 77  37   5]
 [  7 122   8]
 [  4  25 123]]


Fine tuning

Hyperparameter Tuning

In [None]:
# Logistic Regression
logreg_model = LogisticRegression(max_iter=10000)

log_reg_params  = {
    'C': [0.1, 1, 10, 100],
    'solver': ['saga'], 
    'penalty': ['l2', 'elasticnet'],  
    'l1_ratio': [0.1, 0.5, 0.7, 1.0], 
    'max_iter': [1000, 2000, 3000]
}

LogisticRegression_grid_search = GridSearchCV(estimator=logreg_model, param_grid=log_reg_params , cv=3, n_jobs=-1, verbose=1)
LogisticRegression_grid_search.fit(X_train_vec, y_train)

print(f"Best Parameters: {LogisticRegression_grid_search.best_params_}")


Fitting 3 folds for each of 96 candidates, totalling 288 fits




Best Parameters: {'C': 1, 'l1_ratio': 0.7, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}




In [None]:
best_log_model = LogisticRegression_grid_search.best_estimator_

best_log_model.fit(X_train_vec, y_train)

y_pred_log = best_log_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_log)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_log)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_log)}")


Accuracy: 0.7941176470588235
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.71      0.77       119
           1       0.72      0.81      0.76       137
           2       0.83      0.85      0.84       152

    accuracy                           0.79       408
   macro avg       0.80      0.79      0.79       408
weighted avg       0.80      0.79      0.79       408

Confusion Matrix:
[[ 84  26   9]
 [  8 111  18]
 [  6  17 129]]




In [None]:
# MLP
mlp_model = MLPClassifier(random_state=42)

mlp_params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [200, 500]
}

mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=mlp_params, cv=3, n_jobs=-1, verbose=1)
mlp_grid_search.fit(X_train_vec, y_train)

print(f"Best Parameters: {mlp_grid_search.best_params_}")


Fitting 3 folds for each of 96 candidates, totalling 288 fits




Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.01, 'max_iter': 200, 'solver': 'sgd'}




In [75]:
best_mlp_model = mlp_grid_search.best_estimator_

best_mlp_model.fit(X_train_vec, y_train)

y_pred_mlp = best_mlp_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_mlp)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_mlp)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_mlp)}")

Accuracy: 0.8063725490196079
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.76      0.81       119
           1       0.76      0.78      0.77       137
           2       0.82      0.86      0.84       152

    accuracy                           0.81       408
   macro avg       0.81      0.80      0.80       408
weighted avg       0.81      0.81      0.81       408

Confusion Matrix:
[[ 91  19   9]
 [ 10 107  20]
 [  6  15 131]]




In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)

rf_params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 150]
}

rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_params, cv=3, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train_vec, y_train)

print(f"Best Parameters: {rf_grid_search.best_params_}")


Fitting 3 folds for each of 486 candidates, totalling 1458 fits
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total tim

In [None]:
best_rf_model = rf_grid_search.best_estimator_

best_rf_model.fit(X_train_vec, y_train)

y_pred_rf = best_rf_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_rf)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_rf)}")

Accuracy: 0.7916666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.73      0.78       119
           1       0.71      0.84      0.77       137
           2       0.85      0.80      0.82       152

    accuracy                           0.79       408
   macro avg       0.80      0.79      0.79       408
weighted avg       0.80      0.79      0.79       408

Confusion Matrix:
[[ 87  24   8]
 [  9 115  13]
 [  9  22 121]]


In [None]:
# XG Boost
xgb_model = XGBClassifier()

xgb_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_params, cv=3, n_jobs=-1, verbose=1)
xgb_grid_search.fit(X_train_vec, y_train)

print(f"Best Parameters: {xgb_grid_search.best_params_}")

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 150, 'subsample': 0.8}


In [77]:
best_xgb_model = xgb_grid_search.best_estimator_

best_xgb_model.fit(X_train_vec, y_train)

y_pred_xgb = best_xgb_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_xgb)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_xgb)}")

Accuracy: 0.7647058823529411
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76       119
           1       0.70      0.80      0.75       137
           2       0.78      0.79      0.79       152

    accuracy                           0.76       408
   macro avg       0.77      0.76      0.76       408
weighted avg       0.77      0.76      0.76       408

Confusion Matrix:
[[ 83  23  13]
 [  8 109  20]
 [  9  23 120]]


In [None]:
# SVM

svm_model = SVC()

svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 4],
    'coef0': [0, 1]
}

svm_grid_search = GridSearchCV(estimator=svm_model, param_grid=svm_params, cv=3, n_jobs=-1, verbose=2)
svm_grid_search.fit(X_train_vec, y_train)

print(f"Best Parameters: {svm_grid_search.best_params_}")


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END C=0.1, coef0=0, degree=3, gamma=scale, kernel=linear; total time=   0.7s
[CV] END C=0.1, coef0=0, degree=3, gamma=scale, kernel=linear; total time=   0.7s
[CV] END C=0.1, coef0=0, degree=3, gamma=auto, kernel=linear; total time=   0.7s
[CV] END C=0.1, coef0=0, degree=4, gamma=scale, kernel=linear; total time=   0.7s
[CV] END ..C=0.1, coef0=0, degree=3, gamma=scale, kernel=rbf; total time=   0.8s
[CV] END C=0.1, coef0=0, degree=3, gamma=scale, kernel=linear; total time=   0.8s
[CV] END C=0.1, coef0=0, degree=3, gamma=auto, kernel=linear; total time=   0.8s
[CV] END C=0.1, coef0=0, degree=3, gamma=auto, kernel=linear; total time=   0.8s
[CV] END ..C=0.1, coef0=0, degree=4, gamma=scale, kernel=rbf; total time=   0.8s
[CV] END ..C=0.1, coef0=0, degree=4, gamma=scale, kernel=rbf; total time=   0.8s
[CV] END C=0.1, coef0=0, degree=4, gamma=auto, kernel=linear; total time=   0.8s
[CV] END C=0.1, coef0=0, degree=4, gamma=sc

In [73]:
best_svm_model = svm_grid_search.best_estimator_

best_svm_model.fit(X_train_vec, y_train)

y_pred_svm = best_svm_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_svm)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_svm)}")

Accuracy: 0.7941176470588235
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.72      0.77       119
           1       0.73      0.77      0.75       137
           2       0.84      0.87      0.85       152

    accuracy                           0.79       408
   macro avg       0.80      0.79      0.79       408
weighted avg       0.80      0.79      0.79       408

Confusion Matrix:
[[ 86  25   8]
 [ 13 106  18]
 [  6  14 132]]


Save Best Model