In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

In [2]:
# Load the preprocessed feature table.
# The 'Unnamed: 0' column is discarded right after reading
# (presumably an index column written out when the CSV was saved -- confirm).

df = (
    pd.read_csv('Preprocessed_Data/Final_df.csv')
      .drop(columns=['Unnamed: 0'])
)
df.head()

Unnamed: 0,score,thumbsUpCount,accurate,add,ai,also,always,amazing,answer,anything,...,voice,want,way,well,wonderful,work,would,wow,sentiment,text_length
0,5,0,0.0,0.0,0.540835,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6369,4
1,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4404,2
2,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7096,3
3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4215,1
4,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1372,9


# MODELLING

In [3]:
# Separate the target column from the feature columns

y = df['score'].sub(1)     # shift target labels from [1,2,3,4,5] to zero-based [0,1,2,3,4]
X = df.drop(columns='score')

print(f'Shape for X features set: {X.shape}')
print(f'Shape for y target column: {y.shape}')

Shape for X features set: (134416, 103)
Shape for y target column: (134416,)


In [4]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every model gets the same seed (42) and uses all available cores (n_jobs=-1).

models = {}
models['RandomForest'] = RandomForestClassifier(n_jobs=-1, random_state=42)
models['LogisticRegression'] = LogisticRegression(
    n_jobs=-1,
    random_state=42,
    max_iter=1000,
)
models['XGBoost'] = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False,
)

In [5]:
# Stratified K-fold splitter: shuffled, with a fixed seed so the folds are repeatable

k = 5  # number of folds

skf = StratifiedKFold(
    n_splits=k,
    random_state=42,
    shuffle=True,
)

In [6]:
# Storage for evaluation results: model name -> metric name -> list of per-fold scores

metrics = ['accuracy', 'precision', 'recall', 'f1', 'balanced_accuracy']

# Each (model, metric) pair gets its own fresh list, so scores are never shared.
model_metrics = dict(
    (name, dict((metric, []) for metric in metrics))
    for name in models
)


In [None]:
# Models training
#
# Stratified k-fold cross-validation: for every fold, each model in `models` is
# fitted on the training split and scored on the held-out split. One score per
# fold is appended to model_metrics[<model name>][<metric name>].

# Empty the score lists first so that re-running this cell (or restarting it
# after an interrupted run) never mixes stale or partial fold scores into the
# results. The lists are cleared in place, so `model_metrics` stays the same object.
for per_model_scores in model_metrics.values():
    for score_list in per_model_scores.values():
        score_list.clear()

for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    print(f'Fold {i+1} evaluating...')

    # skf.split yields positional indices, hence .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print('-'*30)

    for name, model in models.items():

        print(f'{name} model training...')

        # The same estimator object is refitted on every fold; after the loop,
        # `models` holds the estimators fitted on the last fold's training split.
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        fold_scores = model_metrics[name]

        fold_scores['accuracy'].append(accuracy_score(y_test, y_pred))
        # zero_division=0: a class the model never predicts has undefined
        # precision/F1. Scoring it as 0 explicitly keeps the same values as the
        # default while avoiding an UndefinedMetricWarning on every fold.
        fold_scores['precision'].append(
            precision_score(y_test, y_pred, average='weighted', zero_division=0)
        )
        fold_scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        fold_scores['f1'].append(
            f1_score(y_test, y_pred, average='weighted', zero_division=0)
        )
        fold_scores['balanced_accuracy'].append(balanced_accuracy_score(y_test, y_pred))

    print('-'*30)