In [1]:
import pandas as pd

df = pd.read_csv('labeled_data.csv')
df.dropna(inplace = True)
df.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,Q34A,Q35A,Q36A,Q37A,Q38A,Q39A,Q40A,Q41A,Q42A,label
0,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,2.0,1.0,...,3.0,4.0,4.0,1.0,2.0,4.0,3.0,4.0,4.0,stress
1,4.0,1.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,2.0,...,2.0,2.0,3.0,4.0,2.0,2.0,1.0,2.0,2.0,stress
2,3.0,1.0,4.0,1.0,4.0,3.0,1.0,3.0,2.0,4.0,...,4.0,3.0,4.0,4.0,4.0,2.0,2.0,1.0,4.0,depression
3,2.0,3.0,2.0,1.0,3.0,3.0,4.0,2.0,3.0,3.0,...,4.0,1.0,1.0,2.0,1.0,3.0,4.0,4.0,2.0,anxiety
4,2.0,2.0,3.0,4.0,4.0,2.0,4.0,4.0,4.0,3.0,...,4.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,anxiety


In [2]:
df.label.value_counts()

stress        17897
anxiety       13594
depression     8284
Name: label, dtype: int64

In [3]:
X = df.drop(['label'], axis = 1) 
y = df.label 

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [5]:
#nb

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

algo = GaussianNB()
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

clf_nb = GridSearchCV(estimator=algo, param_grid=params_NB) 
clf_nb.fit(X_train, y_train)
clf_nb.best_params_

{'var_smoothing': 0.005336699231206307}

In [6]:
y_pred_nb = clf_nb.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
print('Classification Report:')
print(classification_report(y_test, y_pred_nb))

Accuracy: 0.7604022627278442
Confusion Matrix:
[[2043  134  520]
 [ 129 1269  262]
 [ 564  297 2737]]
Classification Report:
              precision    recall  f1-score   support

     anxiety       0.75      0.76      0.75      2697
  depression       0.75      0.76      0.76      1660
      stress       0.78      0.76      0.77      3598

    accuracy                           0.76      7955
   macro avg       0.76      0.76      0.76      7955
weighted avg       0.76      0.76      0.76      7955



In [8]:
import pickle

pickle.dump(clf_nb, open('model_nb.sav', 'wb'))

In [9]:
#knn

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

algo = KNeighborsClassifier(n_jobs=-1)
params = {'n_neighbors':[5,6,7,8,9,10],
          'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}

clf_knn = GridSearchCV(algo, param_grid=params, n_jobs=1)
clf_knn.fit(X_train,y_train)
clf_knn.best_params_

{'algorithm': 'brute',
 'leaf_size': 1,
 'n_jobs': -1,
 'n_neighbors': 10,
 'weights': 'uniform'}

In [10]:
y_pred_knn = clf_knn.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print('Classification Report:')
print(classification_report(y_test, y_pred_knn))

Accuracy: 0.7881835323695788
Confusion Matrix:
[[2149   64  484]
 [ 132 1193  335]
 [ 503  167 2928]]
Classification Report:
              precision    recall  f1-score   support

     anxiety       0.77      0.80      0.78      2697
  depression       0.84      0.72      0.77      1660
      stress       0.78      0.81      0.80      3598

    accuracy                           0.79      7955
   macro avg       0.80      0.78      0.79      7955
weighted avg       0.79      0.79      0.79      7955



In [12]:
import pickle

pickle.dump(clf_knn, open('model_knn.sav', 'wb'))

In [13]:
#dt

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

algo= DecisionTreeClassifier(random_state=1234)
params = {'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
          'random_state':[123]}

clf_dt = GridSearchCV(algo, param_grid=params, n_jobs=-1)
clf_dt.fit(X_train,y_train)
clf_dt.best_params_

{'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'random_state': 123}

In [14]:
y_pred_dt = clf_dt.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print('Classification Report:')
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.6927718416090509
Confusion Matrix:
[[1868  223  606]
 [ 262 1048  350]
 [ 646  357 2595]]
Classification Report:
              precision    recall  f1-score   support

     anxiety       0.67      0.69      0.68      2697
  depression       0.64      0.63      0.64      1660
      stress       0.73      0.72      0.73      3598

    accuracy                           0.69      7955
   macro avg       0.68      0.68      0.68      7955
weighted avg       0.69      0.69      0.69      7955



In [16]:
import pickle

pickle.dump(clf_dt, open('model_dt.sav', 'wb'))