In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Loading The Dataset

In [2]:
# Read the student stress survey CSV from the Kaggle input mount.
dataset_csv = '/kaggle/input/student-stress-factors-a-comprehensive-analysis/StressLevelDataset.csv'
std_data = pd.read_csv(dataset_csv)

# Leave the preview as the cell's last expression so it renders as a table.
std_data.head()

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,...,basic_needs,academic_performance,study_load,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level
0,14,20,0,11,2,1,2,4,2,3,...,2,3,2,3,3,2,3,3,2,1
1,15,8,1,15,5,3,1,4,3,1,...,2,1,4,1,5,1,4,5,5,2
2,12,18,1,14,2,1,2,2,2,2,...,2,2,3,3,2,2,3,2,2,1
3,16,12,1,15,4,3,1,3,4,2,...,2,2,4,1,4,1,4,4,5,2
4,16,28,0,7,2,3,5,1,3,2,...,3,4,3,1,2,1,5,0,5,1


# Exploratory Data Analysis

In [3]:
# Summarize the frame: index range, per-column non-null counts, and dtypes.
std_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   anxiety_level                 1100 non-null   int64
 1   self_esteem                   1100 non-null   int64
 2   mental_health_history         1100 non-null   int64
 3   depression                    1100 non-null   int64
 4   headache                      1100 non-null   int64
 5   blood_pressure                1100 non-null   int64
 6   sleep_quality                 1100 non-null   int64
 7   breathing_problem             1100 non-null   int64
 8   noise_level                   1100 non-null   int64
 9   living_conditions             1100 non-null   int64
 10  safety                        1100 non-null   int64
 11  basic_needs                   1100 non-null   int64
 12  academic_performance          1100 non-null   int64
 13  study_load                    110

In [4]:
# List every column name (features plus the `stress_level` target).
list(std_data.columns)

['anxiety_level',
 'self_esteem',
 'mental_health_history',
 'depression',
 'headache',
 'blood_pressure',
 'sleep_quality',
 'breathing_problem',
 'noise_level',
 'living_conditions',
 'safety',
 'basic_needs',
 'academic_performance',
 'study_load',
 'teacher_student_relationship',
 'future_career_concerns',
 'social_support',
 'peer_pressure',
 'extracurricular_activities',
 'bullying',
 'stress_level']

In [5]:
# Count missing values per column (`isna` is the same check as `isnull`).
std_data.isna().sum()

anxiety_level                   0
self_esteem                     0
mental_health_history           0
depression                      0
headache                        0
blood_pressure                  0
sleep_quality                   0
breathing_problem               0
noise_level                     0
living_conditions               0
safety                          0
basic_needs                     0
academic_performance            0
study_load                      0
teacher_student_relationship    0
future_career_concerns          0
social_support                  0
peer_pressure                   0
extracurricular_activities      0
bullying                        0
stress_level                    0
dtype: int64

# Splitting Data

In [6]:
# Pull out the target first, then keep every remaining column as a feature.
y = std_data['stress_level']
X = std_data.drop(columns=['stress_level'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


# Building Models

In [7]:
# Candidate models keyed by the display name used when reporting results.
# - The "KMeans" key must match the name the evaluation loop checks for, so the
#   clustering model is recognized as unsupervised (it was misspelled "KMean").
# - Tree-based models are seeded so repeated runs give the same scores.
# - n_init is pinned because the library default differs between versions.
models = {
    'SVM': SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # One cluster per distinct stress level.
    "KMeans": KMeans(n_clusters=len(y.unique()), n_init=10, random_state=43)
}

In [8]:
# Train every candidate model and report accuracy, confusion matrix, and a
# per-class report on the held-out test set.
#
# X, y, X_train, X_test, y_train and y_test come from the "Splitting Data" cell
# above; the identical split (test_size=0.2, random_state=42) is not recomputed.

# Candidate models keyed by display name. Tree-based models are seeded so that
# repeated runs give the same scores; n_init is pinned because the library
# default differs between versions.
models = {
    'SVM': SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # One cluster per distinct stress level.
    "KMeans": KMeans(n_clusters=len(y.unique()), n_init=10, random_state=43)
}


def map_clusters_to_labels(cluster_ids, true_labels):
    """Map each cluster id to the class label most frequent among its members.

    KMeans numbers its clusters arbitrarily, so cluster ids cannot be compared
    with class labels directly; this majority vote translates them.

    Parameters
    ----------
    cluster_ids : array-like
        Cluster id assigned to each sample.
    true_labels : array-like
        Known class label of each sample, in the same order as `cluster_ids`.

    Returns
    -------
    dict
        ``{cluster_id: majority_class_label}`` for every cluster id seen.
    """
    votes = pd.crosstab(np.asarray(cluster_ids), np.asarray(true_labels))
    return {int(cluster): label for cluster, label in votes.idxmax(axis=1).items()}


def fit_and_predict(model, X_train, y_train, X_test):
    """Fit `model` on the training split and return class predictions for `X_test`.

    Supervised estimators are fitted on (X_train, y_train). KMeans is fitted on
    X_train only; its cluster ids are then translated to class labels using the
    majority training label of each cluster, so the result is comparable with
    the true labels.
    """
    if isinstance(model, KMeans):
        model.fit(X_train)
        cluster_to_label = map_clusters_to_labels(model.labels_, y_train)
        # Fallback for a cluster that received no training samples.
        fallback = y_train.mode().iloc[0]
        test_clusters = model.predict(X_test)
        return np.array([cluster_to_label.get(int(c), fallback) for c in test_clusters])

    model.fit(X_train, y_train)
    return model.predict(X_test)


accuracies = {}

for model_name, model in models.items():
    y_pred = fit_and_predict(model, X_train, y_train, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy

    # accuracy is a fraction in [0, 1]; the `%` presentation type scales it by
    # 100, so 0.90 is shown as "90.00%" instead of the misleading "0.90%".
    if isinstance(model, KMeans):
        # For KMeans this is the accuracy after majority-vote label mapping.
        print(f"{model_name} Accuracy Score: {accuracy:.2%}")
    else:
        print(f"Accuracy Score of {model_name}: {accuracy:.2%}")
    print(f'{model_name} Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print(f"{model_name} Classification Report:\n", classification_report(y_test, y_pred))

    print("\n" + "="*50 + "\n")  # Separator for readability

# Compare accuracies of all models (shown as fractions)
print("Model Accuracies:")
for model_name, accuracy in accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")

Accuracy Score of SVM: 0.90%
SVM Confusion Matrix:
 [[67  3  6]
 [ 4 64  5]
 [ 1  4 66]]
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.88      0.91        76
           1       0.90      0.88      0.89        73
           2       0.86      0.93      0.89        71

    accuracy                           0.90       220
   macro avg       0.90      0.90      0.90       220
weighted avg       0.90      0.90      0.90       220



Accuracy Score of Random Forest: 0.88%
Random Forest Confusion Matrix:
 [[69  2  5]
 [ 6 63  4]
 [ 9  1 61]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86        76
           1       0.95      0.86      0.91        73
           2       0.87      0.86      0.87        71

    accuracy                           0.88       220
   macro avg       0.88      0.88      0.88       220
weighted avg       0.88    