<a href="https://colab.research.google.com/github/PradipaJavierFatah/Model_Comparison_for_Student_Mental_Health_Analysis/blob/main/Model_Comparison_for_Student_Mental_Health_Analysis_KNN_vs_Logistic_Regression_vs_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Model Comparison for Student Mental Health Analysis KNN vs Logistic Regression vs Naive Bayes**

## **1. KNN**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load the CSV data into a pandas DataFrame
data = pd.read_csv('/content/students_mental_health_survey.csv')

# Remove rows with any missing values
data = data.dropna()

# Handle missing values in categorical columns by replacing empty strings with 'Unknown'
categorical_columns = ['Course', 'Gender', 'Sleep_Quality', 'Physical_Activity', 'Diet_Quality', 'Social_Support', 'Relationship_Status', 'Substance_Use', 'Counseling_Service_Use', 'Family_History', 'Chronic_Illness', 'Extracurricular_Involvement', 'Residence_Type']

# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# Split the data into features and target variable
X = data.drop(columns=['Depression_Score'])
y = data['Depression_Score'] > 2  # Binary classification: 1 if Depression_Score > 2, else 0

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the SMOTE-transformed data into training (80%), validation (10%), and testing (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_smote, y_smote, test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)  # 0.5 x 0.2 = 0.1

# Print the sizes of the training, validation, and testing sets
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_val.shape[0]} samples')
print(f'Testing set size: {X_test.shape[0]} samples')

# Define the pipeline with preprocessing and KNN classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

# Define a hyperparameter grid for GridSearchCV
param_grid = {
    'classifier__n_neighbors': [3, 5, 7],  # Number of neighbors
    'classifier__weights': ['uniform', 'distance'],  # Weight function used in prediction
}

# Perform grid search with 3-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions and evaluate the model on the validation set
y_val_pred = grid_search.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f'Validation set accuracy: {accuracy_val:.2f}')
print('Validation set classification report:')
print(classification_report(y_val, y_val_pred))

# Make predictions and evaluate the model on the testing set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Testing set accuracy for KNN with SMOTE: {accuracy:.2f}')
print('Testing set classification report:')
print(classification_report(y_test, y_pred))


Training set size: 6225 samples
Validation set size: 778 samples
Testing set size: 779 samples
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Hyperparameters: {'classifier__n_neighbors': 7, 'classifier__weights': 'distance'}
Validation set accuracy: 0.56
Validation set classification report:
              precision    recall  f1-score   support

       False       0.60      0.50      0.54       404
        True       0.54      0.64      0.59       374

    accuracy                           0.56       778
   macro avg       0.57      0.57      0.56       778
weighted avg       0.57      0.56      0.56       778

Testing set accuracy for KNN with SMOTE: 0.59
Testing set classification report:
              precision    recall  f1-score   support

       False       0.59      0.55      0.57       390
        True       0.58      0.62      0.60       389

    accuracy                           0.59       779
   macro avg       0.59      0.59      0.58       779
weighted 

## 2. NAIVE BAYES

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the CSV data into a pandas DataFrame
data = pd.read_csv('/content/students_mental_health_survey.csv')

# Remove rows with any missing values
data = data.dropna()

# Handle missing values in categorical columns by replacing empty strings with 'Unknown'
categorical_columns = ['Course', 'Gender', 'Sleep_Quality', 'Physical_Activity', 'Diet_Quality', 'Social_Support', 'Relationship_Status', 'Substance_Use', 'Counseling_Service_Use', 'Family_History', 'Chronic_Illness', 'Extracurricular_Involvement', 'Residence_Type']

# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# Split the data into features and target variable
X = data.drop(columns=['Depression_Score'])
y = data['Depression_Score'] > 2  # Binary classification: 1 if Depression_Score > 2, else 0

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the data into training (80%), validation (10%), and testing (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_smote, y_smote, test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)  # 0.5 x 0.2 = 0.1

# Print the sizes of the training, validation, and testing sets
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_val.shape[0]} samples')
print(f'Testing set size: {X_test.shape[0]} samples')

# Define the pipeline with preprocessing and GaussianNB
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])

# Perform grid search with 3-fold cross-validation (no hyperparameters to tune for GaussianNB)
grid_search = GridSearchCV(pipeline, {}, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found (will be empty for GaussianNB)
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions and evaluate the model on the validation set
y_val_pred = grid_search.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f'Validation set accuracy for naive bayes: {accuracy_val:.2f}')
print('Validation set classification report:')
print(classification_report(y_val, y_val_pred))

# Make predictions and evaluate the model on the testing set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Testing set accuracy for naive bayes: {accuracy:.2f}')
print('Testing set classification report:')
print(classification_report(y_test, y_pred))


Training set size: 6225 samples
Validation set size: 778 samples
Testing set size: 779 samples
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Hyperparameters: {}
Validation set accuracy for naive bayes: 0.57
Validation set classification report:
              precision    recall  f1-score   support

       False       0.61      0.49      0.54       404
        True       0.54      0.66      0.59       374

    accuracy                           0.57       778
   macro avg       0.57      0.57      0.57       778
weighted avg       0.58      0.57      0.57       778

Testing set accuracy for naive bayes: 0.58
Testing set classification report:
              precision    recall  f1-score   support

       False       0.59      0.48      0.53       390
        True       0.56      0.67      0.61       389

    accuracy                           0.58       779
   macro avg       0.58      0.58      0.57       779
weighted avg       0.58      0.58      0.57       779



## 3. LOGISTIC REGRESSION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the CSV data into a pandas DataFrame
data = pd.read_csv('/content/students_mental_health_survey.csv')

# Remove rows with any missing values
data = data.dropna()

# Handle missing values in categorical columns by replacing empty strings with 'Unknown'
categorical_columns = ['Course', 'Gender', 'Sleep_Quality', 'Physical_Activity', 'Diet_Quality', 'Social_Support', 'Relationship_Status', 'Substance_Use', 'Counseling_Service_Use', 'Family_History', 'Chronic_Illness', 'Extracurricular_Involvement', 'Residence_Type']

# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# Split the data into features and target variable
X = data.drop(columns=['Depression_Score'])
y = data['Depression_Score'] > 2  # Binary classification: 1 if Depression_Score > 2, else 0

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the data into training (80%), validation (10%), and testing (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_smote, y_smote, test_size=0.5, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=0)  # 0.5 x 0.2 = 0.1

# Print the sizes of the training, validation, and testing sets
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_val.shape[0]} samples')
print(f'Testing set size: {X_test.shape[0]} samples')

# Define the pipeline with preprocessing and LogisticRegression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=0))
])

# Define a simplified hyperparameter grid for GridSearchCV
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],  # Regularization parameter
    'classifier__solver': ['liblinear', 'lbfgs']  # Optimization algorithm
}

# Perform grid search with 3-fold cross-validation (reduced from 5)
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions and evaluate the model on the validation set
y_val_pred = grid_search.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f'Validation set accuracy: {accuracy_val:.2f}')
print('Validation set classification report:')
print(classification_report(y_val, y_val_pred))

# Make predictions and evaluate the model on the testing set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Testing set accuracy: {accuracy:.2f}')
print('Testing set classification report:')
print(classification_report(y_test, y_pred))

Training set size: 3891 samples
Validation set size: 3112 samples
Testing set size: 779 samples
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Hyperparameters: {'classifier__C': 0.1, 'classifier__solver': 'liblinear'}
Validation set accuracy: 0.55
Validation set classification report:
              precision    recall  f1-score   support

       False       0.56      0.53      0.55      1571
        True       0.55      0.58      0.56      1541

    accuracy                           0.55      3112
   macro avg       0.55      0.55      0.55      3112
weighted avg       0.55      0.55      0.55      3112

Testing set accuracy: 0.59
Testing set classification report:
              precision    recall  f1-score   support

       False       0.61      0.57      0.59       407
        True       0.56      0.60      0.58       372

    accuracy                           0.59       779
   macro avg       0.59      0.59      0.59       779
weighted avg       0.59      0.59  