In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn


from sklearn.naive_bayes import GaussianNB

In [312]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
import os

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [313]:
# Cut-off used to binarize Depression_Score: scores strictly greater than this
# value are labelled 1, everything else 0.
depression_threshold = 4

In [314]:
# Read the CSV and discard the 'Course' column in one chained expression.
data = pd.read_csv('../Data/clean_df_engineering.csv').drop(columns=['Course'])

# Name of the label column and the columns used as model inputs.
target = 'Depression_Score'
features = ['Age', 'CGPA', 'Anxiety_Score', 'Semester_Credit_Load']

In [315]:
# Binarize the target: 1 when Depression_Score is strictly above
# depression_threshold, 0 otherwise (missing scores compare False and map to 0).
# The labels are derived into `y` without overwriting the column in `data`, so
# this cell can be re-run safely; overwriting in place would turn every label
# into 0 on a second run because the values 0/1 never exceed the threshold of 4.
y = (data[target] > depression_threshold).astype('int64')

# Feature matrix restricted to the selected input columns.
X = data[features]

In [316]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [317]:
# Min-max normalization of the features. The scaler learns each feature's
# min/max from the training split only, and that same mapping is then applied
# to both splits so no information from the test rows influences the scaling.
min_max_scaler = MinMaxScaler().fit(X_train)
X_train = min_max_scaler.transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [318]:
def assessment(model, title = "Default"):
    """Fit ``model`` on the training split and print its test-split accuracy.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    title : label used in the printed message.

    Returns
    -------
    None. The accuracy, rounded to 5 decimals, is printed.
    """
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print('Accuracy of', title, ':', round(test_accuracy, 5))

# Gaussian Naive Bayes with default settings, evaluated on the hold-out split.
nb = GaussianNB()
assessment(model=nb, title="Naive Bayes")

Accuracy of Naive Bayes : 0.88015


K FOLD CROSS VALIDATION

In [319]:
from sklearn.model_selection import cross_val_score

# Re-read the CSV so this section starts from a fresh frame with the raw,
# un-thresholded scores, and discard the 'Course' column.
data = pd.read_csv('../Data/clean_df_engineering.csv').drop(columns=['Course'])

# Name of the label column and the columns used as model inputs.
target = 'Depression_Score'
features = ['Age', 'CGPA', 'Anxiety_Score', 'Semester_Credit_Load']

In [320]:
# Binarize Depression_Score in place: 1 (depression) when the score is strictly
# above the threshold, 0 (no depression) otherwise.
# NOTE: the column is overwritten, so this cell is not idempotent. Running it a
# second time without reloading `data` maps every label to 0.
depression_threshold = 4
data['Depression_Score'] = data['Depression_Score'].gt(depression_threshold).astype('int64')

In [321]:
# Feature matrix (selected input columns) and binarized label column.
X, y = data[features], data[target]

In [322]:
# Coerce every feature to a numeric dtype (entries that cannot be parsed become
# NaN), drop the rows that contain any NaN, then keep only the labels whose
# index survived so X and y stay aligned row for row.
X = X.apply(pd.to_numeric, errors='coerce').dropna()
y = y.loc[X.index]

In [323]:
# Initialize a fresh, unfitted Gaussian Naive Bayes model with default settings.
# This rebinds `nb`, replacing the instance fitted in the hold-out evaluation above.
nb = GaussianNB()

In [324]:
# 5-fold cross-validation of the Naive Bayes model over the full X / y, scored
# by accuracy; the result holds one score per fold.
# NOTE(review): per the scikit-learn docs, an integer `cv` with a classifier
# already produces stratified folds — confirm for the installed version.
cross_val_scores = cross_val_score(
    estimator=nb,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)

In [325]:
# Report the mean accuracy across folds followed by the individual fold scores.
# The fold count in the message is read from the scores array so that it always
# matches the `cv` value actually used (the scores come from cv=5, not 10).
print(f'Average Accuracy from {len(cross_val_scores)}-Fold Cross Validation: {round(np.mean(cross_val_scores), 5)}')
print(f'Fold Scores: {cross_val_scores}')

Average Accuracy from 10-Fold Cross Validation: 0.90722
Fold Scores: [0.90654206 0.90654206 0.91079812 0.90610329 0.90610329]


In [326]:
from sklearn.model_selection import GridSearchCV

# Candidate values for GaussianNB's `var_smoothing` hyperparameter.
# NOTE(review): the scikit-learn docs describe it as the portion of the largest
# feature variance that is added to all variances for numerical stability.
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# Exhaustive search over the grid: 5-fold CV, accuracy scoring, all CPU cores.
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# The search runs on X_train / y_train, i.e. the min-max-scaled hold-out split
# created earlier in the notebook, not on the X / y rebuilt for the K-fold section.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Score the selected model on the rows that were held out from the search.
y_pred = best_model.predict(X_test)
print("Test set accuracy with best hyperparameters:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best hyperparameters: {'var_smoothing': 1e-09}
Best cross-validation accuracy: 0.91625
Test set accuracy with best hyperparameters: 0.8801498127340824


STRATIFIED K FOLD CROSS VALIDATION

In [327]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
# Set up Stratified K-Fold cross-validation (5 splits, no shuffling).
skf = StratifiedKFold(n_splits=5)

# Define the parameter grid for hyperparameter tuning (var_smoothing).
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}

# Set up GridSearchCV with StratifiedKFold cross-validation.
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid,
                           scoring='accuracy', cv=skf, n_jobs=-1, verbose=1)

# Run the search on the full X / y: no rows are held out in this section.
grid_search.fit(X, y)

# Print the best hyperparameters and the best cross-validation score. The
# cross-validation score is the generalization estimate for this section.
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Get the best model from the grid search (GridSearchCV refits it on all of
# X / y by default).
best_model = grid_search.best_estimator_

# The model is scored on the same rows it was fitted on, so this number is a
# training-set accuracy and is labelled as such rather than as a test-set result.
y_pred = best_model.predict(X)
print("Training set accuracy with best hyperparameters:", accuracy_score(y, y_pred))



Fitting 5 folds for each of 7 candidates, totalling 35 fits


Best hyperparameters: {'var_smoothing': 1e-09}
Best cross-validation accuracy: 0.9072177613970427
Test set accuracy with best hyperparameters: 0.9072164948453608
