In [521]:

import pandas as pd            
import streamlit as st
import numpy as np               
import matplotlib.pyplot as plt  
import seaborn as sns            
from sklearn import datasets                              
from sklearn.svm import SVC  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import joblib                                            
from flask import Flask, request, jsonify               


In [None]:
# Read the raw survey CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Student Depression Dataset.csv')
df

In [None]:
# Show pandas' summary statistics for the freshly loaded frame.
df.describe()


In [None]:
# (rows, columns) of the raw frame.
df.shape


In [None]:
# Count missing values per column in the raw frame.
df.isna().sum()


In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()


In [None]:
# Histogram of the Gender column.
gender_values = df['Gender']
gender_values.hist()


In [None]:
# Bar chart with one bar per Gender value, sized by row count.
sns.countplot(data=df, x='Gender')

In [529]:
# Remove the identifier and the attributes that are not used further in this notebook.
# NOTE(review): this overwrites `df`, so re-running the cell raises KeyError
# because the columns are already gone.
unused_columns = ['id', 'Age', 'Degree', 'Profession', 'Work Pressure', 'City']
df = df.drop(unused_columns, axis=1)

In [None]:
# Re-inspect columns and dtypes after the column drop.
df.info()

In [None]:
# Average of the 'Financial Stress' column (pandas skips NaN entries by default).
mean = df.loc[:, 'Financial Stress'].mean()
print("Mean:", mean)

In [532]:
# Fill missing 'Financial Stress' entries with the column average.
# NOTE(review): the average is taken over the whole dataset, i.e. before the
# train/test split, so test rows influence the imputed value.
stress = df['Financial Stress']
df['Financial Stress'] = stress.fillna(stress.mean())

In [None]:
# Re-count missing values per column after the imputation step.
df.isna().sum()

In [None]:



# One-hot encode the two multi-category columns: each category becomes its own
# indicator column and the source columns are removed.
categorical_columns = ['Sleep Duration', 'Dietary Habits']
df = pd.get_dummies(data=df, columns=categorical_columns)

df

In [None]:

# Columns holding exactly two categories, encoded below as 0/1 integers.
binary_columns = ['Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Gender']

# Category -> integer code tables. 'Gender' uses its own labels; the other
# columns are Yes/No answers.
gender_codes = {'Male': 1, 'Female': 0}
yes_no_codes = {'Yes': 1, 'No': 0}

for col in binary_columns:
    codes = gender_codes if col == "Gender" else yes_no_codes

    # Series.map() turns every value that is not a key of `codes` into NaN.
    # Re-running this cell on an already-encoded column would therefore wipe
    # it out, so a column that already holds only 0/1 is left untouched.
    if df[col].dropna().isin(list(codes.values())).all():
        continue

    encoded = df[col].map(codes)

    # Fail loudly on unexpected categories instead of silently introducing NaN.
    unmapped = encoded.isna() & df[col].notna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(f"Column {col!r} contains values with no encoding: {unexpected}")

    df[col] = encoded


# Rich display of the first rows to confirm the encoding.
df.head()


In [None]:
# Display the frame after the binary encoding step.
df

In [None]:
# Confirm the dtypes of the encoded columns before building the model inputs.
df.info()

In [538]:

# Feature matrix: every column except the target, forced to a single float dtype.
# Without an explicit dtype, pandas returns an object array for a frame that
# mixes bool indicator columns (from get_dummies) with int/float columns.
# Requesting float also fails fast if a non-numeric column slipped through.
X = df.drop(columns=['Depression']).to_numpy(dtype=float)
y = df['Depression'].to_numpy()  # Target variable

In [539]:
    # Train-test split
# 80/20 hold-out; the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [540]:
# Candidate classifiers to compare, keyed by the name used in the printed report.
# Every estimator that accepts a seed receives the same one.
SEED = 42
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=SEED)),
    ("Decision Tree", DecisionTreeClassifier(random_state=SEED)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=SEED)),
    ("Support Vector Machine", SVC(random_state=SEED)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=SEED)),
])


In [None]:

# Fit each candidate on the training split and print its hold-out metrics.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Performance:")
    print(report)
    print(f"Accuracy: {accuracy:.2f}")
    print("-" * 50)

In [None]:
# Hyper-parameter search for the gradient-boosting classifier.
model = GradientBoostingClassifier(random_state=0)

# Candidate values; the search tries every combination (2**5 = 32 settings).
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[3, 5],
    min_samples_leaf=[5, 7],
    learning_rate=[0.1, 0.05],
    max_features=[0.5, 1.0],
    loss=['log_loss'],
)

# 3-fold cross-validation scored by accuracy, with up to 4 parallel jobs.
gs_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=4,
    verbose=50,
)
gs_cv.fit(X_train, y_train)

print("Best Parameters from GridSearchCV:", gs_cv.best_params_)

# Score the winning estimator on both splits to gauge over-fitting.
best_model = gs_cv.best_estimator_
train_predictions = best_model.predict(X_train)
test_predictions = best_model.predict(X_test)
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Set Accuracy: {train_accuracy:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")


In [None]:

# Persist the tuned model under its own file name so that it is not
# overwritten by the reduced-feature model saved later in the notebook.
#
# GridSearchCV was built with its default refit=True, so best_estimator_ has
# already been refit on the whole of X_train with the winning parameters;
# fitting it again here would only repeat that work.
best_model = gs_cv.best_estimator_

joblib.dump(best_model, 'Student_Depression_Model.pkl')


In [None]:
# Feature labels (excluding 'Depression'). They must be listed in the same
# order as the columns of the matrix the saved model was trained on.
feature_labels = np.array([
    'Gender', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
    'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress',
    'Family History of Mental Illness', 'Sleep Duration_5-6 hours',
    'Sleep Duration_7-8 hours', 'Sleep Duration_Less than 5 hours',
    'Sleep Duration_More than 8 hours', 'Sleep Duration_Others',
    'Dietary Habits_Healthy', 'Dietary Habits_Moderate',
    'Dietary Habits_Others', 'Dietary Habits_Unhealthy'
])

# Load the model persisted by the grid-search step.
model = joblib.load('Student_Depression_Model.pkl')

# One importance score per input feature of the fitted model.
importance = model.feature_importances_

# A label list that is out of step with the training matrix would mislabel
# every row of the ranking (or raise IndexError part-way through), so check first.
if len(feature_labels) != len(importance):
    raise ValueError(
        f"{len(feature_labels)} feature labels given but the model has "
        f"{len(importance)} features"
    )

# Indices of the features, most important first.
feature_indexes_by_importance = importance.argsort()[::-1]

print("Feature Importance Rankings:")
for index in feature_indexes_by_importance:
    # The scores are fractions of the total, so the '%' format spec is used to
    # scale them by 100; printing the raw fraction followed by '%' understated
    # every value by a factor of 100.
    print(f"{feature_labels[index]} - {importance[index]:.2%}")

In [545]:
# Shrink the frame to a reduced feature set by removing the columns below.
# NOTE(review): presumably chosen from the importance ranking - confirm.
# This overwrites `df`, so re-running the cell raises KeyError.
columns_to_drop = [
    'Gender', 'CGPA',
    'Sleep Duration_Less than 5 hours', 'Sleep Duration_More than 8 hours',
    'Sleep Duration_5-6 hours', 'Sleep Duration_7-8 hours', 'Sleep Duration_Others',
    'Dietary Habits_Moderate', 'Dietary Habits_Others',
    'Job Satisfaction',
    'Family History of Mental Illness',
]
df = df.drop(columns_to_drop, axis=1)

In [None]:
# Display the frame after the feature reduction.
df

In [547]:
# Rebuild the model inputs from the reduced frame.
# An explicit float dtype is requested because a frame that mixes bool
# indicator columns (from get_dummies) with int/float columns would otherwise
# be returned by pandas as an object array.
X = df.drop(columns=['Depression']).to_numpy(dtype=float)
y = df['Depression'].to_numpy()
# Same 80/20 hold-out and seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a gradient-boosting classifier on the reduced feature set.
# random_state is pinned, as for every other estimator in this notebook, so
# the fitted model and its reported metrics are reproducible between runs.
best_model = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
print("Training Gradient Boosting Model")
best_model.fit(X_train, y_train)


# Evaluate on the held-out split.
y_pred = best_model.predict(X_test)
print("Gradient Boosting Model Performance:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")


# Persist under a separate file name so the full-feature model is kept.
joblib.dump(best_model, 'New_Student_Depression_Model.pkl')
print("Model saved as 'New_Student_Depression_Model.pkl'")