In [None]:
# =========================
# 1. Import Libraries
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)

# =========================
# 2. Load the Dataset
# =========================
import io

from google.colab import files

# Path used only as a fallback when nothing is uploaded in this run.
DATA_FILENAME = 'Expanded_data_with_more_features.csv'

# files.upload() returns a dict mapping each uploaded file's name to its raw
# bytes. When a file with the same name already exists, Colab saves the new
# upload under a different name (e.g. "name (1).csv"), so reading the
# hard-coded path would silently load the older copy. Parse the bytes that
# were just uploaded instead.
uploaded = files.upload()  # Upload your CSV file

if uploaded:
    # Use the first uploaded file's content directly.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing uploaded: fall back to a file already present on disk.
    df = pd.read_csv(DATA_FILENAME)

# =========================
# 3. Explore the Dataset
# =========================
# Overall size as (rows, columns).
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

# Peek at the first records.
print("\nFirst few rows:")
head_preview = df.head()
display(head_preview)

# Column dtypes and non-null counts (df.info() prints its own report).
print("\nDataset Info:")
df.info()

# Descriptive statistics as returned by describe().
print("\nSummary Statistics:")
summary_statistics = df.describe()
display(summary_statistics)

# Number of missing entries per column.
print("\nMissing Values in each column:")
missing_per_column = df.isna().sum()
display(missing_per_column)

# =========================
# 4. Data Preprocessing
# =========================
target_column = 'Depression_Status'

# Identify numeric and categorical features.
# Adjust these lists based on your dataset
numeric_features = ['Age', 'CGPA', 'Sleep Duration']
categorical_features = ['Gender', 'City', 'Profession', 'Work Pressure',
                        'Academic Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Dietary Habits']

# Fail fast with an actionable message when the loaded CSV does not have the
# columns this pipeline expects (e.g. the wrong dataset was uploaded), rather
# than surfacing a bare KeyError on the first column access.
expected_columns = [target_column, *numeric_features, *categorical_features]
missing_columns = [col for col in expected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Dataset is missing expected column(s): {missing_columns}. "
        f"Available columns: {list(df.columns)}. "
        "Upload the matching CSV or adjust the target/feature lists above."
    )

# Convert target variable "Depression_Status" to binary (Yes=1, No=0).
# Series.map turns every value outside the mapping into NaN; reject such
# labels here instead of letting NaN targets flow into the later steps.
target_codes = df[target_column].map({'Yes': 1, 'No': 0})
unmapped_rows = target_codes.isna()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, target_column].unique().tolist()
    raise ValueError(
        f"Column '{target_column}' contains values other than 'Yes'/'No': "
        f"{unexpected_labels}"
    )
df[target_column] = target_codes.astype(int)

# Handle missing values: median for numeric columns, most frequent value for
# categorical columns.
for col in numeric_features:
    df[col] = df[col].fillna(df[col].median())
for col in categorical_features:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encode categorical variables using one-hot encoding.
# NOTE(review): only the columns listed in categorical_features are encoded;
# any other non-numeric column stays as-is in X - confirm none remain.
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Separate features and target variable ('ID' is dropped only if present).
X = df_encoded.drop(['ID', target_column], axis=1, errors='ignore')
y = df_encoded[target_column]

# Scale numeric features.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics influence the scaled features. Consider
# fitting it on the training split only.
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

print("\nProcessed feature set preview:")
display(X.head())

# =========================
# 5. Exploratory Data Analysis (EDA)
# =========================
target_axis_label = "Depression Status (0: No, 1: Yes)"

# Class balance of the target variable.
count_fig, count_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=count_ax)
count_ax.set_title("Distribution of Depression Status")
count_ax.set_xlabel(target_axis_label)
count_ax.set_ylabel("Count")
plt.show()

# Correlations are restricted to the numeric features plus the target to
# keep the heatmap fast to compute.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
numeric_df = df_encoded[numeric_features + ['Depression_Status']]
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("Correlation Matrix (Numeric Features Only)")
plt.show()

# CGPA spread for each target class (uses the un-encoded frame).
box_fig, box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Depression_Status', y='CGPA', data=df, ax=box_ax)
box_ax.set_title("CGPA Distribution by Depression Status")
box_ax.set_xlabel(target_axis_label)
box_ax.set_ylabel("CGPA")
plt.show()

# =========================
# 6. Split Data into Training and Testing Sets
# =========================
# 70/30 split with a fixed seed; stratifying on y keeps the class
# proportions the same in both partitions.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_label, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} set shape:", split_features.shape)

def evaluate_classifier(model, model_name, X_eval, y_eval, cmap):
    """Report test metrics for a fitted binary classifier and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        model_name: Label used in the printed header and the plot title.
        X_eval: Feature matrix to evaluate on.
        y_eval: True labels for ``X_eval``.
        cmap: Matplotlib colormap name for the confusion-matrix heatmap.

    Returns:
        Tuple ``(y_pred, y_proba, cm)``: predicted labels, the second column
        of ``predict_proba`` (used as the positive-class score for ROC AUC),
        and the confusion matrix.
    """
    y_pred = model.predict(X_eval)
    y_proba = model.predict_proba(X_eval)[:, 1]

    print(f"=== {model_name} Evaluation ===")
    print("Accuracy: ", accuracy_score(y_eval, y_pred))
    print("Precision: ", precision_score(y_eval, y_pred))
    print("Recall: ", recall_score(y_eval, y_pred))
    print("F1 Score: ", f1_score(y_eval, y_pred))
    # ROC AUC is computed from scores, not from hard predictions.
    print("ROC AUC: ", roc_auc_score(y_eval, y_proba))
    print("\nClassification Report:\n", classification_report(y_eval, y_pred))

    cm = confusion_matrix(y_eval, y_pred)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    return y_pred, y_proba, cm


# =========================
# 7. Model Training: Logistic Regression
# =========================
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

y_pred_lr, y_proba_lr, cm_lr = evaluate_classifier(
    lr_model, "Logistic Regression", X_test, y_test, cmap='Blues')

# =========================
# 8. Model Training: Random Forest Classifier
# =========================
# Optionally, reduce n_estimators for faster execution
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf, y_proba_rf, cm_rf = evaluate_classifier(
    rf_model, "Random Forest", X_test, y_test, cmap='Greens')

# =========================
# 9. Optional: Cross-Validation and Feature Importance
# =========================
# Use fewer folds for faster cross-validation (e.g., cv=3)
cv_scores = cross_val_score(rf_model, X, y, cv=3, scoring='roc_auc')
print("Random Forest Cross-Validated ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", np.mean(cv_scores))

# Feature Importance Plot: rank features from most to least important.
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = X.columns
ranked_feature_names = list(feature_names[indices])

plt.figure(figsize=(12,6))
plt.title("Feature Importances (Random Forest)")
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so map the
# colors through `hue`. dodge=False keeps one full-width bar per feature.
importance_ax = sns.barplot(x=importances[indices], y=ranked_feature_names,
                            hue=ranked_feature_names, dodge=False,
                            palette="viridis")
# The hue legend would only repeat the y-axis labels; drop it if one was drawn.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.xlabel("Relative Importance")
plt.ylabel("Features")
plt.show()

# =========================
# 10. Conclusion
# =========================
print("The optimized analysis pipeline has completed successfully!")


Saving Expanded_data_with_more_features.csv to Expanded_data_with_more_features (1).csv
Dataset Shape: (30641, 15)

First few rows:


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30641 entries, 0 to 30640
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           30641 non-null  int64  
 1   Gender               30641 non-null  object 
 2   EthnicGroup          28801 non-null  object 
 3   ParentEduc           28796 non-null  object 
 4   LunchType            30641 non-null  object 
 5   TestPrep             28811 non-null  object 
 6   ParentMaritalStatus  29451 non-null  object 
 7   PracticeSport        30010 non-null  object 
 8   IsFirstChild         29737 non-null  object 
 9   NrSiblings           29069 non-null  float64
 10  TransportMeans       27507 non-null  object 
 11  WklyStudyHours       29686 non-null  object 
 12  MathScore            30641 non-null  int64  
 13  ReadingScore         30641 non-null  int64  
 14  WritingScore         30641 non-null  int64  
dtypes: float64(1), int64(4), object(10)

Unnamed: 0.1,Unnamed: 0,NrSiblings,MathScore,ReadingScore,WritingScore
count,30641.0,29069.0,30641.0,30641.0,30641.0
mean,499.556607,2.145894,66.558402,69.377533,68.418622
std,288.747894,1.458242,15.361616,14.758952,15.443525
min,0.0,0.0,0.0,10.0,4.0
25%,249.0,1.0,56.0,59.0,58.0
50%,500.0,2.0,67.0,70.0,69.0
75%,750.0,3.0,78.0,80.0,79.0
max,999.0,7.0,100.0,100.0,100.0



Missing Values in each column:


Unnamed: 0,0
Unnamed: 0,0
Gender,0
EthnicGroup,1840
ParentEduc,1845
LunchType,0
TestPrep,1830
ParentMaritalStatus,1190
PracticeSport,631
IsFirstChild,904
NrSiblings,1572


KeyError: 'Depression_Status'