In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

In [2]:
def find_root():
    p = Path.cwd()
    for _ in range(8):
        if (p/'data'/'raw'/'student_depression_dataset.csv').exists() and (p/'results').exists():
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
DATA_PATH = ROOT/'data'/'raw'/'student_depression_dataset.csv'
VIS_DIR  = ROOT/'results'/'eda_visualizations'
OUT_DIR  = ROOT/'results'/'outputs'
VIS_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

TARGET = 'Depression'
df0 = pd.read_csv(DATA_PATH).copy()
if 'Sleep Duration' in df0.columns:
    df0['Sleep Duration'] = df0['Sleep Duration'].astype(str).str.strip("'").str.strip('"')

print("Loaded raw:", DATA_PATH.resolve())
print("Raw shape:", df0.shape)
df0.head()

Loaded raw: D:\Depression_detector\data\raw\student_depression_dataset.csv
Raw shape: (27901, 18)


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [3]:
# CELL 2: Encoding categorical variables
df = df0.copy()

cat_cols = df.select_dtypes(include='object').columns.tolist()
binary_cat = [c for c in cat_cols if df[c].nunique(dropna=False) == 2]

# Label-encode binary categoricals
for c in binary_cat:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(str))

# One-hot encode the remaining categoricals
onehot_cols = [c for c in cat_cols if c not in binary_cat]
if onehot_cols:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    oh = ohe.fit_transform(df[onehot_cols])
    oh_cols = ohe.get_feature_names_out(onehot_cols)
    oh_df = pd.DataFrame(oh, columns=oh_cols, index=df.index)
    df = pd.concat([df.drop(columns=onehot_cols), oh_df], axis=1)

df_enc = df.copy()

print("\n[END Encoding]")
print("df_enc.shape:", df_enc.shape)
df_enc.head()



[END Encoding]
df_enc.shape: (27901, 121)


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# CELL 3: Outlier handling with IQR winsorization
def iqr_winsorize(s, k=1.5):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return s.clip(q1 - k*iqr, q3 + k*iqr)

df_o = df_enc.copy()
num_cols = df_o.select_dtypes(include=[np.number]).columns.drop(TARGET, errors='ignore')
for c in num_cols:
    df_o[c] = iqr_winsorize(df_o[c])

print("\n[END Outliers]")
print("df_o.shape:", df_o.shape)
df_o.head()



[END Outliers]
df_o.shape: (27901, 121)


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# CELL 4: Feature engineering
df_fe = df_o.copy()

# Ensure columns exist for formulas
for req in ['Academic Pressure', 'Work Pressure', 'Study Satisfaction']:
    if req not in df_fe.columns:
        df_fe[req] = 0.0

# 1) Pressure intensity
df_fe['Pressure_Intensity'] = (df_fe['Academic Pressure'] + df_fe['Work Pressure']) / (df_fe['Study Satisfaction'] + 1)

# 2) Sleep hours estimation from Sleep Duration (if present in raw)
sleep_map = {
    'Less than 5 hours': 4.5, '5-6 hours': 5.5, '6-7 hours': 6.5,
    '7-8 hours': 7.5, 'More than 8 hours': 8.5
}
if 'Sleep Duration' in df0.columns:
    sd = df0['Sleep Duration'].astype(str).str.replace('"','').str.replace("'","")
    sd = sd.str.replace('to','-').str.replace('hours','').str.strip()
    sd = sd.replace({'5-6':'5-6 hours','6-7':'6-7 hours','7-8':'7-8 hours'})
    df_fe['Sleep_Hours_Est'] = sd.map(sleep_map).fillna(6.5)
else:
    df_fe['Sleep_Hours_Est'] = 6.5

# 3) Student indicator based on Profession (from raw text)
df_fe['Is_Student'] = 0
if 'Profession' in df0.columns:
    df_fe['Is_Student'] = df0['Profession'].astype(str).str.lower().str.contains('student').astype(int)

# 4) Interaction
if 'CGPA' in df_fe.columns and 'Study Satisfaction' in df_fe.columns:
    df_fe['CGPAxStudySat'] = df_fe['CGPA'] * df_fe['Study Satisfaction']

print("\n[END Feature Engineering]")
print("df_fe.shape:", df_fe.shape)
df_fe.head()



[END Feature Engineering]
df_fe.shape: (27901, 125)


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?,Pressure_Intensity,Sleep_Hours_Est,Is_Student,CGPAxStudySat
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.666667,5.5,1,17.94
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,5.5,1,29.5
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,6.5,1,35.15
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.5,1,11.18
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.5,1,24.39


In [6]:
# CELL 5: PCA + scaling of remaining numeric
X = df_fe.drop(columns=[TARGET])
y = df_fe[TARGET]

# Scale numeric for PCA
num_cols = X.select_dtypes(include=[np.number]).columns
scaler_num = StandardScaler()
X_num_scaled = scaler_num.fit_transform(X[num_cols])

# PCA retain 95% variance
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_num_scaled)
X_pca_df = pd.DataFrame(X_pca, columns=[f'PCA_{i+1}' for i in range(X_pca.shape[1])])

# Recombine: non-numeric (if any) + PCA + target
df_pca = pd.concat([
    X.drop(columns=num_cols).reset_index(drop=True),
    X_pca_df.reset_index(drop=True),
    y.reset_index(drop=True)
], axis=1)

# Scale any remaining numeric columns that were NOT part of PCA (usually none, but kept for generality)
pca_cols = [c for c in df_pca.columns if c.startswith('PCA_')]
other_num = df_pca.select_dtypes(include=[np.number]).columns.drop(pca_cols + [TARGET], errors='ignore')
df_sc = df_pca.copy()
if len(other_num):
    df_sc[other_num] = StandardScaler().fit_transform(df_sc[other_num])

print("\n[END PCA + Scaling]")
print("Explained variance (cumulative):", float(np.sum(pca.explained_variance_ratio_)))
print("df_sc.shape:", df_sc.shape)
df_sc.head()



[END PCA + Scaling]
Explained variance (cumulative): 0.9894810057075297
df_sc.shape: (27901, 15)


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,PCA_13,PCA_14,Depression
0,1.345733,-1.189052,-0.360244,-1.439301,1.33424,2.229301,0.0626,0.424245,-0.397411,-1.288575,-0.219147,1.213798,1.695695,1.232469,1
1,-2.146965,-1.387035,-1.228293,0.916543,-0.823694,-1.395977,0.285489,1.837112,-1.15083,-0.934695,-1.009531,-0.041661,1.12298,-0.522067,0
2,-2.093255,-1.022261,-0.184173,-1.363837,0.987528,0.173374,1.283317,1.745278,0.955815,0.151978,1.457303,0.925073,0.766565,-1.076069,0
3,1.03456,2.031836,-1.85257,0.988217,-0.618096,-1.345814,0.484203,1.954781,-0.63095,-0.645493,0.237955,0.272683,0.437379,1.08341,1
4,0.088718,-1.285079,-0.764918,1.413909,0.291767,0.280937,-0.150941,0.513996,-1.580528,-2.252769,-0.757421,0.200893,0.788921,1.096808,0


In [7]:
# CELL 6: SMOTE class balancing (safe install)
import sys, subprocess

try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "imbalanced-learn"])
    from imblearn.over_sampling import SMOTE

X_fin = df_sc.drop(columns=[TARGET])
y_fin = df_sc[TARGET]

print("\nClass distribution before SMOTE:")
print(y_fin.value_counts())

sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X_fin, y_fin)

df_final = pd.concat([
    pd.DataFrame(X_bal, columns=X_fin.columns),
    pd.Series(y_bal, name=TARGET)
], axis=1)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_bal).value_counts())

print("\n[END SMOTE]")
print("df_final.shape:", df_final.shape)
df_final.head()



Class distribution before SMOTE:
Depression
1    16336
0    11565
Name: count, dtype: int64

Class distribution after SMOTE:
Depression
1    16336
0    16336
Name: count, dtype: int64

[END SMOTE]
df_final.shape: (32672, 15)


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,PCA_13,PCA_14,Depression
0,1.345733,-1.189052,-0.360244,-1.439301,1.33424,2.229301,0.0626,0.424245,-0.397411,-1.288575,-0.219147,1.213798,1.695695,1.232469,1
1,-2.146965,-1.387035,-1.228293,0.916543,-0.823694,-1.395977,0.285489,1.837112,-1.15083,-0.934695,-1.009531,-0.041661,1.12298,-0.522067,0
2,-2.093255,-1.022261,-0.184173,-1.363837,0.987528,0.173374,1.283317,1.745278,0.955815,0.151978,1.457303,0.925073,0.766565,-1.076069,0
3,1.03456,2.031836,-1.85257,0.988217,-0.618096,-1.345814,0.484203,1.954781,-0.63095,-0.645493,0.237955,0.272683,0.437379,1.08341,1
4,0.088718,-1.285079,-0.764918,1.413909,0.291767,0.280937,-0.150941,0.513996,-1.580528,-2.252769,-0.757421,0.200893,0.788921,1.096808,0


In [8]:
# CELL 7: Save final dataset
FINAL = OUT_DIR/'final_cleaned.csv'
df_final.to_csv(FINAL, index=False)
print("Saved final cleaned dataset to:", FINAL.resolve())

Saved final cleaned dataset to: D:\Depression_detector\results\outputs\final_cleaned.csv
