In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def find_root():
    """Locate the project root by searching upward for the raw dataset.

    The current working directory is checked first, followed by up to four
    of its ancestors, for ``data/raw/student_depression_dataset.csv``.

    Returns:
        Path: the first directory examined that contains the dataset file,
        or the current working directory when none of them does.
    """
    dataset_rel = Path('data') / 'raw' / 'student_depression_dataset.csv'
    start = Path.cwd()
    # The start directory plus at most four parents: five candidates in total.
    candidates = [start, *start.parents][:5]
    for candidate in candidates:
        if (candidate / dataset_rel).exists():
            return candidate
    return start

In [3]:
# Resolve the project root once; every input and output location hangs off it.
ROOT = find_root()

# Input: the raw dataset file.
DATA_PATH = ROOT.joinpath('data', 'raw', 'student_depression_dataset.csv')

# Outputs: saved figures and generated data files.
VIS_DIR = ROOT.joinpath('results', 'eda_visualizations')
OUT_DIR = ROOT.joinpath('results', 'outputs')

# Create the output folders (and any missing parents); no error if they exist.
for _result_dir in (VIS_DIR, OUT_DIR):
    _result_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Name of the label column; later cells refer to it through this constant.
TARGET = 'Depression'

# Read the raw dataset from the configured path, report its (rows, columns)
# shape, and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.shape)
df.head()

(27901, 18)


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [5]:
# The raw 'Sleep Duration' labels arrive wrapped in quote characters
# (e.g. "'5-6 hours'"); normalise them to the bare label text.
sleep_col = 'Sleep Duration'
if sleep_col in df.columns:
    cleaned = df[sleep_col].astype(str)
    # Order matters: surrounding whitespace first (None = default whitespace
    # strip), then single quotes, then double quotes.
    for strip_chars in (None, "'", '"'):
        cleaned = cleaned.str.strip(strip_chars)
    df[sleep_col] = cleaned

df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [6]:
# Columns stored with object dtype are treated as categorical features.
# NOTE(review): the recorded output lists 'Financial Stress' here and the
# encoded frame gains a 'Financial Stress_?' column, so the raw file appears to
# contain '?' placeholders in that column - confirm whether it should instead
# be parsed as numeric with missing values.
cat_cols = list(df.select_dtypes(include=['object']).columns)

# Everything that is neither categorical nor the target counts as numeric
# (this includes the 'id' column).
non_numeric = set(cat_cols) | {TARGET}
num_cols = [name for name in df.columns if name not in non_numeric]

cat_cols, num_cols

(['Gender',
  'City',
  'Profession',
  'Sleep Duration',
  'Dietary Habits',
  'Degree',
  'Have you ever had suicidal thoughts ?',
  'Financial Stress',
  'Family History of Mental Illness'],
 ['id',
  'Age',
  'Academic Pressure',
  'Work Pressure',
  'CGPA',
  'Study Satisfaction',
  'Job Satisfaction',
  'Work/Study Hours'])

In [7]:
# Encode the categorical columns on a copy so the cleaned frame `df` is left
# untouched for the plotting cells that follow.
df_enc = df.copy()

# Categorical columns with exactly two distinct values (a missing value counts
# as a value) are collapsed to a single integer-coded column. Values are cast
# to str first so every entry is encoded as text.
binary_cat = [c for c in cat_cols if df[c].nunique(dropna=False) == 2]
for c in binary_cat:
    le = LabelEncoder()
    df_enc[c] = le.fit_transform(df_enc[c].astype(str))

# All remaining categorical columns are expanded into one indicator column per
# level, named '<column>_<level>', replacing the original columns.
onehot_cols = [c for c in cat_cols if c not in binary_cat]
if onehot_cols:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    oh = ohe.fit_transform(df_enc[onehot_cols])
    oh_cols = ohe.get_feature_names_out(onehot_cols)
    # Reuse df_enc's index so the indicator block aligns row-for-row on concat.
    oh_df = pd.DataFrame(oh, columns=oh_cols, index=df_enc.index)
    df_enc = pd.concat([df_enc.drop(columns=onehot_cols), oh_df], axis=1)

# A bare `df_enc.shape` here would be discarded, because only a cell's last
# expression is displayed; print it explicitly, as the loading cell does.
print(df_enc.shape)
df_enc.head()

Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Bar chart of row counts per gender, taken from the un-encoded frame so the
# tick labels stay readable; the figure is written to disk and then closed.
if 'Gender' in df.columns:
    gender_counts = df['Gender'].value_counts()
    ax = gender_counts.plot.bar(title='Gender counts')
    ax.set(xlabel='Gender', ylabel='Count')
    plt.tight_layout()
    plt.savefig(VIS_DIR / '01_encoding_gender_counts.png', dpi=150)
    plt.close()


In [9]:
# Bar chart of the ten most frequent cities (value_counts sorts by descending
# count, so the first ten entries are the largest); saved to disk and closed.
if 'City' in df.columns:
    top_city_counts = df['City'].value_counts().head(10)
    ax = top_city_counts.plot.bar(title='Top 10 Cities')
    ax.set(xlabel='City', ylabel='Count')
    plt.tight_layout()
    plt.savefig(VIS_DIR / '02_encoding_top_cities.png', dpi=150)
    plt.close()

In [10]:
# Columns to correlate: the label-encoded binary features plus the target,
# keeping only those actually present in the encoded frame.
bin_like = [c for c in binary_cat if c in df_enc.columns]
if TARGET in df_enc.columns:
    bin_like.append(TARGET)

# A correlation matrix needs at least two columns; otherwise skip the figure.
if len(bin_like) >= 2:
    # Pairwise correlation between the selected columns
    corr = df_enc[bin_like].corr()

    plt.figure(figsize=(8, 6))  # larger canvas so the annotations stay legible
    # Diverging palette between hues 220 and 20
    cmap = sns.diverging_palette(220, 20, as_cmap=True)

    sns.heatmap(
        corr,
        cmap=cmap,
        # Pin the colour scale to the full correlation range and centre it on
        # zero. Without this the scale is fitted to the observed min/max, so
        # the palette's neutral midpoint would not correspond to "no
        # correlation" and colours would not be comparable between runs.
        vmin=-1,
        vmax=1,
        center=0,
        annot=True,                 # write the coefficient in each cell
        fmt=".2f",                  # two decimal places
        linewidths=0.5,             # thin grid lines between cells
        cbar_kws={"shrink": 0.8},   # slightly smaller colour bar
        square=True,                # keep cells square
        annot_kws={"size": 10},     # annotation font size
    )

    plt.title('Correlation Heatmap (Binary Encodings + Target)', fontsize=14, pad=12)
    plt.tight_layout()
    plt.savefig(VIS_DIR/'03_encoding_binary_corr_clean.png', dpi=200)
    plt.close()


In [11]:
# Compare the number of columns in the raw frame with the encoded frame.
stage_labels = ['Before', 'After']
column_counts = [len(df.columns), len(df_enc.columns)]

plt.figure()
plt.bar(stage_labels, column_counts)
plt.title('Feature count: before vs after encoding')
plt.tight_layout()
plt.savefig(VIS_DIR / '04_encoding_feature_count.png', dpi=150)
plt.close()

In [12]:
# Write the encoded frame to the outputs folder without the row index, then
# show the destination path as the cell result.
out_path = OUT_DIR.joinpath('01_encoded.csv')
df_enc.to_csv(path_or_buf=out_path, index=False)
out_path

WindowsPath('D:/Depression_detector/results/outputs/01_encoded.csv')