In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

In [2]:
def find_root():
    """Locate the project root by walking up from the current working directory.

    The working directory and up to four of its ancestors are inspected,
    nearest first. The first one that contains either the winsorized CSV
    under ``results/outputs`` or a ``results`` entry is returned. If none of
    the inspected directories qualifies, the working directory is returned.
    """
    start = Path.cwd()
    csv_marker = Path('results', 'outputs', '02_outliers_winsorized.csv')
    # Five candidates in total: the starting directory plus four levels above it.
    for candidate in [start, *start.parents][:5]:
        has_csv = (candidate / csv_marker).exists()
        if has_csv or (candidate / 'results').exists():
            return candidate
    return start

# Project layout, resolved relative to the detected root directory.
ROOT = find_root()
OUT_DIR = ROOT.joinpath('results', 'outputs')
VIS_DIR = ROOT.joinpath('results', 'eda_visualizations')

# Name of the label column used by the cells below.
TARGET = 'Depression'

# Load the winsorized dataset written by the previous pipeline stage.
df = pd.read_csv(OUT_DIR.joinpath('02_outliers_winsorized.csv'))
print(df.shape)
df.head()

(27901, 121)


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# CELL 2: Create features (adaptable if cols missing)
# Works on a copy so the loaded frame `df` is left untouched.
df_fe = df.copy()

# Best-effort: an absent input column is filled with 0.0 so the derived
# features below can still be computed.
for req in ['Academic Pressure','Work Pressure','Study Satisfaction']:
    if req not in df_fe.columns:
        df_fe[req] = 0.0

# Combined pressure scaled by study satisfaction; the +1 keeps a satisfaction
# score of 0 from producing a division by zero.
df_fe['Pressure_Intensity'] = (df_fe['Academic Pressure'] + df_fe['Work Pressure']) / (df_fe['Study Satisfaction'] + 1)

# Midpoint estimate (in hours) for each sleep-duration category.
sleep_map = {
    'Less than 5 hours': 4.5, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 8.5
}


def _canon_sleep(labels):
    """Reduce sleep-duration labels to a canonical key such as '5-6' or 'less than 5'.

    Lower-cases the text, removes quote characters and the word 'hour(s)',
    rewrites ' to ' and spaced dashes as a bare '-', and collapses whitespace.
    Returns a Series of strings with the same index as the input.
    """
    s = pd.Series(labels).astype(str).str.lower()
    s = s.str.replace("[\"']", '', regex=True)
    s = s.str.replace(r'\bhours?\b', '', regex=True)
    s = s.str.replace(r'\s+to\s+|\s*-\s*', '-', regex=True)
    return s.str.replace(r'\s+', ' ', regex=True).str.strip()


if 'Sleep Duration' in df.columns:
    # Canonicalise the map keys and the column the same way so that every
    # category in sleep_map can match, including 'Less than 5 hours' and
    # 'More than 8 hours' and variants written as '5 to 6 hours'.
    sleep_lookup = dict(zip(_canon_sleep(list(sleep_map)), sleep_map.values()))
    # Labels that match no category (including missing values) fall back to 6.5.
    df_fe['Sleep_Hours_Est'] = _canon_sleep(df['Sleep Duration']).map(sleep_lookup).fillna(6.5)
else:
    # NOTE(review): without a 'Sleep Duration' column every row receives the
    # same 6.5 estimate, so the feature carries no signal. If the column was
    # one-hot encoded upstream, consider rebuilding it first - TODO confirm.
    df_fe['Sleep_Hours_Est'] = 6.5

# Flag rows whose profession text mentions 'student' (case-insensitive);
# stays 0 for every row when there is no 'Profession' column.
df_fe['Is_Student'] = 0
if 'Profession' in df.columns:
    df_fe['Is_Student'] = df['Profession'].astype(str).str.lower().str.contains('student').astype(int)

# Interaction term between grade average and study satisfaction.
if 'CGPA' in df_fe.columns and 'Study Satisfaction' in df_fe.columns:
    df_fe['CGPAxStudySat'] = df_fe['CGPA'] * df_fe['Study Satisfaction']

df_fe.head()


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?,Pressure_Intensity,Sleep_Hours_Est,Is_Student,CGPAxStudySat
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.666667,6.5,0,17.94
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,6.5,0,29.5
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,6.5,0,35.15
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.5,0,11.18
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.5,0,24.39


In [4]:
# Density of the estimated sleep hours, one curve (or histogram) per target
# class, written to VIS_DIR as a PNG. Skipped when either column is missing.
if 'Sleep_Hours_Est' in df_fe.columns and TARGET in df_fe.columns:
    # savefig does not create missing directories, so make sure the target exists.
    VIS_DIR.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots()

    for cls, grp in df_fe.groupby(TARGET):
        s = pd.to_numeric(grp['Sleep_Hours_Est'], errors='coerce').dropna()
        if s.empty:
            # Nothing to draw for this class.
            continue
        label = f'{TARGET}={cls}'
        # KDE needs at least 3 points and non-zero variance
        if len(s) >= 3 and s.std() > 0 and s.nunique() > 1:
            xs = np.linspace(s.min(), s.max(), 200)
            ax.plot(xs, gaussian_kde(s)(xs), label=label)
        else:
            # Per-class fallback: a density histogram, so a class that cannot
            # support a KDE still appears in the figure.
            ax.hist(s, bins=20, density=True, alpha=0.5, label=label)

    ax.set_title('Sleep hours density by Depression')
    ax.set_xlabel('Sleep_Hours_Est')
    ax.set_ylabel('Density')
    # Only add a legend when at least one class was drawn.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    fig.tight_layout()
    fig.savefig(VIS_DIR/'10_fe_sleep_density.png', dpi=150)
    # The figure is only needed on disk; close it to release memory.
    plt.close(fig)


In [5]:
# CELL 4: Save
# Write the feature-engineered frame next to the other pipeline outputs,
# without the row index, and show the destination path as the cell result.
out_path = OUT_DIR.joinpath('03_feature_engineered.csv')
df_fe.to_csv(path_or_buf=out_path, index=False)
out_path

WindowsPath('D:/Depression_detector/results/outputs/03_feature_engineered.csv')