In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import subprocess

In [2]:
def find_root():
    """Locate the project root by walking up from the current working directory.

    The current directory and up to four of its ancestors are examined.
    A directory that actually contains ``results/outputs/05_scaled.csv`` is
    preferred; only if none does is the nearest directory with a ``results``
    folder accepted. This ordering matters: checking both conditions with a
    single ``or`` lets any stray ``results`` folder close to the notebook
    shadow the real project root further up.

    Returns:
        Path: the detected root, or ``Path.cwd()`` when nothing matches.
    """
    start = Path.cwd()
    # cwd plus at most four parents (five levels in total).
    candidates = [start, *start.parents][:5]

    # First choice: a directory that really holds the scaled dataset.
    for p in candidates:
        if (p/'results'/'outputs'/'05_scaled.csv').is_file():
            return p

    # Fallback: the nearest directory that at least has a results folder.
    for p in candidates:
        if (p/'results').is_dir():
            return p

    return start

# Name of the label column in the scaled dataset.
TARGET = 'Depression'

# Project directories, all derived from the detected root.
ROOT = find_root()
OUT_DIR = ROOT.joinpath('results', 'outputs')
VIS_DIR = ROOT.joinpath('results', 'eda_visualizations')

# Load the scaled dataset and report its dimensions.
df = pd.read_csv(OUT_DIR.joinpath('05_scaled.csv'))
print(df.shape)

# Separate the label from the feature columns.
y = df[TARGET]
X = df.drop(columns=TARGET)
df.head()

(27901, 13)


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,Depression
0,1.354889,-0.461946,-1.304167,-0.553253,1.339322,2.171098,-0.46477,-0.124972,-1.283599,1.268966,1.423577,0.920441,1
1,-2.137472,-1.382699,0.934879,0.333345,-0.752611,-1.465197,-1.806167,-1.050295,-1.165202,-0.167835,0.749534,-0.800566,0
2,-2.089977,-0.39226,-0.89418,-2.097222,0.607102,0.326287,-1.983398,0.31208,0.471597,1.269501,1.118669,-0.864279,0
3,1.022568,-1.523374,0.414855,1.907671,-0.342986,-1.364335,-1.877241,-0.879892,-0.74827,0.375049,0.579325,1.195464,1
4,0.097937,-0.91212,1.411545,0.414433,0.397165,0.195281,-0.460175,-0.782131,-2.592791,0.096868,0.476846,0.803571,0


In [3]:
# CELL 2: Install + import SMOTE safely

# Import SMOTE; when the package is missing, install imbalanced-learn with the
# interpreter that runs this kernel and retry the import.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    pip_cmd = [sys.executable, "-m", "pip", "install", "imbalanced-learn"]
    subprocess.check_call(pip_cmd)
    from imblearn.over_sampling import SMOTE

# Class balance of the target before resampling.
print('Class distribution before:\n', y.value_counts())

# Oversample with a fixed seed so the resampled data is reproducible.
# NOTE(review): the whole dataset is resampled here; if a train/test split is
# made downstream, synthetic rows may end up in the evaluation set — confirm.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

print('\nClass distribution after:\n', pd.Series(y_res).value_counts())

# Reassemble features and target into a single frame, target as the last column.
features_bal = pd.DataFrame(X_res, columns=X.columns)
target_bal = pd.Series(y_res, name=TARGET)
df_bal = pd.concat([features_bal, target_bal], axis=1)
df_bal.head()


Class distribution before:
 Depression
1    16336
0    11565
Name: count, dtype: int64

Class distribution after:
 Depression
1    16336
0    16336
Name: count, dtype: int64


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,Depression
0,1.354889,-0.461946,-1.304167,-0.553253,1.339322,2.171098,-0.46477,-0.124972,-1.283599,1.268966,1.423577,0.920441,1
1,-2.137472,-1.382699,0.934879,0.333345,-0.752611,-1.465197,-1.806167,-1.050295,-1.165202,-0.167835,0.749534,-0.800566,0
2,-2.089977,-0.39226,-0.89418,-2.097222,0.607102,0.326287,-1.983398,0.31208,0.471597,1.269501,1.118669,-0.864279,0
3,1.022568,-1.523374,0.414855,1.907671,-0.342986,-1.364335,-1.877241,-0.879892,-0.74827,0.375049,0.579325,1.195464,1
4,0.097937,-0.91212,1.411545,0.414433,0.397165,0.195281,-0.460175,-0.782131,-2.592791,0.096868,0.476846,0.803571,0


In [4]:
# CELL 3: (≤4 visuals)
# Writes four diagnostic PNGs into VIS_DIR: class counts before and after
# SMOTE, a scatter of the first two PCA components of the balanced data, and
# a before/after comparison of one feature's mean.

# savefig does not create missing directories, so make sure the target exists.
VIS_DIR.mkdir(parents=True, exist_ok=True)


def _save_and_close(fig, filename):
    """Tighten the layout, write `fig` to VIS_DIR/filename at 150 dpi, close it."""
    fig.tight_layout()
    fig.savefig(VIS_DIR/filename, dpi=150)
    plt.close(fig)  # release the figure so repeated runs do not accumulate them


# counts before/after
fig, ax = plt.subplots()
df[TARGET].value_counts().plot(kind='bar', ax=ax, title='Class counts before SMOTE')
_save_and_close(fig, '21_balance_counts_before.png')

fig, ax = plt.subplots()
pd.Series(y_res).value_counts().plot(kind='bar', ax=ax, title='Class counts after SMOTE')
_save_and_close(fig, '22_balance_counts_after.png')

# PCA scatter (if available)
pc_cols = [c for c in df.columns if isinstance(c, str) and c.startswith('PCA_')]
if len(pc_cols) >= 2:
    samp = min(10000, len(df_bal))
    # Draw a seeded random sample instead of a leading slice: the resampled
    # frame starts with the original rows (its head is identical to df's), so
    # the first `samp` rows would contain no synthetic points and would not
    # reflect the balanced class ratio.
    xs = df_bal.sample(n=samp, random_state=42)
    fig, ax = plt.subplots()
    ax.scatter(xs[pc_cols[0]], xs[pc_cols[1]], c=xs[TARGET], s=5, alpha=0.4)
    ax.set(title='Balanced data (PCA_1 vs PCA_2)', xlabel=pc_cols[0], ylabel=pc_cols[1])
    _save_and_close(fig, '23_balance_pca_scatter.png')

# simple sanity check: mean shift of one feature
feat = pc_cols[0] if pc_cols else df.columns[0]
means = pd.Series({'before': df[feat].mean(), 'after': df_bal[feat].mean()})
# Use an explicit figure so this plot never lands on a leftover axes.
fig, ax = plt.subplots()
means.plot(kind='bar', ax=ax, title=f'{feat} mean: before vs after SMOTE')
_save_and_close(fig, '24_balance_feat_mean.png')


In [5]:
# CELL 4: Save
# Write the balanced frame next to the other pipeline outputs, without the
# index column, and show the destination path as the cell result.
out_path = OUT_DIR.joinpath('06_balanced.csv')
df_bal.to_csv(path_or_buf=out_path, index=False)
out_path

WindowsPath('D:/Depression_detector/results/outputs/06_balanced.csv')