In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
def find_root():
    """Locate the project root by walking up from the current working directory.

    The working directory and up to four of its ancestors are inspected.
    A directory that actually contains ``results/outputs/04_pca_reduced.csv``
    is preferred; only if none of the inspected directories has that file
    does a directory that merely contains a ``results`` entry qualify.
    (Previously any ``results`` folder closer to the working directory would
    shadow the real project root holding the data file.)

    Returns:
        Path: the detected root, or the current working directory when no
        candidate matches.
    """
    start = Path.cwd()
    # Same search depth as before: cwd plus at most four parents.
    candidates = [start, *start.parents][:5]
    marker = Path('results') / 'outputs' / '04_pca_reduced.csv'

    # Pass 1: strongest evidence - the input file this notebook needs.
    for p in candidates:
        if (p / marker).exists():
            return p

    # Pass 2: fall back to any directory that has a `results` entry.
    for p in candidates:
        if (p / 'results').exists():
            return p

    return start

# Resolve the project folders relative to the detected root.
ROOT = find_root()
VIS_DIR = ROOT/'results'/'eda_visualizations'
OUT_DIR = ROOT/'results'/'outputs'
# The plotting cells write PNGs into VIS_DIR via plt.savefig, which does not
# create missing folders - make sure the directory exists before any figure is saved.
VIS_DIR.mkdir(parents=True, exist_ok=True)

# Load the PCA-reduced dataset written to OUT_DIR.
df = pd.read_csv(OUT_DIR/'04_pca_reduced.csv')
TARGET = 'Depression'
print(df.shape)


# Numeric columns excluding the target, split into PCA components
# (names starting with 'PCA_') and every other numeric column.
num_cols = df.select_dtypes(include=[np.number]).columns.drop(TARGET, errors='ignore')
pca_cols = [c for c in df.columns if c.startswith('PCA_')]
other_num = [c for c in num_cols if c not in pca_cols]
df.head()

(27901, 13)


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,Depression
0,1.354889,-0.461946,-1.304167,-0.553253,1.339322,2.171098,-0.46477,-0.124972,-1.283599,1.268966,1.423577,0.920441,1
1,-2.137472,-1.382699,0.934879,0.333345,-0.752611,-1.465197,-1.806167,-1.050295,-1.165202,-0.167835,0.749534,-0.800566,0
2,-2.089977,-0.39226,-0.89418,-2.097222,0.607102,0.326287,-1.983398,0.31208,0.471597,1.269501,1.118669,-0.864279,0
3,1.022568,-1.523374,0.414855,1.907671,-0.342986,-1.364335,-1.877241,-0.879892,-0.74827,0.375049,0.579325,1.195464,1
4,0.097937,-0.91212,1.411545,0.414433,0.397165,0.195281,-0.460175,-0.782131,-2.592791,0.096868,0.476846,0.803571,0


In [3]:
# Standardise the numeric columns that are NOT PCA components.
# Work on a copy so the loaded frame `df` stays available for the
# before/after comparison plots.
df_scaled = df.copy()
scaler = StandardScaler()
if len(other_num) > 0:
    non_pca_values = df_scaled[other_num]
    df_scaled[other_num] = scaler.fit_transform(non_pca_values)
df_scaled.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,Depression
0,1.354889,-0.461946,-1.304167,-0.553253,1.339322,2.171098,-0.46477,-0.124972,-1.283599,1.268966,1.423577,0.920441,1
1,-2.137472,-1.382699,0.934879,0.333345,-0.752611,-1.465197,-1.806167,-1.050295,-1.165202,-0.167835,0.749534,-0.800566,0
2,-2.089977,-0.39226,-0.89418,-2.097222,0.607102,0.326287,-1.983398,0.31208,0.471597,1.269501,1.118669,-0.864279,0
3,1.022568,-1.523374,0.414855,1.907671,-0.342986,-1.364335,-1.877241,-0.879892,-0.74827,0.375049,0.579325,1.195464,1
4,0.097937,-0.91212,1.411545,0.414433,0.397165,0.195281,-0.460175,-0.782131,-2.592791,0.096868,0.476846,0.803571,0


In [4]:
# Histogram: overlay the first non-PCA numeric feature before and after scaling.
# Nothing is drawn when there is no such feature.
feat = other_num[0] if other_num else None
if feat:
    plt.figure()
    df[feat].plot.hist(bins=30, alpha=0.6, label='before')
    df_scaled[feat].plot.hist(bins=30, alpha=0.6, label='after')
    plt.legend()
    plt.title(f'{feat}: before vs after scaling')
    plt.tight_layout()
    plt.savefig(VIS_DIR/'17_scaling_hist_before_after.png', dpi=150)
    plt.close()

In [5]:
# KDE: compare the density of the second non-PCA numeric feature before and
# after scaling. Skipped when fewer than two such features exist.
feat2 = None if len(other_num) <= 1 else other_num[1]
if feat2:
    plt.figure()
    df[feat2].plot.kde(label='before')
    df_scaled[feat2].plot.kde(label='after')
    plt.legend()
    plt.title(f'{feat2}: KDE before vs after')
    plt.tight_layout()
    plt.savefig(VIS_DIR/'18_scaling_kde.png', dpi=150)
    plt.close()

In [6]:
# Sanity check on the first PCA component: overlay its histogram from the
# loaded frame and from the scaled frame to see whether the scaling step
# changed it. Labels and title are derived from the actual column name
# instead of assuming that the first PCA column is called 'PCA_1'.
if pca_cols:
    first_pc = pca_cols[0]
    plt.figure()
    df[first_pc].plot(kind='hist', bins=30, alpha=0.6, label=f'{first_pc} (pre)')
    df_scaled[first_pc].plot(kind='hist', bins=30, alpha=0.6, label=f'{first_pc} (post)')
    plt.legend()
    plt.title(f'{first_pc} distribution pre/post scaling step')
    plt.tight_layout()
    plt.savefig(VIS_DIR/'19_scaling_pca_check.png', dpi=150)
    plt.close()

In [7]:
# Box plot of every scaled non-PCA numeric feature; skipped when none exist.
if len(other_num) > 0:
    scaled_non_pca = df_scaled[other_num]
    scaled_non_pca.plot.box(title='Scaled (non-PCA) numeric features')
    plt.tight_layout()
    plt.savefig(VIS_DIR/'20_scaling_boxplot.png', dpi=150)
    plt.close()

In [8]:
# Persist the scaled frame to OUT_DIR (index omitted) and echo the
# destination path as the cell output.
out_path = OUT_DIR.joinpath('05_scaled.csv')
df_scaled.to_csv(path_or_buf=out_path, index=False)
out_path

WindowsPath('D:/Depression_detector/results/outputs/05_scaled.csv')