In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
def find_root():
    p = Path.cwd()
    for _ in range(5):
        if (p/'results'/'outputs'/'03_feature_engineered.csv').exists() or (p/'results').exists():
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
VIS_DIR = ROOT/'results'/'eda_visualizations'
OUT_DIR = ROOT/'results'/'outputs'
df = pd.read_csv(OUT_DIR/'03_feature_engineered.csv')
TARGET = 'Depression'
print(df.shape)


X = df.drop(columns=[TARGET])
y = df[TARGET]
num_cols = X.select_dtypes(include=[np.number]).columns
df.head()

(27901, 125)


Unnamed: 0,id,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,...,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?,Pressure_Intensity,Sleep_Hours_Est,Is_Student,CGPAxStudySat
0,2,1,33.0,5.0,0.0,8.97,2.0,0.0,1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.666667,6.5,0,17.94
1,8,0,24.0,2.0,0.0,5.9,5.0,0.0,0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,6.5,0,29.5
2,26,1,31.0,3.0,0.0,7.03,5.0,0.0,0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,6.5,0,35.15
3,30,0,28.0,3.0,0.0,5.59,2.0,0.0,1,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.5,0,11.18
4,32,0,25.0,4.0,0.0,8.13,3.0,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.5,0,24.39


In [4]:
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X[num_cols])

pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_num_scaled)
X_pca_df = pd.DataFrame(X_pca, columns=[f'PCA_{i+1}' for i in range(X_pca.shape[1])])

In [5]:
df_pca = pd.concat([X.drop(columns=num_cols).reset_index(drop=True),
                    X_pca_df.reset_index(drop=True),
                    y.reset_index(drop=True)], axis=1)

df_pca.head(), X_pca_df.shape, pca.explained_variance_ratio_.sum()

(      PCA_1     PCA_2     PCA_3     PCA_4     PCA_5     PCA_6     PCA_7  \
 0  1.354889 -0.461946 -1.304167 -0.553253  1.339322  2.171098 -0.464770   
 1 -2.137472 -1.382699  0.934879  0.333345 -0.752611 -1.465197 -1.806167   
 2 -2.089977 -0.392260 -0.894180 -2.097222  0.607102  0.326287 -1.983398   
 3  1.022568 -1.523374  0.414855  1.907671 -0.342986 -1.364335 -1.877241   
 4  0.097937 -0.912120  1.411545  0.414433  0.397165  0.195281 -0.460175   
 
       PCA_8     PCA_9    PCA_10    PCA_11    PCA_12  Depression  
 0 -0.124972 -1.283599  1.268966  1.423577  0.920441           1  
 1 -1.050295 -1.165202 -0.167835  0.749534 -0.800566           0  
 2  0.312080  0.471597  1.269501  1.118669 -0.864279           0  
 3 -0.879892 -0.748270  0.375049  0.579325  1.195464           1  
 4 -0.782131 -2.592791  0.096868  0.476846  0.803571           0  ,
 (27901, 12),
 np.float64(0.9567872420950032))

In [6]:
plt.figure()
plt.plot(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_, marker='o')
plt.title('PCA explained variance ratio'); plt.xlabel('Component'); plt.ylabel('Variance Ratio')
plt.tight_layout(); plt.savefig(VIS_DIR/'13_pca_scree.png', dpi=150); plt.close()

In [7]:
plt.figure()
plt.plot(range(1, len(pca.explained_variance_ratio_)+1), np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.title('PCA cumulative explained variance'); plt.xlabel('Component'); plt.ylabel('Cumulative Variance')
plt.tight_layout(); plt.savefig(VIS_DIR/'14_pca_cumulative.png', dpi=150); plt.close()

In [8]:
if X_pca_df.shape[1] >= 2:
    plt.figure()
    plt.scatter(X_pca_df.iloc[:,0], X_pca_df.iloc[:,1], s=5, c=y, alpha=0.4)
    plt.title('PCA_1 vs PCA_2 (colored by target)')
    plt.xlabel('PCA_1'); plt.ylabel('PCA_2')
    plt.tight_layout(); plt.savefig(VIS_DIR/'15_pca_scatter.png', dpi=150); plt.close()

In [9]:
loadings = pd.Series(np.abs(pca.components_[0]), index=num_cols).sort_values(ascending=False).head(10)
ax = loadings.plot(kind='bar', title='Top |loadings| on PC1')
plt.tight_layout(); plt.savefig(VIS_DIR/'16_pca_top_loadings_pc1.png', dpi=150); plt.close()

In [10]:
out_path = OUT_DIR/'04_pca_reduced.csv'
df_pca.to_csv(out_path, index=False)
out_path

WindowsPath('D:/Depression_detector/results/outputs/04_pca_reduced.csv')