In [5]:
import os
import pandas as pd
import numpy as np
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

if 'notebooks' in os.getcwd():
    os.chdir('..')

 
print("LOADING RAW DATA & QUICK PREPROCESSING")
 

# Load raw data
try:
    df = pd.read_csv('data/raw/student_depression_dataset.csv')
    print(f" Loaded: student_depression_dataset.csv")
except:
    df = pd.read_csv('data/raw/mental-heath-in-tech-2016_20161114.csv')
    print(f" Loaded: mental-heath-in-tech-2016_20161114.csv")

print(f"Data shape: {df.shape}")

#data claning
print("\n[1/4] Quick Data Cleaning...")

# Remove rows with missing target
target_col = None
for col in ['depression', 'mental_health', 'status']:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    target_col = df.columns[-1]  # Use last column as target

print(f"Target column: {target_col}")

# Drop rows with missing target
df = df[df[target_col].notna()].copy()

# Handle missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype in ['float64', 'int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown', inplace=True)

#label encoding
from sklearn.preprocessing import LabelEncoder
for col in df.columns:
    if df[col].dtype == 'object' and col != target_col:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

print(" Data cleaned")

#feature engineering
print("\n[2/4] Feature Engineering...")

# Get numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

# Create a few interaction features
if len(numeric_cols) >= 2:
    df[f'{numeric_cols[0]}_x_{numeric_cols[1]}'] = df[numeric_cols[0]] * df[numeric_cols[1]]

print(f" Created features | Total: {df.shape[1]}")

#selecting features
print("\n[3/4] Feature Selection...")

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(target_col, axis=1)
y = df[target_col]

#binary conversion(according to the needs)
if y.dtype == 'object':
    y = (y.astype(str).str.lower().isin(['yes', 'true', '1'])).astype(int)
elif y.max() > 1:
    y = (y > y.median()).astype(int)

print(f"Target distribution: {y.value_counts().to_dict()}")

#quick feature importance
rf_selector = RandomForestClassifier(n_estimators=10, max_depth=5, n_jobs=-1, random_state=42)
rf_selector.fit(X, y)

importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_selector.feature_importances_
}).nlargest(15, 'importance')

selected_features = importance['feature'].tolist()
X_selected = X[selected_features]

print(f" Selected {len(selected_features)} features")

#data split and scale
print("\n[4/4] Training Models...")

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#train models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score

models_dict = {
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=20, max_depth=8, n_jobs=-1, random_state=42)
}

results = {}
trained_models = {}

for name, model in models_dict.items():
    print(f"  Training {name}...", end=" ")
    
    if name == 'Logistic Regression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    results[name] = {
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba)
    }
    trained_models[name] = model
    print(f"F1={results[name]['F1-Score']:.4f} ‚úì")

#results
print("\n" + "="*70)
results_df = pd.DataFrame(results).T.sort_values('F1-Score', ascending=False)
print(results_df)

best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

print("\n" + "="*70)
print(f" BEST MODEL: {best_model_name}")
print(f"  F1-Score: {results_df.iloc[0]['F1-Score']:.4f}")
print(f"  Accuracy: {results_df.iloc[0]['Accuracy']:.4f}")
print("="*70)

#saving
print("\nSaving models...")
joblib.dump(best_model, 'models/model_best.pkl')
joblib.dump(scaler, 'models/scaler.pkl')
with open('models/selected_features.json', 'w') as f:
    json.dump(selected_features, f)

results_df.to_csv('docs/model_comparison.csv')

print(" Model saved to: models/model_best.pkl")
print(" Scaler saved to: models/scaler.pkl")
print(" Features saved to: models/selected_features.json")
 


LOADING RAW DATA & QUICK PREPROCESSING
 Loaded: student_depression_dataset.csv
Data shape: (27901, 18)

[1/4] Quick Data Cleaning...
Target column: Depression
 Data cleaned

[2/4] Feature Engineering...
 Created features | Total: 19

[3/4] Feature Selection...
Target distribution: {1: 16336, 0: 11565}
 Selected 15 features

[4/4] Training Models...
  Training Logistic Regression... F1=0.8683 ‚úì
  Training Random Forest... F1=0.8618 ‚úì

                     F1-Score  Accuracy  Precision    Recall   ROC-AUC
Logistic Regression  0.868306  0.843576   0.856293  0.880661  0.917548
Random Forest        0.861849  0.833184   0.836646  0.888617  0.911815

 BEST MODEL: Logistic Regression
  F1-Score: 0.8683
  Accuracy: 0.8436

Saving models...
 Model saved to: models/model_best.pkl
 Scaler saved to: models/scaler.pkl
 Features saved to: models/selected_features.json


In [8]:
import os
import pandas as pd
import numpy as np
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

if 'notebooks' in os.getcwd():
    os.chdir('..')

print("="*70)
print("MODEL TRAINING WITH BIAS CORRECTION")
print("="*70)


print("\n[1/5] Loading and Cleaning Data...")

# Load raw data
try:
    df = pd.read_csv('data/raw/student_depression_dataset.csv')
    print(f"‚úì Loaded: student_depression_dataset.csv")
except:
    df = pd.read_csv('data/raw/mental-heath-in-tech-2016_20161114.csv')
    print(f"‚úì Loaded: mental-heath-in-tech-2016_20161114.csv")

print(f"Data shape: {df.shape}")

# Remove rows with missing target
target_col = None
for col in ['depression', 'mental_health', 'status']:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    target_col = df.columns[-1]

print(f"Target column: {target_col}")

# Drop rows with missing target
df = df[df[target_col].notna()].copy()

# Handle missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype in ['float64', 'int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown', inplace=True)

# Label encoding
from sklearn.preprocessing import LabelEncoder
for col in df.columns:
    if df[col].dtype == 'object' and col != target_col:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

print("‚úì Data cleaned")


print("\n[2/5] Feature Engineering...")


numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)


if len(numeric_cols) >= 2:
    df[f'{numeric_cols[0]}_x_{numeric_cols[1]}'] = df[numeric_cols[0]] * df[numeric_cols[1]]

print(f"‚úì Created features | Total: {df.shape[1]}")


print("\n[3/5] Feature Selection...")

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(target_col, axis=1)
y = df[target_col]

# Binary conversion
if y.dtype == 'object':
    y = (y.astype(str).str.lower().isin(['yes', 'true', '1'])).astype(int)
elif y.max() > 1:
    y = (y > y.median()).astype(int)

print(f"Target distribution: {y.value_counts().to_dict()}")


print("\n[4/5] CLASS IMBALANCE ANALYSIS & CORRECTION...")

class_0_count = (y == 0).sum()
class_1_count = (y == 1).sum()
imbalance_ratio = class_1_count / class_0_count

print(f"\n    Original Distribution:")
print(f"      Class 0 (Not Depressed): {class_0_count} ({class_0_count/len(y)*100:.1f}%)")
print(f"      Class 1 (Depressed):     {class_1_count} ({class_1_count/len(y)*100:.1f}%)")
print(f"      Imbalance Ratio: {imbalance_ratio:.2f}:1")

if imbalance_ratio > 1.2:
    print(f"\n     IMBALANCE DETECTED - Applying SMOTE...")
    from imblearn.over_sampling import SMOTE
    
    
    smote = SMOTE(random_state=42, k_neighbors=5)
    X_balanced, y_balanced = smote.fit_resample(X, y)
    
    print(f"\n   ‚úì After SMOTE:")
    print(f"      Class 0: {(y_balanced == 0).sum()}")
    print(f"      Class 1: {(y_balanced == 1).sum()}")
    print(f"      Total samples: {len(y_balanced)}")
    
    
    X = X_balanced
    y = y_balanced
else:
    print(f"\n    Dataset is balanced - No SMOTE needed")


rf_selector = RandomForestClassifier(n_estimators=10, max_depth=5, n_jobs=-1, random_state=42)
rf_selector.fit(X, y)

importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_selector.feature_importances_
}).nlargest(15, 'importance')

selected_features = importance['feature'].tolist()
X_selected = X[selected_features]

print(f"\n   ‚úì Selected {len(selected_features)} features")


print("\n[5/5] Training Models with Bias Corrections...")

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report


models_dict = {
    'Logistic Regression (Debiased)': LogisticRegression(
        max_iter=1000, 
        class_weight='balanced',  
        n_jobs=-1, 
        random_state=42
    ),
    'Random Forest (Debiased)': RandomForestClassifier(
        n_estimators=20, 
        max_depth=8, 
        class_weight='balanced',  
        n_jobs=-1, 
        random_state=42
    )
}

results = {}
trained_models = {}

for name, model in models_dict.items():
    print(f"\n   Training {name}...", end=" ")
    
    if 'Logistic' in name:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    
   
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)  
    
    results[name] = {
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'Specificity': specificity,
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
        'Bias Gap': abs(sensitivity - specificity)
    }
    trained_models[name] = model
    print(f"F1={results[name]['F1-Score']:.4f} | Bias Gap={results[name]['Bias Gap']:.4f} ‚úì")


print("\n" + "="*70)
print("MODEL COMPARISON (WITH BIAS ANALYSIS)")
print("="*70)

results_df = pd.DataFrame(results).T.sort_values('F1-Score', ascending=False)
print(results_df)


best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

print("\n" + "="*70)
print(f"‚úì BEST MODEL: {best_model_name}")
print(f"  F1-Score: {results_df.iloc[0]['F1-Score']:.4f}")
print(f"  Accuracy: {results_df.iloc[0]['Accuracy']:.4f}")
print(f"  Recall: {results_df.iloc[0]['Recall']:.4f} (Sensitivity)")
print(f"  Specificity: {results_df.iloc[0]['Specificity']:.4f}")
print(f"  Bias Gap: {results_df.iloc[0]['Bias Gap']:.4f} ‚Üê Should be < 0.1")
print("="*70)


print("\n" + "="*70)
print("BIAS VERIFICATION TEST")
print("="*70)


healthy_person = {feature: 10 for feature in selected_features}
healthy_df = pd.DataFrame([healthy_person])

if 'Logistic' in best_model_name:
    healthy_scaled = scaler.transform(healthy_df)
    healthy_pred = best_model.predict(healthy_scaled)[0]
else:
    healthy_pred = best_model.predict(healthy_df)[0]

print(f"\nTest: Healthy person with all low mental health stress indicators")
print(f"Model Prediction: {'üö® DEPRESSED' if healthy_pred == 1 else '‚úÖ NOT DEPRESSED'}")

if healthy_pred == 1:
    print("  WARNING: Model still shows bias for healthy individuals")
else:
    print(" PASSED: Model correctly identifies healthy individuals")


print("\n" + "="*70)
print("SAVING MODELS...")
print("="*70)

joblib.dump(best_model, 'models/model_best.pkl')
joblib.dump(scaler, 'models/scaler.pkl')
with open('models/selected_features.json', 'w') as f:
    json.dump(selected_features, f)

results_df.to_csv('docs/model_comparison.csv')

print("‚úì Model saved to: models/model_best.pkl")
print("‚úì Scaler saved to: models/scaler.pkl")
print("‚úì Features saved to: models/selected_features.json")
print("‚úì Results saved to: docs/model_comparison.csv")

print("\n" + "="*70)
print(" DEBIASED MODEL TRAINING COMPLETE!")
print("="*70)
print(f"\nKey Improvements:")
print(f"  1.  Applied SMOTE for class balance")
print(f"  2.  Used class_weight='balanced' in models")
print(f"  3.  Calculated Sensitivity vs Specificity (Bias Gap < 0.1)")
print(f"  4.  Tested on healthy profiles")
print(f"  5.  Ready for production deployment")


MODEL TRAINING WITH BIAS CORRECTION

[1/5] Loading and Cleaning Data...
‚úì Loaded: student_depression_dataset.csv
Data shape: (27901, 18)
Target column: Depression
‚úì Data cleaned

[2/5] Feature Engineering...
‚úì Created features | Total: 19

[3/5] Feature Selection...
Target distribution: {1: 16336, 0: 11565}

[4/5] CLASS IMBALANCE ANALYSIS & CORRECTION...

    Original Distribution:
      Class 0 (Not Depressed): 11565 (41.5%)
      Class 1 (Depressed):     16336 (58.5%)
      Imbalance Ratio: 1.41:1

     IMBALANCE DETECTED - Applying SMOTE...

   ‚úì After SMOTE:
      Class 0: 16336
      Class 1: 16336
      Total samples: 32672

   ‚úì Selected 15 features

[5/5] Training Models with Bias Corrections...

   Training Logistic Regression (Debiased)... F1=0.8622 | Bias Gap=0.0143 ‚úì

   Training Random Forest (Debiased)... F1=0.8639 | Bias Gap=0.0373 ‚úì

MODEL COMPARISON (WITH BIAS ANALYSIS)
                                F1-Score  Accuracy  Precision    Recall  \
Random Fore