In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# --- 0. Define File Path ---
file_path = "/Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_FINAL_FEATURES_OPTIMIZED.csv"
TARGET_CONT_COL = 'ghq12_continuous_score'
TARGET_BIN_COL = 'ghq12_distress'
GHQ12_THRESHOLD = 4

# --- 1. Load and Clean Column Names ---

def make_columns_unique(df):
    """Makes column names unique by appending a counter to duplicates."""
    cols = df.columns
    seen = {}
    new_cols = []
    for item in cols:
        original_name = item
        count = seen.get(original_name, 0)
        if count == 0:
            new_cols.append(original_name)
        else:
            new_cols.append(f"{original_name}_{count}")
        seen[original_name] = count + 1
    df.columns = new_cols
    return df

try:
    df = pd.read_csv(file_path)
    df = make_columns_unique(df)
except FileNotFoundError:
    print(f"Error: File not found at the specified path: {file_path}")
    raise

# --- 2. Target Transformation and Filtering ---
# Drop rows where GHQ-12 score is missing or negative (inapplicable)
df = df.dropna(subset=[TARGET_CONT_COL]).copy()
df = df[df[TARGET_CONT_COL] >= 0].copy()

# Create the binary target variable
# 1: Distress (GHQ-12 >= 4)
# 0: No Distress (GHQ-12 < 4)
df[TARGET_BIN_COL] = (df[TARGET_CONT_COL] >= GHQ12_THRESHOLD).astype(int)

print(f"Target variable '{TARGET_BIN_COL}' created (Threshold: {GHQ12_THRESHOLD}).")
print("Target distribution:")
print(df[TARGET_BIN_COL].value_counts().to_markdown(numalign="left", stralign="left"))

# --- 3. Feature Preparation (Imputation and Encoding) ---

# Define columns to be dropped
drop_cols = ['pidp', 'mental_health_status', TARGET_CONT_COL, TARGET_BIN_COL, 'has_mh_issue']
X = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')
y = df[TARGET_BIN_COL]

# Identify column types
numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(include='object').columns

# Impute unemployment/job chance features with 0
unemp_impute_zero = ['last_unemployment_duration_months', 'subjective_job_chance_likely', 'unemployment_spells_count']
for col in unemp_impute_zero:
    if col in X.columns:
        X[col] = X[col].fillna(0)

# Impute other numerical features with the median
numerical_to_impute = [col for col in numerical_cols if col not in unemp_impute_zero]
for col in numerical_to_impute:
    X[col] = X[col].fillna(X[col].median())

# Impute categorical features with the mode
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

# Perform One-Hot Encoding on all categorical features
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


# --- 4. Train-Test Split (80/20) ---
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# --- 5. Define Feature Sets for Two Models ---

# Columns to look for in the encoded set:
# Demographics: n_age_dv, female, n_hhsize, n_nchild_dv, marstat_final_, race_grp_, is_urban
demographic_cols = [col for col in X_encoded.columns if any(c in col for c in 
    ['n_age_dv', 'female', 'n_hhsize', 'n_nchild_dv', 'marstat_final_', 'race_grp_', 'is_urban', 'financial_difficulty', 'has_disability'])]

# Core features: is_currently_unemployed, education_level
core_unemp_cols = [col for col in X_encoded.columns if 'is_currently_unemployed' == col]
core_edu_cols = [col for col in X_encoded.columns if 'education_level' in col]

# Model 1 Feature Set (Baseline)
M1_FEATURES = list(set(demographic_cols + core_unemp_cols + core_edu_cols))
M1_FEATURES = [f for f in M1_FEATURES if f in X_train_full.columns] # Filter out non-existent columns

X_train_M1 = X_train_full[M1_FEATURES]
X_test_M1 = X_test_full[M1_FEATURES]
M1_feature_count = len(M1_FEATURES)

# Model 2 Feature Set (Augmented - All Features)
M2_FEATURES = X_train_full.columns.tolist()
X_train_M2 = X_train_full
X_test_M2 = X_test_full
M2_feature_count = len(M2_FEATURES)

print(f"\nModel 1 (Baseline) feature count: {M1_feature_count}")
print(f"Model 2 (Augmented) feature count: {M2_feature_count}")

# --- 6. Feature Scaling (All Models) ---
scaler = StandardScaler()
X_train_M1_scaled = scaler.fit_transform(X_train_M1)
X_test_M1_scaled = scaler.transform(X_test_M1)

# Fit a new scaler instance for Model 2 (since the feature sets are different)
scaler = StandardScaler()
X_train_M2_scaled = scaler.fit_transform(X_train_M2)
X_test_M2_scaled = scaler.transform(X_test_M2)

# --- 7. Train and Evaluate Model 1 (Baseline) ---

# Using 'balanced' class weights to handle class imbalance
M1_model = LogisticRegression(
    random_state=42, 
    class_weight='balanced', 
    solver='liblinear',
    C=1.0, 
    max_iter=1000
)

print("\n--- Training Model 1 (Baseline: Core Unemp + Core Edu + Demographics) ---")
M1_model.fit(X_train_M1_scaled, y_train)

y_pred_M1 = M1_model.predict(X_test_M1_scaled)
y_proba_M1 = M1_model.predict_proba(X_test_M1_scaled)[:, 1]

roc_auc_M1 = roc_auc_score(y_test, y_proba_M1)

print(f"Model 1 ROC AUC Score: {roc_auc_M1:.4f}")
print("Classification Report M1:")
print(classification_report(y_test, y_pred_M1))

# --- 8. Train and Evaluate Model 2 (Augmented) ---

# Using 'balanced' class weights
M2_model = LogisticRegression(
    random_state=42, 
    class_weight='balanced', 
    solver='liblinear',
    C=1.0, 
    max_iter=1000
)

print("\n--- Training Model 2 (Augmented: All Features) ---")
M2_model.fit(X_train_M2_scaled, y_train)

y_pred_M2 = M2_model.predict(X_test_M2_scaled)
y_proba_M2 = M2_model.predict_proba(X_test_M2_scaled)[:, 1]

roc_auc_M2 = roc_auc_score(y_test, y_proba_M2)

print(f"Model 2 ROC AUC Score: {roc_auc_M2:.4f}")
print("Classification Report M2:")
print(classification_report(y_test, y_pred_M2))

# --- 9. Summary and Comparison ---

print("\n--- Model Comparison Summary (GHQ-12 Distress Classification) ---")

comparison_data = {
    'Model': ['Model 1 (Baseline)', 'Model 2 (Augmented)'],
    'Features': [M1_feature_count, M2_feature_count],
    'ROC AUC Score': [roc_auc_M1, roc_auc_M2]
}

comparison_df = pd.DataFrame(comparison_data)

print(comparison_df.to_markdown(index=False, numalign="left", stralign="left", floatfmt=".4f"))

# Detailed Analysis of Core Coefficients (Model 1)
coefficients_M1 = pd.Series(M1_model.coef_[0], index=X_train_M1.columns)
top_10_features_M1 = coefficients_M1.abs().sort_values(ascending=False).head(10).index.tolist()

print("\nTop 10 Most Influential Features (Coefficients) for Model 1:")
print(coefficients_M1.loc[top_10_features_M1].sort_values(ascending=False).to_markdown(numalign="left", stralign="left"))

Target variable 'ghq12_distress' created (Threshold: 4).
Target distribution:
| ghq12_distress   | count   |
|:-----------------|:--------|
| 0                | 26779   |
| 1                | 7335    |

Model 1 (Baseline) feature count: 48
Model 2 (Augmented) feature count: 251

--- Training Model 1 (Baseline: Core Unemp + Core Edu + Demographics) ---
Model 1 ROC AUC Score: 0.7040
Classification Report M1:
              precision    recall  f1-score   support

           0       0.86      0.69      0.77      5356
           1       0.35      0.59      0.44      1467

    accuracy                           0.67      6823
   macro avg       0.60      0.64      0.60      6823
weighted avg       0.75      0.67      0.70      6823


--- Training Model 2 (Augmented: All Features) ---
Model 2 ROC AUC Score: 0.7322
Classification Report M2:
              precision    recall  f1-score   support

           0       0.87      0.73      0.79      5356
           1       0.38      0.61      0.47   

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer

# --- 1. Load Data ---
# Load the full feature set (X_encoded) and the binary target (y) from the previous step's logic

file_path = "/Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_FINAL_FEATURES_OPTIMIZED.csv"
TARGET_CONT_COL = 'ghq12_continuous_score'
TARGET_BIN_COL = 'ghq12_distress'
GHQ12_THRESHOLD = 4

def make_columns_unique(df):
    cols = df.columns
    seen = {}
    new_cols = []
    for item in cols:
        original_name = item
        count = seen.get(original_name, 0)
        if count == 0:
            new_cols.append(original_name)
        else:
            new_cols.append(f"{original_name}_{count}")
        seen[original_name] = count + 1
    df.columns = new_cols
    return df

try:
    df = pd.read_csv(file_path)
    df = make_columns_unique(df)
except FileNotFoundError:
    raise

# Target Transformation and Filtering
df = df.dropna(subset=[TARGET_CONT_COL]).copy()
df = df[df[TARGET_CONT_COL] >= 0].copy()
df[TARGET_BIN_COL] = (df[TARGET_CONT_COL] >= GHQ12_THRESHOLD).astype(int)

# Feature Preparation (Full X_encoded)
drop_cols = ['pidp', 'mental_health_status', TARGET_CONT_COL, TARGET_BIN_COL, 'has_mh_issue']
X = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')
y = df[TARGET_BIN_COL]

numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(include='object').columns

unemp_impute_zero = ['last_unemployment_duration_months', 'subjective_job_chance_likely', 'unemployment_spells_count']
for col in unemp_impute_zero:
    if col in X.columns:
        X[col] = X[col].fillna(0)

numerical_to_impute = [col for col in numerical_cols if col not in unemp_impute_zero]
for col in numerical_to_impute:
    X[col] = X[col].fillna(X[col].median())

for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_full = X_encoded.copy()


# --- 2. Train-Test Split (Re-split for full consistency) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42, stratify=y
)

# --- 3. Feature Scaling ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)


# --- 4. Hyperparameter Tuning (Model 2: Augmented) ---

# Define the search space for the regularization parameter C (inverse of regularization strength)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'] # Test both L1 (LASSO) and L2 (Ridge)
}

# Initialize the model
log_reg_base = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    solver='liblinear', # Supports both l1 and l2
    max_iter=1000
)

# Define ROC AUC as the scoring metric
scorer = make_scorer(roc_auc_score)

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=log_reg_base,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,                 # 5-fold cross-validation
    verbose=1,
    n_jobs=-1             # Use all available cores
)

print("\nStarting GridSearchCV for Model 2 (Augmented) with 5-Fold CV...")
grid_search.fit(X_train_scaled, y_train)

# --- 5. Evaluate Best Model ---
best_model = grid_search.best_estimator_
y_proba_tuned = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_tuned = best_model.predict(X_test_scaled)

# Final Metrics
roc_auc_tuned = roc_auc_score(y_test, y_proba_tuned)

print("\n--- Hyperparameter Tuning Results (Model 2) ---")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV ROC AUC Score: {grid_search.best_score_:.4f}")
print(f"Test Set ROC AUC Score (Tuned Model): {roc_auc_tuned:.4f}")
print("\nClassification Report (Tuned Model):")
print(classification_report(y_test, y_pred_tuned))

# --- 6. Feature Importance (Best Model) ---
coefficients_tuned = pd.Series(best_model.coef_[0], index=X_train.columns)
# Filter for non-zero coefficients if L1 was chosen, or top 10 if L2
if best_model.penalty == 'l1':
    selected_features = coefficients_tuned[coefficients_tuned != 0]
    print(f"\nNumber of Features Selected by L1: {len(selected_features)} out of {X_train.shape[1]}")
    top_features = selected_features.abs().sort_values(ascending=False).head(10).index.tolist()
else:
    top_features = coefficients_tuned.abs().sort_values(ascending=False).head(10).index.tolist()

print(f"\nTop 10 Most Influential Features (Coefficients) for Tuned Model ({best_model.penalty}):")
print(coefficients_tuned.loc[top_features].sort_values(ascending=False).to_markdown(numalign="left", stralign="left"))


Starting GridSearchCV for Model 2 (Augmented) with 5-Fold CV...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

--- Hyperparameter Tuning Results (Model 2) ---
Best Parameters: {'C': 0.01, 'penalty': 'l1'}
Best CV ROC AUC Score: 0.6778
Test Set ROC AUC Score (Tuned Model): 0.7334

Classification Report (Tuned Model):
              precision    recall  f1-score   support

           0       0.87      0.72      0.79      5356
           1       0.38      0.61      0.47      1467

    accuracy                           0.70      6823
   macro avg       0.62      0.67      0.63      6823
weighted avg       0.77      0.70      0.72      6823


Number of Features Selected by L1: 94 out of 251

Top 10 Most Influential Features (Coefficients) for Tuned Model (l1):
|                         | 0          |
|:------------------------|:-----------|
| has_disability          | 0.43185    |
| financial_difficulty    | 0.337458   |
| finfut_2.0              | 0.273781   |
| female     

In [7]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np
from sklearn.model_selection import train_test_split

# 0. Define Constants 
FILE_PATH = "/Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_FINAL_FEATURES_OPTIMIZED.csv"
RANDOM_SEED = 42
GHQ12_THRESHOLD = 4
TARGET_CONT_COL = 'ghq12_continuous_score'
TARGET_BIN_COL = 'ghq12_distress'

# 1. Load and Prepare Data (Re-run preparation logic to ensure data integrity) 

def make_columns_unique(df):
    """Makes column names unique by appending a counter to duplicates."""
    cols = df.columns
    seen = {}
    new_cols = []
    for item in cols:
        original_name = item
        count = seen.get(original_name, 0)
        if count == 0:
            new_cols.append(original_name)
        else:
            new_cols.append(f"{original_name}_{count}")
        seen[original_name] = count + 1
    df.columns = new_cols
    return df

# Load Data
try:
    df = pd.read_csv(FILE_PATH)
    df = make_columns_unique(df)
except FileNotFoundError:
    print(f"Error: File not found at the specified path: {FILE_PATH}")
    raise

# Target Transformation and Filtering
df = df.dropna(subset=[TARGET_CONT_COL]).copy()
df = df[df[TARGET_CONT_COL] >= 0].copy()
df[TARGET_BIN_COL] = (df[TARGET_CONT_COL] >= GHQ12_THRESHOLD).astype(int)

# Feature Preparation (Full X_encoded)
drop_cols = ['pidp', 'mental_health_status', TARGET_CONT_COL, TARGET_BIN_COL, 'has_mh_issue']
X = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')
y = df[TARGET_BIN_COL]

numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(include='object').columns

# Imputation
unemp_impute_zero = ['last_unemployment_duration_months', 'subjective_job_chance_likely', 'unemployment_spells_count']
for col in unemp_impute_zero:
    if col in X.columns:
        X[col] = X[col].fillna(0)

numerical_to_impute = [col for col in numerical_cols if col not in unemp_impute_zero]
for col in numerical_to_impute:
    X[col] = X[col].fillna(X[col].median())

for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_full = X_encoded.copy()


# Train-Test Split (Augmented Feature Set - Model 2)
# Re-split using the correct target and random state
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)


# 2. Calculate Sample Weights for Class Imbalance ---

# Calculate weights based on the inverse frequency of each class in the training set
weight_minority = 1.0 / np.sum(y_train == 1)
weight_majority = 1.0 / np.sum(y_train == 0)

# Assign weights to each sample
sample_weights = np.where(y_train == 1, weight_minority, weight_majority)
# Normalize weights
sample_weights = sample_weights / sample_weights.sum() * len(y_train)


# 3. Train GradientBoostingClassifier 

gbm_clf = GradientBoostingClassifier(
    n_estimators=300,          # Increased 
    learning_rate=0.03,        # Slower 
    max_depth=4,               # Tree depth
    subsample=0.8,             # Subsampling
    random_state=RANDOM_SEED   
)

print("\nTraining Scikit-learn Gradient Boosting Classifier (with Sample Weights)...")
# Fit the model using sample weights to account for the imbalanced target
gbm_clf.fit(X_train, y_train, sample_weight=sample_weights)

# 4. Evaluate Model 
y_pred_gbm = gbm_clf.predict(X_test)
y_proba_gbm = gbm_clf.predict_proba(X_test)[:, 1]

print("\n--- Model Evaluation: GradientBoostingClassifier ---")
print("Classification Report:")
print(classification_report(y_test, y_pred_gbm))

# ROC AUC Score
roc_auc_gbm = roc_auc_score(y_test, y_proba_gbm)
print(f"ROC AUC Score: {roc_auc_gbm:.4f}")

# Confusion Matrix
cm_gbm = confusion_matrix(y_test, y_pred_gbm)
print("\nConfusion Matrix:")
print(pd.DataFrame(cm_gbm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']).to_markdown(numalign="left", stralign="left"))

# 5. Feature Importance 
feature_importance_gbm = pd.Series(gbm_clf.feature_importances_, index=X_train.columns)

# Select the top 10 most important features
top_10_features_gbm = feature_importance_gbm.sort_values(ascending=False).head(10)

print("\nTop 10 Most Important Features (GBM Importance):")
print(top_10_features_gbm.to_markdown(numalign="left", stralign="left"))


Training Scikit-learn Gradient Boosting Classifier (with Sample Weights)...

--- Model Evaluation: GradientBoostingClassifier ---
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.72      0.79      5356
           1       0.38      0.62      0.47      1467

    accuracy                           0.70      6823
   macro avg       0.62      0.67      0.63      6823
weighted avg       0.77      0.70      0.72      6823

ROC AUC Score: 0.7333

Confusion Matrix:
|          | Predicted 0   | Predicted 1   |
|:---------|:--------------|:--------------|
| Actual 0 | 3831          | 1525          |
| Actual 1 | 552           | 915           |

Top 10 Most Important Features (GBM Importance):
|                      | 0         |
|:---------------------|:----------|
| financial_difficulty | 0.274773  |
| has_disability       | 0.191956  |
| n_age_dv             | 0.14876   |
| female               | 0.0438439 |
| actively_seeking     | 0