In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

In [129]:
df = pd.read_csv('data/data_cleaned.csv', na_values=['NA'])
numeric_df = df.select_dtypes(include=['number'])
correlations = numeric_df.corr()['treatment'].sort_values(ascending=False)
print(correlations)


treatment                    1.000000
family_history               0.376674
work_interfere               0.301096
care_options                 0.272083
obs_consequence              0.152524
leave                        0.141701
mental_health_consequence    0.093062
seek_help                    0.085215
wellness_program             0.080151
Age                          0.073627
coworkers                    0.057944
benefits                     0.029501
phys_health_consequence      0.028123
remote_work                  0.026507
mental_health_interview      0.020537
self_employed                0.016442
no_employees                 0.013811
mental_vs_physical           0.006254
supervisor                  -0.024070
tech_company                -0.032521
anonymity                   -0.033870
Gender                      -0.186721
Name: treatment, dtype: float64


In [133]:
cols = [
    'Gender', 'self_employed', 'family_history', 'treatment',
    'remote_work', 'tech_company', 'benefits', 'care_options',
    'wellness_program', 'seek_help', 'anonymity',
    'mental_health_consequence', 'phys_health_consequence',
    'coworkers', 'supervisor', 'mental_health_interview', 'mental_vs_physical',
    'obs_consequence'
]
df_bin = df[cols].fillna(0).astype(int)

frequent_itemsets = apriori(df_bin, min_support=0.1, use_colnames=True)

rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.6)

treatment_rules = rules[rules['consequents'].apply(lambda x: 'treatment' in x)]
treatment_rules = treatment_rules.sort_values(by='lift', ascending=False)
print("Top treatment rules:\n", treatment_rules[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    ].head())

for i, rule in enumerate(treatment_rules['antecedents'].head(5)):
    feature_name = f'rule_{i+1}'
    cols_in_rule = list(rule)  # convert frozenset to list
    df[feature_name] = df_bin[cols_in_rule].all(axis=1).astype(int)


rule_features = [f'rule_{i+1}' for i in range(5)]

Top treatment rules:
                                     antecedents  \
1298  (care_options, anonymity, family_history)   
727              (care_options, family_history)   
712              (care_options, family_history)   
1301             (care_options, family_history)   
1299   (care_options, family_history, benefits)   

                           consequents   support  confidence      lift  
1298             (treatment, benefits)  0.101519    0.658031  1.817212  
727            (supervisor, treatment)  0.102318    0.618357  1.807395  
712              (treatment, benefits)  0.106315    0.642512  1.774355  
1301  (anonymity, treatment, benefits)  0.101519    0.613527  1.748341  
1299            (anonymity, treatment)  0.101519    0.808917  1.700765  




In [159]:

df['work_leave_interaction'] = df['work_interfere'] * df['leave']

df['family_work_interaction'] = df['family_history'] * df['work_interfere']

top_features = [
    'work_interfere', 'leave', 'no_employees',
    'family_history', 'care_options', 'obs_consequence',
    'work_leave_interaction', 'family_work_interaction'
] + rule_features

X = df[top_features]
y = df['treatment']

In [164]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7529880478087649
ROC-AUC: 0.7531432562865126
Confusion Matrix:
 [[95 29]
 [33 94]]


In [220]:
rf = RandomForestClassifier(
    n_estimators=10,     
    max_depth=11,        
    random_state=42,
    min_samples_split=10,
    min_samples_leaf=3,
    class_weight='balanced'  
)

# Fit the model
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Random Forest Accuracy: 0.7928286852589641
ROC-AUC: 0.8399161798323597
Confusion Matrix:
 [[100  24]
 [ 28  99]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.78      0.81      0.79       124
         1.0       0.80      0.78      0.79       127

    accuracy                           0.79       251
   macro avg       0.79      0.79      0.79       251
weighted avg       0.79      0.79      0.79       251

