In [13]:
import pandas as pd

# Load cleaned dataset
df = pd.read_csv('adult_cleaned.csv')

# Create binary target
df['income_numeric'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

In [14]:
from sklearn.model_selection import train_test_split

# Keep sensitive attributes separately for later fairness checks
original_data = df[['sex', 'race', 'income_numeric']].copy()

# Prepare features (exclude target & sensitive attributes)
X = pd.get_dummies(
    df.drop(columns=['income', 'income_numeric', 'sex', 'race']),
    drop_first=True
)
y = df['income_numeric']

# Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = {
    'Logistic Regression': LogisticRegression(
        solver='liblinear', max_iter=1000, random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, class_weight='balanced',
        random_state=42, n_jobs=-1
    )
}

for model in models.values():
    model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

print('--- Model Performance Results ---')

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f'\n{name}')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_proba))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1 Score:', f1_score(y_test, y_pred))

--- Model Performance Results ---

Logistic Regression
Confusion Matrix: [[6563  227]
 [1647  605]]
AUC: 0.5823335565571562
Precision: 0.7271634615384616
Recall: 0.26865008880994673
F1 Score: 0.39234760051880674

Random Forest
Confusion Matrix: [[6328  462]
 [ 861 1391]]
AUC: 0.9025287618663954
Precision: 0.7506745817593092
Recall: 0.6176731793960923
F1 Score: 0.6777101096224117
