In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch Adult dataset from UCI
adult = fetch_ucirepo(id=2)

# Get features (X) and target (y)
X = adult.data.features
y = adult.data.targets

# Combine into a single DataFrame
df = pd.concat([X, y], axis=1)

# Make sure column names are lower case and consistent
df.columns = [c.lower().strip() for c in df.columns]

# ---- CLEAN INCOME COLUMN FIRST ----
# Normalize income strings: remove trailing periods, strip whitespace
df["income"] = (
    df["income"]
    .astype(str)
    .str.replace(".", "", regex=False)   # e.g. '>50K.' -> '>50K'
    .str.strip()                         # <---- FIXED
)

# Drop rows with invalid / missing income
df = df[df["income"].notna()]
df = df[df["income"] != ""]
df = df[df["income"] != "?"]

# Encode income target â†’ >50K = 1, <=50K = 0
df["income"] = df["income"].map({">50K": 1, "<=50K": 0})

print("Remaining NA in income:", df["income"].isna().sum())


# ---- OTHER FEATURE ENGINEERING ----

# Simplify marital status â†’ married vs not married
df["married"] = df["marital-status"].apply(
    lambda x: 1 if isinstance(x, str) and "married" in x.lower() else 0
)

# Encode sex â†’ male = 1, female = 0
df["sex"] = df["sex"].map({"Male": 1, "Female": 0})

# Keep only the minimal, interpretable features for counterfactuals
df_simplified = df[
    [
        "age",
        "education-num",
        "hours-per-week",
        "capital-gain",
        "capital-loss",
        "sex",
        "married",
        "income",
    ]
]

# Print preview + shape
print(df_simplified.head())
print("Shape:", df_simplified.shape)

# Save to CSV (recommended for your project)
df_simplified.to_csv("adult_simplified.csv", index=False)
print("Saved cleaned dataset as adult_simplified.csv")


Remaining NA in income: 0
   age  education-num  hours-per-week  capital-gain  capital-loss  sex  \
0   39             13              40          2174             0    1   
1   50             13              13             0             0    1   
2   38              9              40             0             0    1   
3   53              7              40             0             0    1   
4   28             13              40             0             0    0   

   married  income  
0        1       0  
1        1       0  
2        0       0  
3        1       0  
4        1       0  
Shape: (48842, 8)
Saved cleaned dataset as adult_simplified.csv


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


#split data into training and test sets 
X = df_simplified.drop("income", axis=1)
y = df_simplified["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train a random forect classifier with default skicit parameters 
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# evaluate the model
accuracy = rf.score(X_test, y_test)
print(f"Random Forest Classifier Accuracy: {accuracy:.2f}")
AUC = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
print(f"Random Forest Classifier AUC: {AUC:.2f}")

Random Forest Classifier Accuracy: 0.83
Random Forest Classifier AUC: 0.86


In [5]:
# save the model in a pickle file
import pickle
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(rf, f)

#save training and test sets as csv files
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)