In [2]:
!pip install catboost
!pip install lightgbm
!pip install xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   -- ------------------------------------- 6.3/102.4 MB 38.6 MB/s eta 0:00:03
   ---- ----------------------------------- 10.5/102.4 MB 28.5 MB/s eta 0:00:04
   ------ --------------------------------- 16.5/102.4 MB 28.9 MB/s eta 0:00:03
   ------- -------------------------------- 19.7/102.4 MB 25.4 MB/s eta 0:00:04
   -------- ------------------------------- 22.8/102.4 MB 22.9 MB/s eta 0:00:04
   --------- ------------------------------ 24.9/102.4 MB 21.0 MB/s eta 0:00:04
   ---------- ----------------------------- 26.2/102.4 MB 18.9 MB/s eta 0:00:05
   ---------- ----------------------------- 27.0/102.4 MB 16.9 MB/s eta 0:00:05
   ---------- ---------------------


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 12.8 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting xgboost
  Downloading xgboost-3.0.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.3-py3-none-win_amd64.whl (149.9 MB)
   ---------------------------------------- 0.0/149.9 MB ? eta -:--:--
   - -------------------------------------- 3.9/149.9 MB 23.4 MB/s eta 0:00:07
   -- ------------------------------------- 8.4/149.9 MB 21.7 MB/s eta 0:00:07
   --- ------------------------------------ 11.8/149.9 MB 19.9 MB/s eta 0:00:07
   --- ------------------------------------ 13.9/149.9 MB 17.4 MB/s eta 0:00:08
   ---- ----------------------------------- 15.2/149.9 MB 15.7 MB/s eta 0:00:09
   ---- ----------------------------------- 17.0/149.9 MB 14.1 MB/s eta 0:00:10
   ---- ----------------------------------- 18.1/149.9 MB 13.0 MB/s eta 0:00:11
   ----- ---------------------------------- 18.9/149.9 MB 12.0 MB/s eta 0:00:11
   ----- ---------------------------------- 19.9/149.9 MB 11.2 MB/s eta 0:00:12
   ----- ---------------------------------- 21.0/149.9 MB 10.4 


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import json
import warnings
warnings.filterwarnings("ignore")

KeyboardInterrupt: 

In [None]:
# 2. LOAD DATA
# Column labels are supplied via `names=` below, so they are listed here in
# the order the fields appear in each row of the file.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income"
]

# skipinitialspace=True discards the spaces that follow each delimiter
df = pd.read_csv("./Dataset/adult.data", names=column_names, skipinitialspace=True)

# Preview the first rows. A bare last expression is rendered as a rich table
# by the notebook, which is easier to read than print()'s plain-text dump.
df.head()

In [None]:
# 3. DATA CLEANING + ENCODING
cat_cols = df.select_dtypes(include=['object']).columns

# Trim whitespace BEFORE looking for '?' so that padded markers (e.g. '? ')
# are recognised as missing too. Column-wise .str.strip() is used instead of
# DataFrame.applymap, which is deprecated since pandas 2.1.
for col in cat_cols:
    df[col] = df[col].str.strip()

# Treat '?' as a missing value and drop every row that contains one.
# Re-binding `df` (no inplace=True) keeps this cell safe to re-run.
df = df.replace('?', np.nan).dropna()

# Label encode categorical columns. Every column gets its own encoder, stored
# in `label_encoders`, so the integer codes can be translated back later with
# label_encoders[col].inverse_transform(...). After the loop `le` still refers
# to the encoder of the last categorical column, as it did before.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# 4. SPLIT DATA
X = df.drop("income", axis=1)
y = df["income"]

# Split BEFORE scaling: the scaler must only learn its mean/std from the
# training rows, otherwise statistics of the test set leak into training and
# the reported test metrics are optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training data only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

In [None]:
# 5. MODEL FUNCTION
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted classifier on the test set and record the metrics.

    Parameters
    ----------
    name : key under which the metrics are stored in the module-level
        ``results`` dictionary (also echoed in the progress message).
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : ground-truth labels compared against the predictions.

    Side effects: writes ``results[name]`` (``results`` must already exist in
    the notebook namespace) and prints a one-line completion message.
    Returns nothing.
    """
    predictions = model.predict(X_test)

    # Metric label -> scoring function, evaluated in this order
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    results[name] = {label: scorer(y_test, predictions) for label, scorer in scorers}

    print(f"{name} done.")

In [None]:
# 6. INIT RESULTS DICT
# Shared store that evaluate_model() fills in: model name -> metrics dict.
# Re-running this cell clears any metrics collected so far.
results = dict()

In [None]:
# 7. MODELS
# Seed for the estimators that use randomness, so a Restart & Run All
# reproduces the same scores (same value as the train/test split seed).
RANDOM_STATE = 42

# Instantiate every classifier under its own name so it stays available for
# later inspection, then fit/evaluate them all in a single loop below.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
knn = KNeighborsClassifier()
nb = GaussianNB()
svm = SVC()
# `use_label_encoder` is intentionally not passed: it is a deprecated option
# that recent XGBoost releases no longer use and only warn about.
xg = xgb.XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)
lg = lgb.LGBMClassifier(random_state=RANDOM_STATE)
cb = CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE)

# Display name -> estimator; dict order is the training/evaluation order
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
    "KNN": knn,
    "Naive Bayes": nb,
    "SVM": svm,
    "XGBoost": xg,
    "LightGBM": lg,
    "CatBoost": cb,
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model_name, model, X_test, y_test)

In [None]:
# 8. ANN Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import set_random_seed
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling repeat from run to run
set_random_seed(42)

# Build the model: two hidden ReLU layers and a single sigmoid output unit.
# An explicit Input layer declares the feature count (replaces the older
# `input_dim=` argument on the first Dense layer).
ann = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile the model
ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
ann.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Predict and convert probabilities to binary predictions (threshold 0.5)
y_pred_ann = (ann.predict(X_test) > 0.5).astype("int32").flatten()

# Normalise the label dtype to int32 to match the predictions
# (presumably the labels are already integer codes from the encoding step)
y_test_int = y_test.astype("int32")


def evaluate_ann(y_true, y_pred):
    """Print the ANN's test metrics and record them in ``results["ANN"]``.

    evaluate_model() cannot be used here because the network's predict()
    yields probabilities rather than class labels, so the thresholded
    predictions are scored directly. The metrics are stored under the same
    keys evaluate_model() uses, so the ANN appears in the saved JSON and in
    the comparison table alongside the other models.

    Parameters
    ----------
    y_true : ground-truth binary labels.
    y_pred : predicted binary labels.
    """
    # float() keeps the stored values plain Python numbers for json.dump
    metrics = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1_score": float(f1_score(y_true, y_pred)),
    }
    results["ANN"] = metrics

    print("Model: ANN")
    print("Accuracy:", metrics["accuracy"])
    print("Precision:", metrics["precision"])
    print("Recall:", metrics["recall"])
    print("F1 Score:", metrics["f1_score"])
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


# Evaluate ANN
evaluate_ann(y_test_int, y_pred_ann)

In [None]:
import os
import json

# 9. SAVE RESULTS

# Create the output folder if needed; an already existing folder is fine
os.makedirs("tabular", exist_ok=True)

# Write the collected metrics as indented JSON
with open("tabular/tabular_results.json", "w") as out_file:
    json.dump(results, out_file, indent=4)

# 10. SHOW RESULTS

# One row per model, one column per metric, highest accuracy first
results_df = pd.DataFrame(results).T.sort_values(by="accuracy", ascending=False)
results_df