In [1]:
!pip install catboost
!pip install lightgbm
!pip install xgboost




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import json
import warnings
warnings.filterwarnings("ignore")

In [3]:
# 2. LOAD DATA
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income"
]

# Read the data and skip initial spaces
df = pd.read_csv("./Dataset/adult.data", names=column_names, skipinitialspace=True)

# Display the first few rows
print(df.head())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [4]:
# 3. DATA CLEANING + ENCODING
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Remove whitespace
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Label encode categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [5]:
# 4. SPLIT DATA
X = df.drop("income", axis=1)
y = df["income"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [6]:
# 5. MODEL FUNCTION
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[name] = {
        "accuracy": acc,
        "precision": pre,
        "recall": rec,
        "f1_score": f1
    }
    print(f"{name} done.")

In [7]:
# 6. INIT RESULTS DICT
results = {}

In [8]:
# 7. MODELS
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
evaluate_model("Logistic Regression", lr, X_test, y_test)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
evaluate_model("Decision Tree", dt, X_test, y_test)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
evaluate_model("Random Forest", rf, X_test, y_test)

# KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
evaluate_model("KNN", knn, X_test, y_test)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
evaluate_model("Naive Bayes", nb, X_test, y_test)

# SVM
svm = SVC()
svm.fit(X_train, y_train)
evaluate_model("SVM", svm, X_test, y_test)

# XGBoost
xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xg.fit(X_train, y_train)
evaluate_model("XGBoost", xg, X_test, y_test)

# LightGBM
lg = lgb.LGBMClassifier()
lg.fit(X_train, y_train)
evaluate_model("LightGBM", lg, X_test, y_test)

# CatBoost
cb = CatBoostClassifier(verbose=0)
cb.fit(X_train, y_train)
evaluate_model("CatBoost", cb, X_test, y_test)

Logistic Regression done.
Decision Tree done.
Random Forest done.
KNN done.
Naive Bayes done.
SVM done.
XGBoost done.
[LightGBM] [Info] Number of positive: 5978, number of negative: 18151
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 24129, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247752 -> initscore=-1.110640
[LightGBM] [Info] Start training from score -1.110640
LightGBM done.
CatBoost done.


In [9]:
# 8. ANN Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the model
ann = Sequential()
ann.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
ann.add(Dense(32, activation='relu'))
ann.add(Dense(1, activation='sigmoid'))

# Compile the model
ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
ann.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Predict and convert probabilities to binary predictions
y_pred_ann = (ann.predict(X_test) > 0.5).astype("int32").flatten()

# Convert y_test to integer if it's in string/object format
y_test_int = y_test.astype("int32")

# Evaluate the model manually (since evaluate_model() expects sklearn-style models)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_ann(y_true, y_pred):
    print("Model: ANN")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Evaluate ANN
evaluate_ann(y_test_int, y_pred_ann)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Model: ANN
Accuracy: 0.8518150174042765
Precision: 0.7438650306748467
Recall: 0.6339869281045751
F1 Score: 0.68454481298518
Confusion Matrix:
 [[4169  334]
 [ 560  970]]


In [10]:
import os
import json

# 9. SAVE RESULTS

# Ensure the "tabular" directory exists
os.makedirs("tabular", exist_ok=True)

# Save results to JSON file
with open("tabular/tabular_results.json", "w") as f:
    json.dump(results, f, indent=4)

# 10. SHOW RESULTS

# Convert the results dictionary to a DataFrame and display it
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by="accuracy", ascending=False)
results_df

Unnamed: 0,accuracy,precision,recall,f1_score
LightGBM,0.873529,0.79342,0.677778,0.731054
CatBoost,0.873529,0.791192,0.681046,0.731999
XGBoost,0.868391,0.772997,0.681046,0.724114
Random Forest,0.854301,0.748284,0.641176,0.690602
SVM,0.847174,0.771914,0.564052,0.651813
KNN,0.825791,0.676753,0.599346,0.635702
Logistic Regression,0.823139,0.744456,0.460784,0.569237
Decision Tree,0.808387,0.622543,0.620915,0.621728
Naive Bayes,0.798442,0.709893,0.347059,0.466198
