In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import joblib


In [3]:
# Load the dataset from the working directory and list its column names.
# A bare `df.head()` placed before the print was evaluated and discarded --
# a notebook only renders a cell's final expression -- so it has been removed.
df = pd.read_csv("adult 3.csv")
print(df.columns.tolist())

['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']


In [4]:
# Keep only the four model inputs plus the `income` target column.
# The explicit .copy() gives `df` its own data, so the per-column assignments
# made while label-encoding do not trigger pandas' SettingWithCopyWarning.
df = df[['education', 'occupation', 'gender', 'hours-per-week', 'income']].copy()

In [5]:
# Columns whose values are replaced by integer codes.
categorical_cols = ['education', 'occupation', 'gender']

# One fresh encoder per categorical column, keyed by column name.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column and overwrite the column with the codes.
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

In [6]:
# Persist the fitted per-column encoders (dict keyed by column name) to disk.
joblib.dump(value=label_encoders, filename="encoders.pkl")

['encoders.pkl']

In [7]:
# Separate the frame into the model inputs (X) and the prediction target (y).
feature_columns = ['education', 'occupation', 'gender', 'hours-per-week']
target_column = 'income'

X = df[feature_columns]
y = df[target_column]

In [8]:
# Encode the income labels as integers and save the fitted encoder to
# target_encoder.pkl.
# The encoder is fitted on df['income'] instead of on `y` itself: with
# `y = target_encoder.fit_transform(y)` a second run of this cell would fit on
# the already-encoded integers and overwrite target_encoder.pkl with an
# encoder that no longer knows the original labels.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df['income'])
joblib.dump(target_encoder, "target_encoder.pkl")

['target_encoder.pkl']

In [9]:
# Seed shared by the split and by every estimator that draws random numbers.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Candidate classifiers, otherwise with default hyperparameters. The two tree
# ensembles use randomness while fitting, so they are seeded too; unseeded,
# their accuracies -- and potentially which model ends up ranked best --
# can change from one run to the next.
models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Accuracy on the held-out split, keyed by model name.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name}: {acc:.4f}")

LogisticRegression: 0.7609
RandomForest: 0.7961
KNN: 0.7821
SVM: 0.7656
GradientBoosting: 0.8038


In [10]:
# Pick the entry with the highest recorded accuracy (the first one wins a tie),
# look up its fitted estimator, and write that estimator to best_model.pkl.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]
joblib.dump(best_model, "best_model.pkl")
print(f"Saved best model: {best_model_name}")

Saved best model: GradientBoosting
