In [3]:
# income_classifier_with_streamlit.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import os

# =============================
# Data Loading and Preprocessing

@st.cache_data
def load_data(path="adult 3.csv"):
    """Load the income CSV and return the cleaned modelling frame.

    Cleaning steps, in order:

    * cells equal to ``'?'`` are treated as missing values;
    * rows missing ``workclass``, ``occupation`` or ``native-country`` are
      dropped;
    * rows whose ``workclass`` is ``'Without-pay'`` or ``'Never-worked'``
      are dropped;
    * only rows with ``17 <= age <= 75`` and
      ``5 <= educational-num <= 16`` are kept;
    * the ``education`` and ``fnlwgt`` columns are removed.

    Args:
        path: Location of the CSV file. Defaults to the file name the
            script has always used, so existing ``load_data()`` calls are
            unaffected.

    Returns:
        The filtered ``pandas.DataFrame``; the original row index labels
        are preserved.
    """
    data = pd.read_csv(path)

    # Work on fresh frames rather than mutating in place: an in-place drop on
    # a boolean-filtered frame can trigger pandas' SettingWithCopyWarning.
    data = data.replace('?', np.nan)
    data = data.dropna(subset=['workclass', 'occupation', 'native-country'])

    keep_rows = (
        ~data['workclass'].isin(['Without-pay', 'Never-worked'])
        & data['age'].between(17, 75)
        & data['educational-num'].between(5, 16)
    )
    data = data.loc[keep_rows]

    # 'education' is presumably redundant with 'educational-num' -- confirm.
    data = data.drop(columns=['education', 'fnlwgt'])
    return data

data = load_data()
X = data.drop(columns='income')
y = data['income']

# Every feature column is routed to exactly one transformer: numeric dtypes
# are scaled, everything else is one-hot encoded. Selecting by
# "number / not number" (instead of specific dtype names) avoids a column
# being left out of both lists and then discarded by ColumnTransformer's
# default remainder='drop'.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    # handle_unknown='ignore': categories not seen during fit are encoded as
    # all zeros instead of raising at transform time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Split the raw rows first and fit the preprocessor on the training rows only,
# so the scaler statistics and encoder categories are not influenced by the
# held-out rows (fitting on the full frame leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for backward compatibility with code that reads the fully transformed
# feature matrix; it is produced by the train-fitted preprocessor.
X_preprocessed = preprocessor.transform(X)

# Model Training and Selection

@st.cache_resource
def _fit_and_score(_X_train, _y_train, _X_test, _y_test):
    """Fit every candidate classifier and score it on the held-out split.

    Streamlit re-executes the whole script on each widget interaction;
    caching this function keeps the five models from being retrained on
    every rerun.

    The parameters start with an underscore so Streamlit does not try to
    hash them. As a consequence the cached result is reused even if the
    underlying data changes -- clear the Streamlit cache after swapping
    the dataset.

    Args:
        _X_train: Preprocessed training feature matrix.
        _y_train: Training labels.
        _X_test: Preprocessed held-out feature matrix.
        _y_test: Held-out labels.

    Returns:
        A ``(models, results)`` tuple: ``models`` maps a model name to its
        fitted estimator and ``results`` maps the same name to its
        accuracy on the held-out split.
    """
    candidates = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        # Seeded so the model ranking is reproducible from run to run.
        "RandomForest": RandomForestClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(),
        "GradientBoosting": GradientBoostingClassifier(random_state=42),
    }

    scores = {}
    for name, estimator in candidates.items():
        estimator.fit(_X_train, _y_train)
        scores[name] = accuracy_score(_y_test, estimator.predict(_X_test))
    return candidates, scores


models, results = _fit_and_score(X_train, y_train, X_test, y_test)

# Reporting stays outside the cached function so it is printed on every run.
for name, acc in results.items():
    print(f"{name}: {acc:.4f}")

best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\n✅ Best model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

# Save the best model
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking for the directory and creating it.
os.makedirs("models", exist_ok=True)
joblib.dump(best_model, "models/best_model.pkl")
# The classifier was trained on preprocessed features, so the fitted
# preprocessor is stored next to it; without it the pickled model cannot be
# applied to raw rows outside this process.
joblib.dump(preprocessor, "models/preprocessor.pkl")
print("✅ Saved best model as models/best_model.pkl")

# =============================
# Streamlit App
# =============================

# NOTE(review): Streamlit has documented set_page_config as needing to be the
# first Streamlit command in a script, yet the cached load_data() call runs
# earlier in this file -- confirm this is fine on the deployed version.
st.set_page_config(page_title="Income Classifier", page_icon="💼", layout="centered")
st.title("💼 Employee Income Classification")
st.markdown("Predict whether income is >50K or <=50K using demographic and job info.")

# Read the classifier back from the pickle file on disk.
model = joblib.load('models/best_model.pkl')

sidebar = st.sidebar
sidebar.header("📋 Input Features")


def _category_picker(label, column):
    """Render a sidebar dropdown listing the distinct values of ``data[column]``."""
    return sidebar.selectbox(label, data[column].unique())


# Widgets are declared in the order they should appear in the sidebar.
age = sidebar.slider("Age", 17, 75, 30)
edu_num = sidebar.slider("Education Level (numeric)", 5, 16, 10)
workclass = _category_picker("Workclass", 'workclass')
marital_status = _category_picker("Marital Status", 'marital-status')
occupation = _category_picker("Occupation", 'occupation')
relationship = _category_picker("Relationship", 'relationship')
race = _category_picker("Race", 'race')
gender = _category_picker("Gender", 'gender')
hours_per_week = sidebar.slider("Hours per Week", 1, 80, 40)
capital_gain = sidebar.number_input("Capital Gain", 0, 100000, 0)
capital_loss = sidebar.number_input("Capital Loss", 0, 100000, 0)
native_country = _category_picker("Native Country", 'native-country')

# One value per feature, keyed by the column names used in the dataset.
feature_values = {
    'age': age,
    'workclass': workclass,
    'educational-num': edu_num,
    'marital-status': marital_status,
    'occupation': occupation,
    'relationship': relationship,
    'race': race,
    'gender': gender,
    'capital-gain': capital_gain,
    'capital-loss': capital_loss,
    'hours-per-week': hours_per_week,
    'native-country': native_country,
}
# Wrap each value in a one-element list to obtain a single-row frame.
input_df = pd.DataFrame(
    {column: [value] for column, value in feature_values.items()}
)

# Apply the same fitted scaling / one-hot encoding the classifier was trained on.
input_transformed = preprocessor.transform(input_df)

st.write("### 🔍 Input Data")
st.dataframe(input_df)

predict_clicked = st.button("Predict Income Class")
if predict_clicked:
    prediction = model.predict(input_transformed)
    st.success(f"✅ Prediction: {prediction[0]}")

st.markdown("---")
st.markdown("#### 📂 Batch Prediction")
uploaded_file = st.file_uploader("Upload a CSV for batch prediction", type="csv")

if uploaded_file is not None:
    try:
        batch_data = pd.read_csv(uploaded_file)
    except pd.errors.EmptyDataError:
        # A completely empty upload has no header to parse; treat it like a
        # file with zero rows so it is reported below instead of crashing.
        batch_data = pd.DataFrame()

    # The preprocessor needs every feature column it was fitted on; check up
    # front so the user gets a readable message rather than a raw
    # "columns are missing" traceback from scikit-learn.
    missing_cols = [col for col in X.columns if col not in batch_data.columns]

    if batch_data.empty:
        st.error("The uploaded CSV contains no rows to predict on.")
    elif missing_cols:
        st.error(
            "The uploaded CSV is missing required column(s): "
            + ", ".join(missing_cols)
        )
    else:
        # Pass only the training feature columns, in training order; extra
        # columns in the upload are kept in the output but not fed to the model.
        batch_transformed = preprocessor.transform(batch_data[X.columns])
        batch_preds = model.predict(batch_transformed)
        batch_data['PredictedClass'] = batch_preds
        st.write("✅ Batch Predictions:")
        st.dataframe(batch_data.head())
        csv = batch_data.to_csv(index=False).encode('utf-8')
        st.download_button("Download Results", csv, file_name='predicted_income.csv', mime='text/csv')


2025-07-23 02:04:07.733 No runtime found, using MemoryCacheStorageManager
2025-07-23 02:04:07.746 No runtime found, using MemoryCacheStorageManager


LogisticRegression: 0.8490
RandomForest: 0.8534
KNN: 0.8323
SVM: 0.8526
GradientBoosting: 0.8616

✅ Best model: GradientBoosting with accuracy 0.8616
✅ Saved best model as models/best_model.pkl


ValueError: columns are missing: {'fnlwgt'}