In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

import joblib
import pickle


In [None]:
data = pd.read_csv('adult.csv')
data.head()

In [None]:
data.shape

In [None]:
data.isna().sum()

In [None]:
data.workclass.value_counts()

In [None]:
data.workclass.replace({'?':'Others'},inplace=True)
data['workclass'].value_counts()

In [None]:
data['occupation'].value_counts()

In [None]:
data.occupation.replace({'?':'Others'},inplace=True)
data['occupation'].value_counts()

In [None]:
data['native-country'].replace({'?':'Others'},inplace=True)
data['native-country'].value_counts()

In [None]:
data=data[data['workclass']!='Without-pay']
data=data[data['workclass']!='Never-worked']
data['workclass'].value_counts()

In [None]:
data.relationship.value_counts()

In [None]:
data.gender.value_counts()

In [None]:
data.shape

In [None]:
plt.boxplot(data['age'])
plt.show()

In [None]:
data=data[(data['age']<=75)&(data['age']>=17)]

In [None]:
plt.boxplot(data['age'])
plt.show()

In [None]:
data.shape

In [None]:
plt.boxplot(data['capital-gain'])
plt.show()

In [None]:
plt.boxplot(data['educational-num'])
plt.show()

In [None]:
data=data[(data['educational-num']<=16)&(data['educational-num']>=5)]

In [None]:
plt.boxplot(data['educational-num'])
plt.show()

In [None]:
plt.boxplot(data['hours-per-week'])
plt.show()

In [None]:
data=data.drop(columns=['education']) #redundant features removal

In [None]:
label_encoders = {}
categorical_cols = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Save encoders
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [None]:
data.head()

In [None]:
data['income'] = data['income'].map({'<=50K': 0, '>50K': 1})

In [None]:
data.head()

In [None]:
X = data.drop(columns=['income'])
y = data['income']

In [None]:
X.head(10)

In [None]:
y.head(10)

In [None]:
scaler = StandardScaler()

X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
X_scaled.head()

In [None]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape,y_train.shape

In [None]:
X_test.shape,y_test.shape

In [None]:
with open("column_order.pkl", "wb") as f:
    pickle.dump(X_train.columns.tolist(), f)

In [None]:
# Candidate classifiers, keyed by the display name used in the results table
# and in the saved model's file name.
# The randomized ensembles get a fixed random_state so that repeated runs
# produce the same accuracies and therefore the same "best" model.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42)
}


In [None]:
results = {}

In [None]:
best_accuracy = 0
best_model = None
best_model_name = ""


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

with open(f'{best_model_name}_best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
    
print(f"Saved best model: {best_model_name} with accuracy {best_accuracy:.4f}")


In [None]:
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df.sort_values(by='Accuracy', ascending=False)

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib

# Load the trained model
model = joblib.load("GradientBoostingClassifier_best_model.pkl")

st.set_page_config(page_title="Employee Salary Classification", page_icon="💼", layout="centered")

st.title("💼 Employee Salary Classification App")
st.markdown("Predict whether an employee earns >50K or ≤50K based on input features.")

# Sidebar inputs (these must match your training feature columns)
st.sidebar.header("Input Employee Details")

# ✨ Replace these fields with your dataset's actual input columns
age = st.sidebar.slider("Age", 18, 65, 30)
education = st.sidebar.selectbox("Education Level", [
    "Bachelors", "Masters", "PhD", "HS-grad", "Assoc", "Some-college"
])
occupation = st.sidebar.selectbox("Job Role", [
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct",
    "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv",
    "Protective-serv", "Armed-Forces"
])
hours_per_week = st.sidebar.slider("Hours per week", 1, 80, 40)
experience = st.sidebar.slider("Years of Experience", 0, 40, 5)

# Build input DataFrame (⚠️ must match preprocessing of your training data)
input_df = pd.DataFrame({
    'age': [age],
    'education': [education],
    'occupation': [occupation],
    'hours-per-week': [hours_per_week],
    'experience': [experience]
})

st.write("### 🔎 Input Data")
st.write(input_df)

# Predict button
if st.button("Predict Salary Class"):
    prediction = model.predict(input_df)
    st.success(f"✅ Prediction: {prediction[0]}")

# Batch prediction
st.markdown("---")
st.markdown("#### 📂 Batch Prediction")
uploaded_file = st.file_uploader("Upload a CSV file for batch prediction", type="csv")

if uploaded_file is not None:
    batch_data = pd.read_csv(uploaded_file)
    st.write("Uploaded data preview:", batch_data.head())
    batch_preds = model.predict(batch_data)
    batch_data['PredictedClass'] = batch_preds
    st.write("✅ Predictions:")
    st.write(batch_data.head())
    csv = batch_data.to_csv(index=False).encode('utf-8')
    st.download_button("Download Predictions CSV", csv, file_name='predicted_classes.csv', mime='text/csv')



In [None]:
# Launch the Streamlit app from app.py, the file written by the %%writefile cell above.
!streamlit run app.py