In [None]:
# Cell 1: Install Libraries
!pip install streamlit pandas scikit-learn xgboost joblib shap pyngrok -q


In [None]:
# Cell 2: Train and Save the Model

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import numpy as np



In [None]:
# 1. Load Data
# Read the CSV into `data`. A missing file is reported and the exception is
# re-raised so the failure is not silently swallowed.
try:
    data = pd.read_csv('adult 3.csv')
except FileNotFoundError:
    print("ERROR: 'adult 3.csv' not found. Please upload it.")
    raise
else:
    print("Successfully loaded 'adult 3.csv'.")



In [None]:
# 2. Data Cleaning
# Cells holding exactly ' ?' or '?' are relabelled as 'Others'.
data = data.replace({' ?': 'Others', '?': 'Others'})

# Keep only rows whose workclass is neither 'Without-pay' nor 'Never-worked'.
data = data[~data['workclass'].isin(['Without-pay', 'Never-worked'])]

# Remove the 'education' column when present; a no-op when it is absent.
data = data.drop(columns=['education'], errors='ignore')



In [None]:
# Standalone Cell for Outlier Visualization with Box Plots

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def plot_feature_boxplots(frame, columns, title, color):
    """Draw one box plot per column of `frame`, side by side, and show the figure.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; must contain every name in `columns`.
    columns : list of str
        Columns to draw, one subplot each.
    title : str
        Figure-level title.
    color : str
        Fill colour passed to seaborn for every box.
    """
    # squeeze=False keeps `axes` two-dimensional even when only one column is given.
    fig, axes = plt.subplots(1, len(columns), figsize=(18, 6), squeeze=False)
    fig.suptitle(title, fontsize=16, fontweight='bold')
    for ax, col in zip(axes[0], columns):
        sns.boxplot(y=frame[col], ax=ax, color=color)
        ax.set_title(f'Distribution of {col}', fontsize=12)
        ax.set_ylabel('')  # Clean up y-axis label
    # The rect leaves head-room for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


# Load the raw CSV here so this cell runs on a fresh kernel and the "before"
# plots show untouched data (`data` has already been filtered by the cleaning cell).
data_original = pd.read_csv('adult 3.csv')

#  Define the columns to check for outliers
numerical_cols_to_check = ['age', 'hours-per-week', 'capital-gain']

plt.style.use('seaborn-v0_8-whitegrid')

#  Creating "Before" plots
print("Visualizing Outliers: BEFORE Cleaning ")
plot_feature_boxplots(
    data_original,
    numerical_cols_to_check,
    'Box Plots of Key Features - BEFORE Outlier Removal',
    'skyblue',
)

# Apply the outlier rule: keep ages in the inclusive range 17..75.
data_cleaned = data_original.copy()
data_cleaned = data_cleaned[(data_cleaned['age'] <= 75) & (data_cleaned['age'] >= 17)]
# You might have other filtering rules for 'hours-per-week' etc. Add them here if needed.
# Example: data_cleaned = data_cleaned[data_cleaned['hours-per-week'] <= 60]

#  Create "After" plots
print("\n Visualizing Outliers: AFTER Cleaning ")
plot_feature_boxplots(
    data_cleaned,
    numerical_cols_to_check,
    'Box Plots of Key Features - AFTER Outlier Removal',
    'lightgreen',
)

In [None]:
# 3. Target Transformation
def encode_income(label):
    """Return 1 when `label`, with surrounding whitespace stripped, is '>50K'; otherwise 0."""
    return 1 if str(label).strip() == '>50K' else 0


# 4. Define Features (X) and Target (y)
# The target is built as its own Series instead of overwriting data['income'].
# Overwriting made this cell non-idempotent: on a second run the already-encoded
# 0/1 values no longer equal '>50K', so every label silently became 0.
y = data['income'].map(encode_income)
X = data.drop(columns=['income'])

# 5. Identify feature types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# 6. Create the Preprocessing Pipeline using ColumnTransformer
# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 7. Create the Full Final Pipeline
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# 8. Train the Full Pipeline
print("Training the full pipeline...")
final_pipeline.fit(X, y)
print("--- Model Training Complete ---")

# 9. Save the final pipeline object
# The Streamlit app and the SHAP cell load the model from this same path.
joblib.dump(final_pipeline, '/content/income_prediction_pipeline.joblib')
print("Final verified pipeline saved successfully.")

In [None]:
%%writefile app.py

import streamlit as st
import pandas as pd
import joblib

# 1. LOAD THE PIPELINE
st.set_page_config(page_title="Salary Predictor", page_icon="💼", layout="centered")

@st.cache_resource
def load_pipeline():
    """Loads the pre-trained pipeline."""
    return joblib.load('/content/income_prediction_pipeline.joblib')

pipeline = load_pipeline()

# 2. USER INTERFACE
st.title('💼 SALARY_SENSE: Employee Income Predictor')
st.write("Predict if an employee earns >50K or ≤50K using a Gradient Boosting model.")

# We create the input fields based on the columns of a sample row.
# This ensures we have all the necessary inputs.
sample_columns = ['age', 'workclass', 'fnlwgt', 'educational-num', 'marital-status',
                  'occupation', 'relationship', 'race', 'gender', 'capital-gain',
                  'capital-loss', 'hours-per-week', 'native-country']

with st.sidebar:
    st.header("👤 Applicant Details")
    input_dict = {}

    # Create inputs for all features the model was trained on
    input_dict['age'] = st.slider("Age", 17, 90, 30)
    input_dict['workclass'] = st.selectbox("Work Class", ['Private', 'Self-emp-not-inc', 'Local-gov', 'Federal-gov', 'State-gov', 'Self-emp-inc', 'Others'])
    input_dict['fnlwgt'] = st.number_input("fnlwgt (Final Weight)", value=180000)
    input_dict['educational-num'] = st.slider("Education Years", 1, 16, 10)
    input_dict['marital-status'] = st.selectbox('Marital Status', ['Married-civ-spouse', 'Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'])
    input_dict['occupation'] = st.selectbox("Occupation", ['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical', 'Sales', 'Other-service', 'Machine-op-inspct', 'Others', 'Transport-moving', 'Handlers-cleaners', 'Farming-fishing', 'Tech-support', 'Protective-serv', 'Priv-house-serv', 'Armed-Forces'])
    input_dict['relationship'] = st.selectbox('Relationship', ['Husband', 'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Other-relative'])
    input_dict['race'] = st.selectbox('Race', ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'])
    input_dict['gender'] = st.selectbox('Gender', ['Male', 'Female'])
    input_dict['capital-gain'] = st.number_input("Capital Gain", value=0)
    input_dict['capital-loss'] = st.number_input("Capital Loss", value=0)
    input_dict['hours-per-week'] = st.slider("Hours per Week", 1, 99, 40)
    input_dict['native-country'] = st.selectbox("Native Country", ['United-States', 'Mexico', 'Philippines', 'Germany', 'Canada', 'Puerto-Rico', 'Others'])


# 3. PREDICTION LOGIC
if st.button('**Predict Income Level**', type='primary'):
    input_df = pd.DataFrame([input_dict])

    st.write("### 🔎 Input Data")
    st.write(input_df)

    try:
        prediction = pipeline.predict(input_df)[0]
        prediction_proba = pipeline.predict_proba(input_df)[0]

        st.header("Prediction Result")
        if prediction == 1:
            st.success(f'**Income is likely >$50K** (Confidence: {prediction_proba[1]:.2%})')
        else:
            st.warning(f'**Income is likely ≤50K** (Confidence: {prediction_proba[0]:.2%})')

    except Exception as e:
        st.error(f"An error occurred during prediction: {e}")

In [None]:
# Standalone Cell for SHAP Analysis (Corrected for Summary Plot)

import pandas as pd
import joblib
import shap
import matplotlib.pyplot as plt


def _to_dense(matrix):
    """Return `matrix` as a dense array, converting only when it offers `.toarray()`.

    A ColumnTransformer may return either a sparse matrix or a dense ndarray
    depending on the density of its output, so the conversion must be conditional.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


print("--- Starting Standalone SHAP Analysis ---")

# 1. Load your saved pipeline
try:
    pipeline = joblib.load('/content/income_prediction_pipeline.joblib')
    print("* Pipeline loaded successfully.")
except FileNotFoundError:
    print("ERROR: Pipeline file not found. Please run the training script first.")
    raise

# 2. Load a sample of your data
# The frame is kept under its own name so the cleaned global `data` used by
# the training cell is not replaced with raw, uncleaned rows.
try:
    shap_data = pd.read_csv('adult 3.csv')
except FileNotFoundError:
    print("ERROR: 'adult 3.csv' not found. Please upload it for SHAP analysis.")
    raise

shap_data = shap_data.replace({' ?': 'Others', '?': 'Others'})
# Drop the target and, mirroring the cleaning cell, the 'education' column.
X_explain = shap_data.drop(columns=['income', 'education'], errors='ignore')

# Never request more rows than the file contains.
X_sample = X_explain.sample(min(100, len(X_explain)), random_state=42)
print(f"* Loaded and prepared a sample of {len(X_sample)} rows to explain.")

# 3. Create the SHAP Explainer
print("Creating SHAP explainer...")
preprocessor = pipeline.named_steps['preprocessor']
classifier = pipeline.named_steps['classifier']
# The same transformed rows serve as background data and as the rows to
# explain, so the transform is computed once and shared.
transformed_sample = _to_dense(preprocessor.transform(X_sample))
transformed_background = transformed_sample
explainer = shap.TreeExplainer(classifier, transformed_background)
print("* SHAP explainer created.")

# 4. Calculate SHAP values
shap_values = explainer.shap_values(transformed_sample)
print("* SHAP values calculated.")

# 5. Generate and Display the Plots
feature_names = preprocessor.get_feature_names_out()

# PLOT 1: Force Plot for a single prediction
print("\n--- SHAP Force Plot (Explaining the first person in the sample) ---")
shap.initjs()
display(shap.force_plot(explainer.expected_value, shap_values[0,:], feature_names=feature_names))
print("Force Plot displayed successfully.")


# PLOT 2: Summary Plot (Beeswarm) for overall feature importance
print("\n--- SHAP Summary Plot (Overall Feature Impact) ---")

# The TRANSFORMED data is passed as `features` so that its column count
# matches `shap_values`, which were computed on the transformed matrix.
shap.summary_plot(shap_values, features=transformed_sample, feature_names=feature_names)

In [None]:
# Cell 4: Launch the Streamlit App
from pyngrok import ngrok

ngrok.kill()
NGROK_AUTH_TOKEN = "30EwBDZQCJvTSQ22N25ei1c7PzH_675q8kJRsw4F23DmYfhm" # <--- PASTE YOUR TOKEN
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

public_url = ngrok.connect(8501)
print(f"✅ Your app is live at: {public_url}")

!streamlit run app.py