<a href="https://colab.research.google.com/github/Rudrasuhan12/EXPENSE-TRACKER/blob/main/Employee_Salary_Prediction_Notebook_(Colab).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%
# =============================================================================
# Step 1: Install necessary libraries
# We need to install streamlit and pyngrok to run the web app in Colab.
# =============================================================================
!pip install streamlit -q
!pip install pyngrok -q

# %%
# =============================================================================
# Step 2: Import all required libraries
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from pyngrok import ngrok
import io
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# %%
# =============================================================================
# Step 3: Load the dataset
# In Google Colab, you can't use a local file path directly.
# This code will prompt you to upload your 'adult.csv' file.
# =============================================================================
print("Please upload your 'adult.csv' file.")
# The result is used below as a mapping of uploaded filename -> raw file bytes.
uploaded = files.upload()

if not uploaded:
    # Every later step needs `data`; stopping here gives a clear message
    # instead of an unrelated NameError further down.
    raise RuntimeError("No file uploaded. Please run the cell again and upload the file.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
print(f"\nUploaded file: {file_name}")
# The file content is in bytes, so wrap it in a file-like object for pandas.
data = pd.read_csv(io.BytesIO(uploaded[file_name]))
print("Dataset loaded successfully!")
print("First 5 rows of the dataset:")
print(data.head())


# %%
# =============================================================================
# Step 4: Data Cleaning and Preprocessing
# Cleans the raw frame, label-encodes the categorical features, and saves the
# encoders and feature column order for the Streamlit app.
# =============================================================================

# --- Handle missing values represented by '?' ---
# Replace '?' with a more descriptive 'Others' category.
print("\n--- Data Cleaning Started ---")
# A per-column mapping limits the replacement to these three columns.
data = data.replace({
    'workclass': {'?': 'Others'},
    'occupation': {'?': 'Others'},
    'native-country': {'?': 'Others'},
})
print("Replaced '?' with 'Others' in categorical columns.")


# --- Remove less frequent categories ---
# These categories have very few samples and might not be useful for prediction.
data = data[~data['workclass'].isin(['Without-pay', 'Never-worked'])]
print("Removed 'Without-pay' and 'Never-worked' from 'workclass'.")

# --- Remove redundant features ---
# The 'education' column is redundant because 'educational-num' represents the same info numerically.
if 'education' in data.columns:
    data = data.drop(columns=['education'])
    print("Dropped the redundant 'education' column.")

# --- Handle Outliers ---
# Keep only rows with 17 <= age <= 75 and 5 <= educational-num <= 16
# (Series.between is inclusive on both ends).
initial_rows = len(data)
in_range = data['age'].between(17, 75) & data['educational-num'].between(5, 16)
# Take an explicit copy: the encoding loop below assigns columns, and assigning
# into a filtered selection is what triggers SettingWithCopyWarning.
data = data[in_range].copy()
print(f"Removed outliers. Rows changed from {initial_rows} to {len(data)}.")

# --- Encode Categorical Features ---
# Machine learning models require numerical input. We use LabelEncoder to convert
# string categories into numbers. We will save these encoders to use in our app.
print("\nEncoding categorical features...")
# The target 'income' is left as string labels, so it is excluded here.
categorical_cols = [
    col
    for col in data.select_dtypes(include=['object', 'category']).columns
    if col != 'income'
]

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le  # Kept so the app can encode new inputs identically.
    print(f"Encoded '{col}'.")

# --- Save the encoders and column order ---
# This is crucial for the Streamlit app to process new data correctly.
joblib.dump(encoders, 'encoders.pkl')
print("\nSaved all label encoders to 'encoders.pkl'.")

# Save the order of columns the model will be trained on
model_columns = data.drop(columns=['income']).columns.tolist()
joblib.dump(model_columns, 'model_columns.pkl')
print("Saved model column order to 'model_columns.pkl'.")

print("\n--- Data Cleaning and Preprocessing Complete! ---")
print("Final data preview (first 5 rows):")
print(data.head())


# %%
# =============================================================================
# Step 5: Model Training and Evaluation
# Trains several classifiers on the same split, compares test accuracy, and
# saves the best scaler+model pipeline for the Streamlit app.
# =============================================================================

print("\n--- Model Training and Evaluation Started ---")
# --- Define Features (X) and Target (y) ---
X = data.drop(columns=['income'])
y = data['income']  # Not encoded in Step 4, so the models predict the original labels.

# --- Split data into training and testing sets ---
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Data split into training ({len(X_train)} rows) and testing ({len(X_test)} rows) sets.")

# --- Define Models to Compare ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=120, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "SVM": SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    "GradientBoosting": GradientBoostingClassifier(n_estimators=120, random_state=42),
}

results = {}
# Fitted pipelines are kept so the winner can be saved without retraining.
fitted_pipelines = {}

# --- Train, Evaluate, and Find the Best Model ---
for name, model in models.items():
    # The scaler lives inside the pipeline, so it is fitted on X_train only
    # and merely applied to X_test.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    fitted_pipelines[name] = pipe
    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")

# --- Identify and Save the Best Model ---
best_model_name = max(results, key=results.get)
# Reuse the pipeline that produced the reported accuracy instead of fitting again.
best_model_pipeline = fitted_pipelines[best_model_name]

print(f"\n🏆 Best performing model is {best_model_name} with an accuracy of {results[best_model_name]:.4f}")

# Save the entire pipeline (scaler + model)
joblib.dump(best_model_pipeline, "best_model_pipeline.pkl")
print("✅ Saved the best model pipeline to 'best_model_pipeline.pkl'.")
print("\n--- Model Training and Evaluation Complete! ---")


# %%
# =============================================================================
# Step 6: Create the Streamlit Web Application
# `app_code` holds the full source of 'app.py'. The app loads the artifacts
# saved in Steps 4-5 and offers single and batch (CSV) predictions.
# The string is wrapped in parentheses to prevent parsing errors.
# =============================================================================

app_code = ("""
import streamlit as st
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder

# --- Page Configuration ---
# Kept ahead of every other Streamlit call so it also applies to the error page below.
st.set_page_config(
    page_title="Employee Salary Prediction",
    page_icon="💼",
    layout="wide"
)

# --- Load Saved Artifacts ---
# We load the trained model pipeline, the encoders, and the column order.
try:
    model_pipeline = joblib.load("best_model_pipeline.pkl")
    encoders = joblib.load("encoders.pkl")
    model_columns = joblib.load("model_columns.pkl")
except FileNotFoundError:
    st.error("Could not find model files. Please ensure 'best_model_pipeline.pkl', 'encoders.pkl', and 'model_columns.pkl' are in the same directory.")
    st.stop()

# Columns whose missing values are written as '?' in the raw data.
# Training replaced them with 'Others', so the app must do the same.
MISSING_VALUE_COLUMNS = ['workclass', 'occupation', 'native-country']


def preprocess(df):
    '''Return a copy of df cleaned, encoded and ordered like the training data.

    Raises ValueError naming the columns when a required column is missing.
    '''
    missing = [col for col in model_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df = df.copy()
    for col in MISSING_VALUE_COLUMNS:
        if col in df.columns:
            df[col] = df[col].replace('?', 'Others')

    for col, encoder in encoders.items():
        if col in df.columns:
            # A label's code is its position in encoder.classes_; labels not seen in training become -1.
            codes = {label: code for code, label in enumerate(encoder.classes_)}
            df[col] = df[col].map(codes).fillna(-1).astype(int)

    return df[model_columns]


# --- App Title and Description ---
st.title("💼 Employee Salary Prediction")
st.markdown('''
This app predicts whether an employee's annual income is **more than $50K** or **less than or equal to $50K**.
''')

# --- Sidebar for User Inputs ---
st.sidebar.header("Single Employee Prediction")
st.sidebar.markdown("Enter details for one employee below.")

# Original (un-encoded) category values offered in the selectboxes.
workclass_options = ['Private', 'Self-emp-not-inc', 'Local-gov', 'State-gov', 'Self-emp-inc', 'Federal-gov', 'Others']
marital_status_options = ['Married-civ-spouse', 'Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse']
occupation_options = ['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical', 'Sales', 'Other-service', 'Machine-op-inspct', 'Transport-moving', 'Handlers-cleaners', 'Farming-fishing', 'Tech-support', 'Protective-serv', 'Priv-house-serv', 'Armed-Forces', 'Others']
relationship_options = ['Husband', 'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Other-relative']
race_options = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
gender_options = ['Male', 'Female']
native_country_options = ['United-States', 'Mexico', 'Philippines', 'Germany', 'Puerto-Rico', 'Canada', 'El-Salvador', 'India', 'Cuba', 'England', 'Jamaica', 'South', 'China', 'Italy', 'Dominican-Republic', 'Vietnam', 'Guatemala', 'Japan', 'Poland', 'Columbia', 'Taiwan', 'Haiti', 'Iran', 'Portugal', 'Nicaragua', 'Peru', 'France', 'Greece', 'Ecuador', 'Ireland', 'Hong', 'Trinadad&Tobago', 'Cambodia', 'Laos', 'Thailand', 'Yugoslavia', 'Outlying-US(Guam-USVI-etc)', 'Hungary', 'Honduras', 'Scotland', 'Holand-Netherlands', 'Others']


# Create input fields in the sidebar.
# The age and education ranges match the outlier limits applied during training.
age = st.sidebar.slider("Age", 17, 75, 35)
workclass = st.sidebar.selectbox("Work Class", options=workclass_options)
fnlwgt = st.sidebar.number_input("Final Weight (fnlwgt)", min_value=1, value=189778)
educational_num = st.sidebar.slider("Education Level (Numeric)", 5, 16, 10)
marital_status = st.sidebar.selectbox("Marital Status", options=marital_status_options)
occupation = st.sidebar.selectbox("Occupation", options=occupation_options)
relationship = st.sidebar.selectbox("Relationship", options=relationship_options)
race = st.sidebar.selectbox("Race", options=race_options)
gender = st.sidebar.selectbox("Gender", options=gender_options)
capital_gain = st.sidebar.number_input("Capital Gain", min_value=0, value=0)
capital_loss = st.sidebar.number_input("Capital Loss", min_value=0, value=0)
hours_per_week = st.sidebar.slider("Hours per Week", 1, 99, 40)
native_country = st.sidebar.selectbox("Native Country", options=native_country_options)


# --- Prediction Logic ---
if st.sidebar.button("Predict Salary", use_container_width=True):
    # 1. Collect the user's input into a one-row DataFrame
    input_data = {
        'age': age, 'workclass': workclass, 'fnlwgt': fnlwgt,
        'educational-num': educational_num, 'marital-status': marital_status,
        'occupation': occupation, 'relationship': relationship, 'race': race,
        'gender': gender, 'capital-gain': capital_gain, 'capital-loss': capital_loss,
        'hours-per-week': hours_per_week, 'native-country': native_country
    }
    input_df = pd.DataFrame([input_data])

    st.markdown("### Your Input (Single Prediction):")
    st.dataframe(input_df)

    # 2. Encode and order the input like the training data
    model_input = preprocess(input_df)

    # 3. Make a prediction
    # The model was trained on string labels, so it predicts string labels directly.
    prediction_decoded = model_pipeline.predict(model_input)
    prediction_proba = model_pipeline.predict_proba(model_input)

    # 4. Display the result
    st.markdown("---")
    st.markdown("### Prediction Result")

    if prediction_decoded[0] == '>50K':
        st.success("**Predicted Income: >$50K**")
    else:
        st.info("**Predicted Income: ≤$50K**")

    st.write("Prediction Confidence:")
    # Probabilities follow the model's class order, which is ['<=50K', '>50K'].
    st.write(f"**≤$50K:** `{prediction_proba[0][0]:.2%}`")
    st.write(f"**>$50K:** `{prediction_proba[0][1]:.2%}`")

# --- Batch Prediction from File ---
st.markdown("---")
st.header("📂 Batch Prediction from File")
st.markdown("Upload a CSV file with the same format as the training data to predict salaries for multiple employees.")

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    batch_df = pd.read_csv(uploaded_file)
    st.markdown("### Uploaded Data Preview:")
    st.dataframe(batch_df.head())

    try:
        # Clean, encode and order the uploaded rows; extra columns are ignored by the model.
        predict_df_ordered = preprocess(batch_df)

        # Make Predictions
        batch_predictions_decoded = model_pipeline.predict(predict_df_ordered)

        # Add predictions to the original (un-encoded) DataFrame
        batch_df['predicted_income'] = batch_predictions_decoded

        # Display and Download Results
        st.markdown("### Prediction Results:")
        st.dataframe(batch_df)

        csv = batch_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Predictions as CSV",
            data=csv,
            file_name='predicted_salaries.csv',
            mime='text/csv',
            use_container_width=True
        )

    except Exception as e:
        # Top-level UI boundary: show the problem to the user instead of crashing the app.
        st.error(f"An error occurred during batch prediction: {e}")
        st.warning("Please ensure your uploaded CSV has the correct columns (e.g., 'age', 'workclass', etc.) and data types.")

""")

# Write as UTF-8 explicitly: the app source contains non-ASCII characters
# (emoji, '≤'), which would fail or be mangled under a non-UTF-8 default encoding.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("✅ Streamlit app code written to app.py")


# %%
# =============================================================================
# Step 7: Run the Streamlit App using ngrok
# This will create a public URL to access your app.
# =============================================================================
# --- Robustly terminate any existing ngrok tunnels ---
# This is to prevent the ERR_NGROK_108 error (too many sessions).
try:
    # Get a list of all active tunnels
    tunnels = ngrok.get_tunnels()
    # Disconnect each tunnel
    for tunnel in tunnels:
        ngrok.disconnect(tunnel.public_url)
    print("Disconnected all active ngrok tunnels.")
except Exception as e:
    # This might fail if no tunnels are active, which is fine.
    print(f"No active tunnels to disconnect or an error occurred: {e}")

# Kill any existing ngrok process
ngrok.kill()
print("Killed any lingering ngrok processes.")


# --- Set up a new ngrok tunnel ---
# We are using the authtoken you provided to authenticate with ngrok.
authtoken = "300Jsjq0oqcC3BEAqCfFVw6LLUw_qXgJEgiAeX4ZUP6HkJ95"
ngrok.set_auth_token(authtoken)

# Connect to the port and create a public URL
public_url = ngrok.connect(8501)
print(f"🚀 Your Streamlit app is live at: {public_url}")

# Run the Streamlit app
!streamlit run app.py --server.port 8501 --server.enableCORS=false

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hPlease upload your 'adult.csv' file.


Saving adult 3.csv to adult 3.csv

Uploaded file: adult 3.csv
Dataset loaded successfully!
First 5 rows of the dataset:
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   

ERROR:pyngrok.process.ngrok:t=2025-07-17T15:05:43+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-07-17T15:05:43+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-07-17T15:05:43+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

No active tunnels to disconnect or an error occurred: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.
Killed any lingering ngrok processes.
🚀 Your Streamlit app is live at: NgrokTunnel: "https://ac56d938ce80.ngrok-free.app" -> "http://localhost:8501"
2025-07-17 15:05:44.641 
'server.enableXsrfProtection=true'.
As a result, 'server.enableCORS' is being overridden to 'true'.

More information:
In order to protect against CSRF attacks, we send a cookie with each request.
To do so, we must specify allowable origins, which places a restriction on
cross-origin resource sharing.

If cross origin resource sharing is required, please disable server.enableXsrfProtection.
            

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to fa