In [None]:
pip install --upgrade pip

In [None]:
# Install the data-handling and modelling packages into the kernel's environment.
# NOTE(review): matplotlib is not imported in the cells visible here — confirm it is needed.
%pip install pandas
%pip install scikit-learn matplotlib 

In [None]:
# joblib is used further down to serialize the fitted pipeline to disk.
# NOTE(review): seaborn is not imported in the cells visible here — confirm it is needed.
%pip install seaborn joblib

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [14]:
# Read the CSV, treat the literal '?' placeholder as missing, and keep only
# rows that have a value in every column.
df = (
    pd.read_csv('../data/adult.csv')
    .replace('?', pd.NA)
    .dropna()
)

In [15]:
# Coerce years_experience to a numeric dtype: values that cannot be parsed
# become NaN (errors='coerce') and the affected rows are then dropped.
# assign() builds a new frame instead of writing a column into `df` in place;
# `df` is itself the result of row-dropping, and an in-place column write on
# such a frame can emit SettingWithCopyWarning on pandas without copy-on-write.
df = df.assign(
    years_experience=pd.to_numeric(df["years_experience"], errors='coerce')
).dropna(subset=["years_experience"])

In [16]:
# Keep only rows whose experience lies in the closed interval [2, 30] years
# (Series.between is inclusive on both ends by default).
df = df[df["years_experience"].between(2, 30)]

In [17]:
# Approximate salary (INR) assigned to each income bracket label.
salary_map = {'<=50K': 340_000, '>50K': 800_000}

# Derive the regression target with assign() so a new frame is produced rather
# than a column being written into a boolean-filtered frame in place (which can
# emit SettingWithCopyWarning on pandas without copy-on-write).
df = df.assign(salary_inr=df['income'].map(salary_map))

# Series.map yields NaN for any label that is not a key of salary_map; fail
# here with a clear message instead of carrying NaN targets into training.
if df['salary_inr'].isna().any():
    raise ValueError("'income' contains labels that are not present in salary_map")

In [18]:
# Split the frame into the model inputs (X) and the regression target (y).
X = df.loc[
    :,
    [
        "age",
        "years_experience",
        "education",
        "occupation",
        "hours-per-week",
        "gender",
    ],
]
y = df.loc[:, "salary_inr"]

In [19]:
# Column-wise preprocessing: one-hot encode the three categorical features
# (categories unseen during fit are ignored at transform time) and pass every
# remaining column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown='ignore'),
            ["education", "occupation", "gender"],
        ),
    ],
    remainder="passthrough",
)

In [20]:
# Full estimator: the preprocessing step followed by a 100-tree random forest
# regressor with a fixed random_state.
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

In [21]:
# Fit the pipeline on the selected features and target, then persist it.
# Pipeline.fit returns the fitted pipeline itself, so it can be handed
# straight to joblib.dump.
joblib.dump(model.fit(X, y), '../salary_model.pkl')
print("Model trained and saved!")


Model trained and saved!
