<a href="https://colab.research.google.com/github/RAMJI123-ai/ML_projects/blob/main/predictive_model(pf%2Canual%2Creferal_bonus).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================================================
# 1. IMPORTS
# =========================================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

np.random.seed(42)  # seed the global NumPy RNG so every draw below is reproducible

# =========================================================
# 2. CREATE SYNTHETIC DATASET  (Replace this with your real data)
# =========================================================
# NOTE: every draw below consumes the global NumPy RNG seeded above, so the
# generated values depend on the order of these statements. Reordering,
# adding or removing a draw changes all values generated after it.
N = 5000  # number of synthetic rows

# Normal(mean=700000, sd=250000), clipped to the range [150000, 3000000].
base_salary = np.random.normal(700000, 250000, N).clip(150000, 3000000)
# Poisson-distributed counts with mean 5 (0 is possible).
years_experience = np.random.poisson(5, N)
# Ratings 1-5 drawn with unequal weights; 3 is the most likely value (p=0.40).
performance_rating = np.random.choice([1,2,3,4,5], N, p=[0.05,0.15,0.40,0.30,0.10])
# The label columns below are sampled uniformly from their listed values.
department = np.random.choice(['Engineering','Sales','HR','Operations','Finance','Product'], N)
city_tier = np.random.choice(['Tier-1','Tier-2','Tier-3'], N)
# randint excludes the upper bound, so values are 1..119.
tenure_months = np.random.randint(1, 120, N)
# Poisson with mean 0.4, so most rows have 0 referrals.
num_referrals = np.random.poisson(0.4, N)
# Between 2% and 15% of the row's base salary.
past_bonus = base_salary * np.random.uniform(0.02, 0.15, N)
company_size = np.random.choice(['Startup','SME','Large'], N)
role_level = np.random.choice(['IC','Senior IC','Manager','Senior Manager','Director'], N)
# randint excludes the upper bound, so values are 22..54.
age = np.random.randint(22, 55, N)

# =========================================================
# 3. GENERATE TARGET VARIABLES
# =========================================================

# PF: 12% of base salary, scaled by company size (x1.05 for 'Large',
# x0.95 for 'Startup', unchanged otherwise), plus Gaussian noise (sd 3000).
pf = base_salary * 0.12
pf *= np.where(company_size=='Large', 1.05, np.where(company_size=='Startup', 0.95, 1.0))
pf += np.random.normal(0, 3000, N)

# Annual Bonus: percentage grows linearly with the rating
# (0.03 * rating + 0.05, i.e. 8% at rating 1 up to 20% at rating 5),
# boosted by 20% for 'Large' companies, plus Gaussian noise (sd 20000).
# The noise is not clipped, so small or negative values are possible.
bonus_pct = (performance_rating * 0.03) + 0.05
annual_bonus = base_salary * bonus_pct
annual_bonus *= np.where(company_size=='Large', 1.2, 1.0)
annual_bonus += np.random.normal(0, 20000, N)

# Referral Bonus: per-referral base amount depends on company size
# (20000 for 'Large', 15000 for 'Startup', 18000 otherwise), multiplied by the
# number of referrals and a random factor in [0.05, 0.20).
avg_payout = np.where(company_size=='Large', 20000,
                np.where(company_size=='Startup', 15000, 18000))
referral_bonus = num_referrals * avg_payout * np.random.uniform(0.05, 0.20, N)
# Gaussian noise (sd 1500) is added to every row, including rows with zero
# referrals, so those rows are pure noise and can be negative.
referral_bonus += np.random.normal(0, 1500, N)

# =========================================================
# 4. BUILD DATAFRAME
# =========================================================
# Gather the generated features and the three targets into a single table.
# Keyword order determines the column order of the resulting frame.
df = pd.DataFrame(dict(
    base_salary=base_salary,
    years_experience=years_experience,
    performance_rating=performance_rating,
    department=department,
    city_tier=city_tier,
    tenure_months=tenure_months,
    num_referrals=num_referrals,
    past_bonus=past_bonus,
    company_size=company_size,
    role_level=role_level,
    age=age,
    pf=pf,
    annual_bonus=annual_bonus,
    referral_bonus=referral_bonus,
))

# Show the first rows as a quick sanity check.
print(df.head())

# =========================================================
# 5. PREPROCESSING
# =========================================================
# Columns passed through StandardScaler.
numeric_features = [
    'base_salary',
    'years_experience',
    'tenure_months',
    'num_referrals',
    'past_bonus',
    'age',
]

# Columns that are one-hot encoded. 'performance_rating' is numeric in the
# data but is deliberately listed here, so it is encoded as a category.
categorical_features = [
    'performance_rating',
    'department',
    'city_tier',
    'company_size',
    'role_level',
]

numeric_transformer = Pipeline([('scaler', StandardScaler())])

# handle_unknown='ignore': categories unseen during fit do not raise at
# prediction time.
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# =========================================================
# 6. TRAINING FUNCTION
# =========================================================
def train_model(target_column, model_filename):
    """Train, evaluate and save a random-forest regressor for one target.

    Reads the module-level ``df``, ``numeric_features``,
    ``categorical_features`` and ``preprocessor``. Holds out 20% of the rows
    (fixed ``random_state``), fits a preprocessing + RandomForestRegressor
    pipeline on the rest, prints RMSE and R² on the hold-out, and dumps the
    fitted pipeline to ``model_filename`` with joblib.

    Args:
        target_column: Name of the column in ``df`` to predict.
        model_filename: Path the fitted pipeline is written to.

    Returns:
        None. Results are printed and the model is written to disk.
    """
    print(f"\nTraining model for: {target_column}")

    X = df[numeric_features + categorical_features]
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('rf', RandomForestRegressor(
            n_estimators=200,
            random_state=42,
            n_jobs=-1
        ))
    ])

    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so relying on it raises TypeError on current releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)

    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")

    joblib.dump(model, model_filename)
    print(f"Model saved as: {model_filename}\n")

# =========================================================
# 7. TRAIN ALL MODELS
# =========================================================
# One model per target, each saved to its own file.
for _target, _filename in (
    ('pf', 'model_pf.joblib'),
    ('annual_bonus', 'model_annual_bonus.joblib'),
    ('referral_bonus', 'model_referral_bonus.joblib'),
):
    train_model(_target, _filename)

# =========================================================
# 8. EXAMPLE PREDICTION
# =========================================================
# A single hand-written row with every feature column the pipeline expects.
example = pd.DataFrame([dict(
    base_salary=900000,
    years_experience=6,
    performance_rating=4,
    department='Engineering',
    city_tier='Tier-1',
    tenure_months=50,
    num_referrals=2,
    past_bonus=60000,
    company_size='Large',
    role_level='Senior IC',
    age=30,
)])

# Reload the saved annual-bonus pipeline from disk and score the row.
loaded_model = joblib.load("model_annual_bonus.joblib")
prediction = loaded_model.predict(example)[0]

print("\nPredicted Annual Bonus:", prediction)


    base_salary  years_experience  performance_rating   department city_tier  \
0  8.241785e+05                 3                   4  Engineering    Tier-1   
1  6.654339e+05                 5                   4      Product    Tier-2   
2  8.619221e+05                 4                   1      Finance    Tier-2   
3  1.080757e+06                 5                   4        Sales    Tier-2   
4  6.414617e+05                 5                   5           HR    Tier-2   

   tenure_months  num_referrals     past_bonus company_size      role_level  \
0             46              0   50338.129591      Startup  Senior Manager   
1             27              1   48237.155221        Large        Director   
2            115              0   36775.185811        Large       Senior IC   
3            114              0  141759.082543        Large       Senior IC   
4             45              3   15629.518750          SME        Director   

   age             pf   annual_bonus  referr

KeyboardInterrupt: 