Salary Prediction Project

1) Jupyter Notebook: model_training

In [29]:
#Model_training 
#import and setup
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

print('Libraries loaded')

#load dataset
df= pd.read_csv('employee_salary_dataset.csv')
df.head()

print(df.shape)
print(df.dtypes)

#Define target and feature for regression
TARGET='Monthly_Salary'
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Identity categorical and numeric
cat_cols = [c for c in X.columns if X[c].dtypes == 'object']
num_cols = [c for c in X.columns if X[c].dtypes != 'object']
print('cat:' , cat_cols)
print('num:' , num_cols)

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output= False), cat_cols)
    ]
)

# fit_tranform on train
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

#scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pre)
X_test_scaled = scaler.transform(X_test_pre)


#MLflow experiment

mlflow.set_experiment('salary_regression_experiment')

# Train models
models = {
    'LinearRegression:' : LinearRegression(),
    'SVR': SVR(kernel='rbf', C=1.0),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42)
}

results={}

for name, model in models.items():
    with mlflow.start_run(run_name=name): 
        if name == 'RandomForest':
            model.fit(X_train_pre, y_train)
            preds = model.predict(X_test_pre)
            mlflow.log_param('n_estimators', 100)
        else :
            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
            if name == 'SVR':
                mlflow.log_param('kernel','rbf')
                mlflow.log_param('C',1.0)

        mse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        mlflow.log_metric('MSE', float(mse))
        mlflow.log_metric('R2', float(r2))

        mlflow.sklearn.log_model(model, name='model')

        results[name]={'model' : model, 'MSE' : mse, 'R2' : r2}
        print(name , 'MSE:', mse, 'R2:', r2)

# Compare results table
import pandas as pd
res_df = pd.DataFrame([{'model': K, 'MSE': V['MSE'], 'R2': V['R2']} for K, V in results.items()])
res_df

#Pick best model by R2
best_name = res_df.sort_values('R2', ascending=False).iloc[0]['model']
best_model = results[best_name]['model']
print('Best Model:', best_name)

# Retrain best model on traininf data 
if best_name =='RandomForest' :
    best_model.fit(X_train_pre, y_train)
else: 
    best_model.fit(X_train_scaled, y_train)

#Save
joblib.dump(best_model, 'best_salary_model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')
joblib.dump(scaler, 'scaler.joblib')
print('Save best model and preprocessor')

Libraries loaded
(50, 9)
EmployeeID           int64
Name                object
Department          object
Experience_Years     int64
Education_Level     object
Age                  int64
Gender              object
City                object
Monthly_Salary       int64
dtype: object
cat: ['Name', 'Department', 'Education_Level', 'Gender', 'City']
num: ['EmployeeID', 'Experience_Years', 'Age']




LinearRegression: MSE: 1135660145.5786853 R2: 0.03679452388401583




SVR MSE: 1325679720.5658631 R2: -0.12436979618959065




RandomForest MSE: 1309721994.1423001 R2: -0.11083531623326137
Best Model LinearRegression:
Save best model and preprocessor
