In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv(r"C:\Users\SAYAN GHOSH\Downloads\ML Datasets\Refined Employers Data.csv")
df.head()

Unnamed: 0,Industry,Job_Title,Experience_Level,Educational_Level,Location,Salary
0,Finance,Executive,Senior-Level,Master,India,3600000
1,Education,Clerk,Entry-Level,Master,America,4200000
2,Finance,Analyst,Mid-Level,Bachelor,England,6500000
3,Technology,Manager,Mid-Level,Bachelor,England,8000000
4,Healthcare,Executive,Senior-Level,Master,America,12000000


In [3]:
np.random.seed(37)
df['Salary'] = (df['Salary'] * np.random.uniform(0.95, 1.05, size=len(df))).round().astype(int)
df.duplicated().sum()

np.int64(3)

In [4]:
df.dropna(inplace=True)

In [5]:
ip = df.drop(columns = ['Salary'])
op = df['Salary']

In [6]:
ip_train, ip_test, op_train, op_test = train_test_split(ip, op, test_size = 0.2, random_state = 37)
df_columns = ['Industry', 'Job_Title', 'Experience_Level', 'Educational_Level', 'Location']

In [7]:
ip_train_enc = pd.get_dummies(ip_train, columns = df_columns, drop_first = True)
ip_test_enc  = pd.get_dummies(ip_test,  columns = df_columns, drop_first = True)
ip_test_enc = ip_test_enc.reindex(columns = ip_train_enc.columns, fill_value = 0)

In [8]:
def encode(sample_df: pd.DataFrame) -> pd.DataFrame:
    sample = sample_df.copy()
    sample_enc = pd.get_dummies(sample, columns = df_columns, drop_first = True)
    sample_enc = sample_enc.reindex(columns = ip_train_enc.columns, fill_value = 0)
    return sample_enc

In [9]:
rf = RandomForestRegressor(
    n_estimators = 450,
    max_depth = None,
    random_state = 37,
    n_jobs = -1 )

rf.fit(ip_train_enc, op_train)

0,1,2
,n_estimators,450
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
train_prediction = rf.predict(ip_train_enc)
test_prediction = rf.predict(ip_test_enc)

In [11]:
Train_MSE = np.sqrt(mean_squared_error(op_train, train_prediction))
Test_MSE = np.sqrt(mean_squared_error(op_test, test_prediction))
Train_R2 = np.sqrt(r2_score(op_train, train_prediction))
Test_R2 = np.sqrt(r2_score(op_test, test_prediction))

In [12]:
scores = pd.DataFrame(['Random Forest Regressor', Train_MSE, Train_R2, Test_MSE, Test_R2]).transpose()
scores.columns = ['Method', 'Training MSE', 'Training RS', 'Test MSE', 'Test RS']
scores

Unnamed: 0,Method,Training MSE,Training RS,Test MSE,Test RS
0,Random Forest Regressor,244619.315265,0.999149,250285.457784,0.999153


In [13]:
sample_data = pd.DataFrame({
    'Industry': [
        'Technology', 'Healthcare', 'Finance',
        'Transportation', 'Manufacturing', 'Technology',
        'Education', 'Retail', 'Transportation'],
    'Job_Title': [
        'Engineer', 'Staff', 'Analyst',
        'Manager', 'HR', 'Engineer',
        'Teacher', 'Intern', 'Engineer'],
    'Experience_Level': [
        'Entry-Level', 'Mid-Level', 'Entry-Level',
        'Senior-Level', 'Mid-Level', 'Mid-Level',
        'Senior-Level', 'Entry-Level', 'Senior-Level'],
    'Educational_Level': [
        'Bachelor', 'Master', 'Bachelor',
        'Master', 'PhD', 'Master',
        'PhD', 'Bachelor', 'PhD'],
    'Location': [
        'India', 'America', 'Dubai',
        'England', 'India', 'Dubai',
        'India', 'Australia', 'America']})

In [14]:
encoded_sample = encode(sample_data)
result = rf.predict(encoded_sample)

sample_data['Predicted_Salary'] = np.round(result)
sample_data

Unnamed: 0,Industry,Job_Title,Experience_Level,Educational_Level,Location,Predicted_Salary
0,Technology,Engineer,Entry-Level,Bachelor,India,502698.0
1,Healthcare,Staff,Mid-Level,Master,America,15469680.0
2,Finance,Analyst,Entry-Level,Bachelor,Dubai,3003821.0
3,Transportation,Manager,Senior-Level,Master,England,12501266.0
4,Manufacturing,HR,Mid-Level,PhD,India,802192.0
5,Technology,Engineer,Mid-Level,Master,Dubai,4488080.0
6,Education,Teacher,Senior-Level,PhD,India,798198.0
7,Retail,Intern,Entry-Level,Bachelor,Australia,115592.0
8,Transportation,Engineer,Senior-Level,PhD,America,11947084.0


In [15]:
pickle.dump(rf, open('Model.pkl', 'wb'))
pickle.dump(ip_train_enc.columns.tolist(), open("Train_Cols.pkl", "wb"))