In [1]:
# Author: Priti Gupta
# Date: June 24th, 2023


#importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
#tqdm is a popular library that provides a progress bar for iterating over iterable objects in loops.
from tqdm import tqdm

# loading the training and testing dataset 

train_data = pd.read_csv('training_dataset.csv')

# it is unlikely and can also be checked using boxplot that salary less than 30k is rare. 
#So we get rid of the rows exhibiting such salaries

train_data = train_data[train_data['salary']>30]

# there are no missing values in the training dataset
# it is obvious that columns like 'job id' and 'company id' will not impact the salary.
# Therefore, we drop these two coloumns from our dataset

train_data.drop(['jobId','companyId'],axis=1,inplace=True)

In [3]:
# importing dataset
x = train_data.iloc[:,:-1].values
y = train_data.iloc[:, -1].values

In [4]:
# next we would like to encode the categorical coloumns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1,2,3])], remainder='passthrough')
x = ct.fit_transform(x).toarray()

In [5]:
# splitting between training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [6]:
# scaling of training data before modeling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X_train[:,-2:])

# Replace the original coloumns with scaled data

X_train[:,-2:] = x_scaled

Modeling the dataset

### Linear Regression

In [7]:
# training the model on the training set
from sklearn.linear_model import LinearRegression
regressor_mlr = LinearRegression()
regressor_mlr.fit(X_train, y_train)

In [8]:
# scaling the test data using same transformer and replacing the columns with scaled ones.
X_test_scaled = scaler.transform(X_test[:,-2:])
X_test[:,-2:] = X_test_scaled

In [9]:
#predicting the test results
y_pred = regressor_mlr.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

[[ 99.94  80.  ]
 [162.73 154.  ]
 [111.47  91.  ]
 ...
 [ 93.56  70.  ]
 [104.68 101.  ]
 [ 54.    54.  ]]
0.7394546226526766
Mean Absolute Error: 15.891514363187781


In [10]:
import sys
pd.set_option('display.max_rows',None)
np.set_printoptions(threshold=sys.maxsize)


print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 99.94  80.  ]
 [162.73 154.  ]
 [111.47  91.  ]
 [191.04 164.  ]
 [123.89 132.  ]
 [ 70.57  52.  ]
 [108.99  98.  ]
 [ 87.98  98.  ]
 [ 89.63  94.  ]
 [149.76 144.  ]
 [117.26 100.  ]
 [ 77.48  69.  ]
 [141.8  189.  ]
 [181.24 163.  ]
 [126.93 126.  ]
 [127.65 137.  ]
 [155.92 152.  ]
 [136.84 155.  ]
 [146.18 175.  ]
 [103.28  85.  ]
 [167.18 130.  ]
 [109.6  123.  ]
 [109.48  81.  ]
 [ 84.07  84.  ]
 [ 64.79  69.  ]
 [ 83.22  81.  ]
 [132.36 157.  ]
 [124.37 159.  ]
 [ 64.    65.  ]
 [158.78 143.  ]
 [123.19 128.  ]
 [ 88.11  72.  ]
 [113.31 129.  ]
 [145.57 135.  ]
 [114.83 141.  ]
 [129.13 104.  ]
 [137.37 104.  ]
 [ 93.33 103.  ]
 [127.74 124.  ]
 [ 81.2   66.  ]
 [118.57 128.  ]
 [ 75.78  71.  ]
 [121.63 124.  ]
 [ 84.33  97.  ]
 [123.57 131.  ]
 [147.14 105.  ]
 [ 86.38  83.  ]
 [136.22 160.  ]
 [160.67 174.  ]
 [ 80.09  92.  ]
 [131.69 140.  ]
 [ 68.14  88.  ]
 [106.2   91.  ]
 [125.63 122.  ]
 [187.89 214.  ]
 [ 94.02 114.  ]
 [138.25 115.  ]
 [102.61 129.  ]
 [ 77.99  66. 

### Random Forest

In [None]:
#training the model

from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators = 15, random_state = 0)
regressor_rf.fit(X_train, y_train)

#predicting
y_pred = regressor_rf.predict(X_test)
np.set_printoptions(precision=2)

from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

### Analysis

- Using the R^2 as metric, we confirm that multiple linear regression is performing better.

- If we believe in the accuracy of our trained model, we see that there is contrast in predicted and test salaries. The test (real) salaries are lower than the predicted ones. This shows that the employee is unhappy with the low payscale.

- The employee's living far from the city are not getting a satisfactory salary which is the most probable reason for them leaving the company.

- The trained model can become further better using hyperparameter tuning.