# Predicting CTC of new employees

In [138]:
#importing libraries
import pandas as pd
import numpy as np

In [139]:
# Load the hiring records from 'Hires.csv' (path is relative to the working
# directory) into the `Hires` DataFrame used throughout the notebook.
Hires = pd.read_csv('Hires.csv')

## Data Exploration

In [140]:
# Preview the first rows to check that the columns parsed as expected.
Hires.head()

Unnamed: 0,S.No.,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,1,Tier 1,Manager,Non-Metro,55523,3,66,19,71406.58
1,2,Tier 2,Executive,Metro,57081,1,84,18,68005.87
2,3,Tier 2,Executive,Metro,60347,2,52,28,76764.02
3,4,Tier 3,Executive,Metro,49010,2,81,33,82092.39
4,5,Tier 3,Executive,Metro,57879,4,74,32,73878.1


In [141]:
# (rows, columns) of the raw data.
Hires.shape

(1338, 9)

In [142]:
# Frequency of each college tier.
Hires['College'].value_counts()

Tier 1    649
Tier 2    364
Tier 3    325
Name: College, dtype: int64

In [143]:
# Frequency of each role.
Hires['Role'].value_counts()

Executive    1064
Manager       274
Name: Role, dtype: int64

In [144]:
# Frequency of each city type (bracket access is needed because the column
# name contains a space).
Hires['City type'].value_counts()

Metro        676
Non-Metro    662
Name: City type, dtype: int64

In [145]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
Hires.describe()

Unnamed: 0,S.No.,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,669.5,55581.762332,2.525411,59.890882,39.207025,75459.339036
std,386.391641,6685.600553,1.123502,14.894696,14.04996,12551.280147
min,1.0,36990.0,1.0,35.0,18.0,53020.32
25%,335.25,50547.0,2.0,47.0,27.0,66905.0675
50%,669.5,55293.5,3.0,60.0,39.0,73112.64
75%,1003.75,60150.5,4.0,73.0,51.0,80747.175
max,1338.0,77911.0,4.0,85.0,64.0,123416.99


In [146]:
# Number of missing values in each column.
Hires.isna().sum()

S.No.                   0
College                 0
Role                    0
City type               0
Previous CTC            0
Previous job changes    0
Graduation marks        0
Exp (Months)            0
CTC                     0
dtype: int64

In [147]:
# 'S.No.' is a running serial number (1..1338 in the summary above) and is
# not a predictor, so remove it.
Hires = Hires.drop(columns=['S.No.'])

In [148]:
# Display the frame to confirm that 'S.No.' has been removed.
Hires

Unnamed: 0,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,Tier 1,Manager,Non-Metro,55523,3,66,19,71406.58
1,Tier 2,Executive,Metro,57081,1,84,18,68005.87
2,Tier 2,Executive,Metro,60347,2,52,28,76764.02
3,Tier 3,Executive,Metro,49010,2,81,33,82092.39
4,Tier 3,Executive,Metro,57879,4,74,32,73878.10
...,...,...,...,...,...,...,...,...
1333,Tier 3,Executive,Metro,59661,4,68,50,69712.40
1334,Tier 1,Executive,Non-Metro,53714,1,67,18,69298.75
1335,Tier 2,Executive,Non-Metro,61957,1,47,18,66397.77
1336,Tier 1,Executive,Non-Metro,53203,3,69,21,64044.38


In [149]:
# One-hot encode the three categorical columns, keeping one indicator fewer
# than the number of levels so the omitted level acts as the baseline:
#   College   -> keep the first two indicator columns
#   Role      -> drop the first indicator column
#   City type -> keep only the first indicator column
CollegeD = pd.get_dummies(Hires['College'], prefix='colleges').iloc[:, :2]
RoleD = pd.get_dummies(Hires['Role'], prefix='roles').iloc[:, 1:]
CityT = pd.get_dummies(Hires['City type'], prefix='citytype').iloc[:, :1]

In [150]:
# Append the indicator columns to the right of the existing frame.
Hires = pd.concat(
    [Hires, CollegeD, RoleD, CityT],
    axis='columns',
)

In [151]:
# Display the frame to check the new indicator columns against the originals.
Hires

Unnamed: 0,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC,colleges_Tier 1,colleges_Tier 2,roles_Manager,citytype_Metro
0,Tier 1,Manager,Non-Metro,55523,3,66,19,71406.58,1,0,1,0
1,Tier 2,Executive,Metro,57081,1,84,18,68005.87,0,1,0,1
2,Tier 2,Executive,Metro,60347,2,52,28,76764.02,0,1,0,1
3,Tier 3,Executive,Metro,49010,2,81,33,82092.39,0,0,0,1
4,Tier 3,Executive,Metro,57879,4,74,32,73878.10,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,Tier 3,Executive,Metro,59661,4,68,50,69712.40,0,0,0,1
1334,Tier 1,Executive,Non-Metro,53714,1,67,18,69298.75,1,0,0,0
1335,Tier 2,Executive,Non-Metro,61957,1,47,18,66397.77,0,1,0,0
1336,Tier 1,Executive,Non-Metro,53203,3,69,21,64044.38,1,0,0,0


In [152]:
# The original text columns are now represented by the indicator columns.
Hires = Hires.drop(columns=['College', 'Role', 'City type'])

In [153]:
# Preview the fully numeric frame.
Hires.head()

Unnamed: 0,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC,colleges_Tier 1,colleges_Tier 2,roles_Manager,citytype_Metro
0,55523,3,66,19,71406.58,1,0,1,0
1,57081,1,84,18,68005.87,0,1,0,1
2,60347,2,52,28,76764.02,0,1,0,1
3,49010,2,81,33,82092.39,0,0,0,1
4,57879,4,74,32,73878.1,0,0,0,1


In [154]:
# Put the encoded indicators first, then the numeric features, target last.
indicator_cols = ['colleges_Tier 1', 'colleges_Tier 2', 'roles_Manager', 'citytype_Metro']
numeric_cols = ['Previous CTC', 'Previous job changes', 'Graduation marks', 'Exp (Months)']
columnsTitles = indicator_cols + numeric_cols + ['CTC']

Hires = Hires.reindex(columns=columnsTitles)

In [155]:
# Cast every column to float so indicators and numeric features share one dtype.
Hires = Hires.astype(float)

In [156]:
# Confirm that all columns are now float64.
Hires.dtypes

colleges_Tier 1         float64
colleges_Tier 2         float64
roles_Manager           float64
citytype_Metro          float64
Previous CTC            float64
Previous job changes    float64
Graduation marks        float64
Exp (Months)            float64
CTC                     float64
dtype: object

In [157]:
# The target is the offered CTC; every remaining column is a predictor.
y = Hires['CTC']
X = Hires.drop(columns=['CTC'])

## Applying Linear Regression Model

In [188]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [159]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression

ols = LinearRegression()
reg = ols.fit(train_X, train_y)  # fit() returns the estimator itself

In [160]:
# Score of the linear model on the validation split (for a regressor,
# score() is the R2 value; it matches the r2_score output further down).
reg.score(test_X, test_y)

0.6372176223093018

In [161]:
# Same score on the training split, for comparison with the validation score.
reg.score(train_X, train_y) 

0.5915885948488417

In [162]:
# Linear-model predictions for the validation split.
test_data_prediction = reg.predict(test_X)

In [163]:
# Inspect the predicted CTC values.
test_data_prediction

array([ 71347.32613176,  73480.76949211,  79633.53461659,  63874.7858195 ,
        71449.44149352,  70619.51186143,  64080.05645792,  66253.9738497 ,
        70027.64089918,  68910.15193052,  71500.5565045 ,  64965.72913258,
        82739.5365174 ,  62345.17006338,  76232.43897453,  73527.9562203 ,
        72413.78877105,  67186.48536473,  85148.66537248,  67775.32876438,
        69853.46057956,  67430.05262246,  94402.12467304,  89969.42662847,
        88072.80894425,  74576.2418352 ,  63903.01810646,  77836.00379858,
        72203.01445207,  59399.85876326,  69263.46519975,  63103.43228617,
        68174.51507882,  65368.24833646,  68234.74760572,  64853.2831886 ,
        87833.09771776,  77033.85179737,  88947.731099  ,  76358.19380403,
        56969.52106914,  92562.56906471,  65428.35990372,  66666.10141943,
        70109.25853791,  70935.71789937,  72595.62698124,  74277.34314119,
        75136.38239588,  81913.36564817,  73328.71257266,  68355.0354054 ,
        81446.40819232,  

In [164]:
# R2 of the linear model on the validation split, computed from the
# stored predictions.
from sklearn import metrics

score = metrics.r2_score(y_true=test_y, y_pred=test_data_prediction)
print("R2 score : ", score)

R2 score :  0.6372176223093018


In [165]:
# Load the separate scoring file (path is relative to the working directory).
test = pd.read_csv('testhires.csv')

In [166]:
# Preview the scoring file.
# NOTE(review): the previewed rows match the first rows of Hires.csv — confirm
# whether this file overlaps the training data before trusting metrics on it.
test.head()

Unnamed: 0,College,Role,City type,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC,Predicted CTC
0,Tier 1,Manager,Non-Metro,1,0,1,0,55523,3,66,19,71406.57653,
1,Tier 2,Executive,Metro,0,1,0,1,57081,1,84,18,68005.87063,
2,Tier 2,Executive,Metro,0,1,0,1,60347,2,52,28,76764.02028,
3,Tier 3,Executive,Metro,0,0,0,1,49010,2,81,33,82092.38688,
4,Tier 3,Executive,Metro,0,0,0,1,57879,4,74,32,73878.09773,


In [167]:
# Keep the known CTC values aside before the column is dropped from `test`.
y1 = test['Actual CTC']

In [168]:
# Remove the text columns, the empty prediction placeholder and the target,
# leaving only the eight model inputs.
non_feature_columns = ['College', 'Role', 'City type', 'Predicted CTC', 'Actual CTC']
test.drop(columns=non_feature_columns, inplace=True)

In [169]:
# Column dtypes before casting.
test.dtypes

College_T1              int64
College_T2              int64
Role_Manager            int64
City_Metro              int64
previous CTC            int64
previous job changes    int64
Graduation marks        int64
Exp                     int64
dtype: object

In [170]:
# Cast to float, as was done for the training frame.
test = test.astype(float)

In [171]:
# Confirm that all columns are now float64.
test.dtypes

College_T1              float64
College_T2              float64
Role_Manager            float64
City_Metro              float64
previous CTC            float64
previous job changes    float64
Graduation marks        float64
Exp                     float64
dtype: object

In [172]:
# Preview the feature-only scoring frame.
test.head()

Unnamed: 0,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp
0,1.0,0.0,1.0,0.0,55523.0,3.0,66.0,19.0
1,0.0,1.0,0.0,1.0,57081.0,1.0,84.0,18.0
2,0.0,1.0,0.0,1.0,60347.0,2.0,52.0,28.0
3,0.0,0.0,0.0,1.0,49010.0,2.0,81.0,33.0
4,0.0,0.0,0.0,1.0,57879.0,4.0,74.0,32.0


In [173]:
# Score the hold-out file with the fitted linear model.
#
# testhires.csv labels its columns differently from the training frame
# (e.g. 'College_T1' vs 'colleges_Tier 1'), and recent scikit-learn releases
# raise a ValueError when the feature names at predict time differ from those
# seen at fit time. The columns are in the same order as train_X, so they are
# relabelled positionally on a temporary frame; `test` itself is not modified.
# Prediction columns that other cells add to `test` are dropped first so this
# cell can be re-run safely.
test_features = test.drop(columns=['Predicted', 'Predicted1'], errors='ignore')
test_features.columns = list(train_X.columns)

predict_test = reg.predict(test_features)
print('Target on test data', predict_test)

Target on test data [86017.14522626 67039.47075481 70905.21919585 ... 65151.79406742
 66666.10141943 89435.01131508]


In [174]:
# NOTE(review): this binds a second name to the SAME DataFrame object; it is
# not a copy. Columns added through either name show up on both, and the
# random-forest metric cells read `test.Predicted1` after it is written via
# `test1`, so they currently depend on this aliasing.
test1 = test

In [175]:
# Preview through the second name (same underlying frame as `test`).
test1.head()

Unnamed: 0,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp
0,1.0,0.0,1.0,0.0,55523.0,3.0,66.0,19.0
1,0.0,1.0,0.0,1.0,57081.0,1.0,84.0,18.0
2,0.0,1.0,0.0,1.0,60347.0,2.0,52.0,28.0
3,0.0,0.0,0.0,1.0,49010.0,2.0,81.0,33.0
4,0.0,0.0,0.0,1.0,57879.0,4.0,74.0,32.0


In [104]:
# Store the linear model's predictions as a new column on the scoring frame.
test['Predicted']=predict_test.tolist()

In [105]:
# Inspect the stored linear-model predictions.
test.Predicted

0       86017.145226
1       67039.470755
2       70905.219196
3       66793.030601
4       70452.864882
            ...     
1333    75543.522619
1334    66332.825128
1335    65151.794067
1336    66666.101419
1337    89435.011315
Name: Predicted, Length: 1338, dtype: float64

In [106]:
# Actual CTC values, for side-by-side comparison with the predictions.
y1

0       71406.57653
1       68005.87063
2       76764.02028
3       82092.38688
4       73878.09773
           ...     
1333    69712.40366
1334    69298.75010
1335    66397.77069
1336    64044.38295
1337    83346.06096
Name: Actual CTC, Length: 1338, dtype: float64

In [107]:
# R2 of the linear model on the scoring file.
from sklearn import metrics

error_score = metrics.r2_score(y_true=y1, y_pred=test['Predicted'])
print("R2 score : ", error_score)

R2 score :  0.606556073591544


In [108]:
# Linear-model predictions again (same column as displayed earlier).
test.Predicted

0       86017.145226
1       67039.470755
2       70905.219196
3       66793.030601
4       70452.864882
            ...     
1333    75543.522619
1334    66332.825128
1335    65151.794067
1336    66666.101419
1337    89435.011315
Name: Predicted, Length: 1338, dtype: float64

In [123]:
# Mean squared error of the linear model on the scoring file; the value is
# kept in `error_score` so the next cell can take its square root.
error_score = metrics.mean_squared_error(y_true=y1, y_pred=test['Predicted'])
print(error_score)

61934722.170763135


In [124]:
# Root mean squared error: square root of the MSE held in `error_score`,
# which puts the error back in the same units as CTC.
np.sqrt(error_score)

7869.86163606217

## Random Forest Regressor

In [176]:
# using Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

In [177]:
# Random forest with 100 trees. The forest is built with random bootstrap
# samples, so the seed is fixed to make the scores below repeatable across
# runs (same seed value as the train/test split).
regressor = RandomForestRegressor(n_estimators=100, random_state=2)

In [178]:
# Train the forest on the same training split used for the linear model.
regressor.fit(train_X, train_y)

RandomForestRegressor()

In [179]:
# Random-forest predictions for the validation split. This rebinds the name
# that previously held the linear model's predictions.
test_data_prediction = regressor.predict(test_X)

In [180]:
# Inspect the random-forest predictions.
test_data_prediction

array([ 71431.1521,  77046.7994,  85378.8489,  66679.6268,  72316.8603,
        68271.6446,  66694.481 ,  69094.9419,  73042.3968,  68037.7882,
        72074.5304,  61966.0494,  76039.9676,  65276.1878,  77516.7108,
        70356.1927,  71869.0082,  64169.2757,  73923.0415,  72503.3442,
        73099.4945,  72244.9224,  96992.0884,  84168.137 ,  98385.8609,
        81915.1319,  62522.3737,  71683.411 ,  75220.4646,  60290.711 ,
        66049.3128,  64532.3198,  66376.6428,  66409.2583,  73530.1534,
        63145.8077,  96221.022 ,  75345.3473,  77272.5344,  73351.9565,
        62407.0863,  89455.7928,  73856.3457,  69146.1548,  69865.0783,
        73315.5796,  72422.1144,  71830.0532,  71941.443 ,  79260.5724,
        75153.4265,  72970.5429,  89507.8237,  93661.7774,  71032.1311,
        76745.5953,  74180.3764,  72168.1393,  66421.8214,  96433.2772,
        78916.3292,  72366.9223,  64991.867 ,  96745.8103,  71907.6625,
        80207.5332,  83087.9984,  62926.4427,  74996.4111,  6939

In [181]:
# R2 of the random forest on the validation split.
error_score = metrics.r2_score(y_true=test_y, y_pred=test_data_prediction)
print("R2 : ", error_score)

R2 :  0.6683803584207583


In [182]:
# Score the hold-out file with the random forest.
#
# `test1` is the same object as `test`, so when the notebook is run top to
# bottom it already carries the linear model's 'Predicted' column; passing it
# as-is would hand the forest nine columns instead of the eight it was
# trained on. Prediction columns are therefore dropped first (which also makes
# the cell safe to re-run), and the remaining columns — which are in the same
# order as train_X — are relabelled with the training feature names, because
# recent scikit-learn releases reject mismatched feature names at predict time.
rf_features = test1.drop(columns=['Predicted', 'Predicted1'], errors='ignore')
rf_features.columns = list(train_X.columns)

predict_test = regressor.predict(rf_features)
print('Target on test data', predict_test)

Target on test data [92302.0836 65262.8576 63504.8942 ... 63673.8028 69146.1548 86418.727 ]


In [183]:
# Store the random-forest predictions as a new column on the scoring frame.
test1['Predicted1']=predict_test.tolist()

In [184]:
# Inspect the stored random-forest predictions.
test1.Predicted1

0       92302.0836
1       65262.8576
2       63504.8942
3       75021.2365
4       70842.0334
           ...    
1333    69591.6333
1334    69017.0207
1335    63673.8028
1336    69146.1548
1337    86418.7270
Name: Predicted1, Length: 1338, dtype: float64

In [185]:
# Mean squared error of the random forest on the scoring file.
# 'Predicted1' was written through `test1`, so read it back through the same
# name rather than relying on `test` happening to be the same object.
# The value is kept in `error_score` so the next cell can take its square root.
error_score = metrics.mean_squared_error(y1, test1['Predicted1'])
print(error_score)

23256311.21483684


In [186]:
# Root mean squared error of the random forest (square root of the MSE above).
np.sqrt(error_score)

4822.479778582471

In [187]:
# R2 of the random forest on the scoring file.
# 'Predicted1' was written through `test1`, so read it back through the same
# name rather than relying on `test` happening to be the same object.
# The name `r2_score` is kept because the adjusted-R2 cell reads it.
r2_score = metrics.r2_score(y1, test1['Predicted1'])
print(r2_score)

0.8522629297841303


In [189]:
# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number
# of scored rows and p the number of predictors. Both are derived from the
# data instead of being hard-coded (previously 1338 and 8), so the value
# stays correct if the scoring file or the feature set changes.
n_obs = len(y1)
n_features = train_X.shape[1]
Adj_r2 = 1 - (1 - r2_score) * (n_obs - 1) / (n_obs - n_features - 1)

In [190]:
# Display the adjusted R2 of the random forest on the scoring file.
Adj_r2

0.8513736170966006