# Importing Libraries

In [182]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# Importing Dataset

In [183]:
df = pd.read_csv("company_hiring_dataset.csv")


# First 10 rows

In [184]:
df.head(10)

Unnamed: 0,CandidateID,Gender,Age,Role,Education,Experience_Years,Location,Expected_Salary
0,EMP1000,Other,30.0,ML Engineer,PhD,,Hyderabad,126725
1,EMP1001,,,ML Engineer,PhD,16.0,Bangalore,155393
2,EMP1002,,,Data Scientist,PhD,16.0,Delhi,151022
3,EMP1003,Other,41.0,ML Engineer,Masters,16.0,Delhi,145401
4,EMP1004,Male,36.0,ML Engineer,,1.0,Bangalore,111850
5,EMP1005,Male,28.0,Data Scientist,Masters,1.0,Hyderabad,108496
6,EMP1006,Other,33.0,ML Engineer,,4.0,Bangalore,107007
7,EMP1007,,29.0,Backend Developer,Masters,0.0,Delhi,94683
8,EMP1008,Other,36.0,Software Engineer,Bachelors,0.0,Chennai,84642
9,EMP1009,Other,24.0,Software Engineer,,18.0,Chennai,123679


# Dropping Rows which are not important

In [185]:
df.drop('CandidateID', axis = 1, inplace =True)


# Checking for Missing Data in our Dataset

In [186]:
df['Role'].isnull().sum()


np.int64(0)

In [187]:
df['Location'].isnull().sum()

np.int64(0)

# Handling Missing Data

In [188]:
df['Age'].fillna(df['Age'].median(), inplace = True)
df['Education'].fillna(df['Education'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode(), inplace = True)
df['Experience_Years'].fillna(df['Experience_Years'].mean(), inplace = True)


# Handling Categorical Data

In [189]:
from sklearn.preprocessing import OrdinalEncoder
edu_order = [['Bachelors', 'Masters', 'PhD']]
df[['Education']] = OrdinalEncoder(categories=edu_order).fit_transform(df[['Education']])

In [190]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Role'] = le.fit_transform(df['Role'])
df['Education'] = le.fit_transform(df['Education'])
df['Location'] = le.fit_transform(df['Location'])


# Preprocessed Data

In [191]:
df.head(10)

Unnamed: 0,Gender,Age,Role,Education,Experience_Years,Location,Expected_Salary
0,2,30.0,4,2,8.244444,3,126725
1,3,33.0,4,2,16.0,0,155393
2,3,33.0,2,2,16.0,2,151022
3,2,41.0,4,1,16.0,2,145401
4,1,36.0,4,1,1.0,0,111850
5,1,28.0,2,1,1.0,3,108496
6,2,33.0,4,1,4.0,0,107007
7,3,29.0,0,1,0.0,2,94683
8,2,36.0,5,0,0.0,1,84642
9,2,24.0,5,1,18.0,1,123679


In [192]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [193]:
print(X)

[[ 2.   30.    4.    2.    8.24  3.  ]
 [ 3.   33.    4.    2.   16.    0.  ]
 [ 3.   33.    2.    2.   16.    2.  ]
 [ 2.   41.    4.    1.   16.    2.  ]
 [ 1.   36.    4.    1.    1.    0.  ]
 [ 1.   28.    2.    1.    1.    3.  ]
 [ 2.   33.    4.    1.    4.    0.  ]
 [ 3.   29.    0.    1.    0.    2.  ]
 [ 2.   36.    5.    0.    0.    1.  ]
 [ 2.   24.    5.    1.   18.    1.  ]
 [ 2.   35.    2.    1.    1.    4.  ]
 [ 2.   38.    1.    1.   11.    0.  ]
 [ 1.   25.    2.    1.    5.    0.  ]
 [ 2.   39.    1.    2.    3.    4.  ]
 [ 0.   29.    5.    1.   10.    4.  ]
 [ 1.   25.    2.    2.   16.    4.  ]
 [ 3.   23.    3.    0.    5.    2.  ]
 [ 0.   27.    0.    2.    4.    0.  ]
 [ 0.   43.    4.    1.   19.    2.  ]
 [ 0.   31.    1.    0.    1.    4.  ]
 [ 1.   25.    3.    0.    5.    0.  ]
 [ 1.   43.    4.    0.   10.    2.  ]
 [ 0.   33.    4.    2.   15.    2.  ]
 [ 0.   33.    2.    1.   15.    4.  ]
 [ 1.   23.    4.    0.    0.    0.  ]
 [ 1.   31.    0.    0.  

In [194]:
print(y)

[126725 155393 151022 145401 111850 108496 107007  94683  84642 123679
 108708 104839 117935  94205 113819 147444  98060 109327 145670  67301
  97606 113170 149199 133588  99331  91190  97980 151475 122102 126184
 114827 126394 116569 122816 123854 124209 114069 120442 120395 115233
  80863 116455  93514  88686 110009 128806 136654  92409  98784 113096]


# Splitting Data into Training Set and Test Set

In [195]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 0)

# Training the Data on Random Forest Regressor

In [210]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 9, max_depth = 10, random_state = 0)
regressor.fit(X_train, y_train)

In [211]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[106174.33 122102.  ]
 [121229.89 104839.  ]
 [ 91485.89 108708.  ]
 [132615.67 116455.  ]
 [145090.78 151022.  ]
 [139702.67 151475.  ]
 [ 99638.   120395.  ]
 [130885.56 126394.  ]
 [141967.   149199.  ]
 [104416.11 111850.  ]]


In [212]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.37448338399441905

#  Training Dataset on Decision Tree Regression

In [199]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

In [200]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[108496. 122102.]
 [114069. 104839.]
 [108496. 108708.]
 [128806. 116455.]
 [147444. 151022.]
 [113170. 151475.]
 [108496. 120395.]
 [126184. 126394.]
 [133588. 149199.]
 [108496. 111850.]]


In [201]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.20638142808628135

# Training Dataset on Polynomial Regression

In [202]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 12)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

In [203]:
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 452016.36  122102.  ]
 [1207711.27  104839.  ]
 [ 360572.02  108708.  ]
 [ 362354.04  116455.  ]
 [  34765.14  151022.  ]
 [ 218818.52  151475.  ]
 [ 615579.31  120395.  ]
 [ 400774.32  126394.  ]
 [  42398.65  149199.  ]
 [ 132887.88  111850.  ]]


In [204]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-620.0250579471866

# Training Dataset on Multiple Linear Regression Model

In [205]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [206]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[104089.44 122102.  ]
 [123239.06 104839.  ]
 [ 90308.41 108708.  ]
 [135178.73 116455.  ]
 [140621.68 151022.  ]
 [145308.28 151475.  ]
 [108539.21 120395.  ]
 [125832.08 126394.  ]
 [142992.52 149199.  ]
 [104922.91 111850.  ]]


In [207]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.4044276610616586