### Predicting salary based on 3 independent variables using Linear Regression with multiple variables

In [24]:
# Installing the library to convert words to numbers
pip install word2number

Note: you may need to restart the kernel to use updated packages.


In [25]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
# Loading the hiring/salary dataset.
# The original absolute path only exists on the author's machine; it is still
# tried first, and when it is missing the CSV is looked up in the current
# working directory so the notebook can run elsewhere.
_author_csv = Path(r"C:\Users\prave\OneDrive\Desktop\NPTEL\Datasets\hiring_salary.csv")
DATA_PATH = _author_csv if _author_csv.exists() else Path("hiring_salary.csv")
data = pd.read_csv(DATA_PATH)

In [27]:
# Exploring the data: display the raw frame exactly as loaded from the CSV
# (the dataset is only a few rows, so the whole frame is shown).
data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [28]:
# (rows, columns) of the raw data
data.shape

(8, 4)

In [29]:
# Renaming the columns to plain snake_case names (no parentheses or "$")
column_renames = {
    "test_score(out of 10)": "test_score_out_of_10",
    "interview_score(out of 10)": "interview_score_out_of_10",
    "salary($)": "salary",
}
data = data.rename(columns=column_renames)

In [30]:
# Checking that the columns now carry the new names
data

Unnamed: 0,experience,test_score_out_of_10,interview_score_out_of_10,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [31]:
# Filling the NaN values: a missing experience is recorded as the word "zero",
# and a missing test score is replaced by the mean of the observed test scores.
mean_test_score = data["test_score_out_of_10"].mean()
data = data.fillna(
    {
        "experience": "zero",
        "test_score_out_of_10": mean_test_score,
    }
)

In [32]:
# Checking that no NaN values remain after the fill
data

Unnamed: 0,experience,test_score_out_of_10,interview_score_out_of_10,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [33]:
# Binding the word-to-number converter to a short name.
# `x` refers to the callable w2n.word_to_num itself; nothing is converted here.
from word2number import w2n
x = w2n.word_to_num

In [34]:
# Converting every entry of the experience column from a number word to a number
data_corrected = data["experience"].map(x)

In [37]:
# Checking the converted values (wrapped in a DataFrame for tabular display)
pd.DataFrame(data_corrected)

Unnamed: 0,experience
0,0
1,0
2,5
3,2
4,7
5,3
6,10
7,11


In [38]:
# Adding the numeric experience as a new column next to the original word column
data = data.assign(Experience_in_numbers=data_corrected)

In [39]:
# Checking that Experience_in_numbers lines up with the experience words
data

Unnamed: 0,experience,test_score_out_of_10,interview_score_out_of_10,salary,Experience_in_numbers
0,zero,8.0,9,50000,0
1,zero,8.0,6,45000,0
2,five,6.0,7,60000,5
3,two,10.0,10,65000,2
4,seven,9.0,6,70000,7
5,three,7.0,10,62000,3
6,ten,7.857143,7,72000,10
7,eleven,7.0,8,80000,11


In [41]:
# Dropping the word-valued experience column, now replaced by Experience_in_numbers.
# A new frame is assigned instead of mutating in place, and errors="ignore"
# makes the cell safe to re-run: the previous in-place drop raised a KeyError
# on a second execution because the column was already gone.
data = data.drop(columns="experience", errors="ignore")

In [42]:
# Checking that the experience column is gone
data

Unnamed: 0,test_score_out_of_10,interview_score_out_of_10,salary,Experience_in_numbers
0,8.0,9,50000,0
1,8.0,6,45000,0
2,6.0,7,60000,5
3,10.0,10,65000,2
4,9.0,6,70000,7
5,7.0,10,62000,3
6,7.857143,7,72000,10
7,7.0,8,80000,11


In [45]:
# Inspecting the column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   test_score_out_of_10       8 non-null      float64
 1   interview_score_out_of_10  8 non-null      int64  
 2   salary                     8 non-null      int64  
 3   Experience_in_numbers      8 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 384.0 bytes


In [46]:
# Listing the column names in their current order
data.columns

Index(['test_score_out_of_10', 'interview_score_out_of_10', 'salary',
       'Experience_in_numbers'],
      dtype='object')

In [47]:
# Reordering columns: the three predictor columns first, salary last
column_order = [
    "Experience_in_numbers",
    "test_score_out_of_10",
    "interview_score_out_of_10",
    "salary",
]
cleaned_data = data.loc[:, column_order]

In [48]:
# Checking the reordered frame that is used for fitting
cleaned_data

Unnamed: 0,Experience_in_numbers,test_score_out_of_10,interview_score_out_of_10,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.857143,7,72000
7,11,7.0,8,80000


In [53]:
# Importing scikit learn
from sklearn import linear_model

In [54]:
# Creating a linear regression estimator (not yet fitted)
regression_object = linear_model.LinearRegression()

In [55]:
# Fitting the model: the three predictor columns are the inputs, salary is the target
feature_columns = [
    "Experience_in_numbers",
    "test_score_out_of_10",
    "interview_score_out_of_10",
]
features = cleaned_data[feature_columns]
target = cleaned_data["salary"]
regression_object.fit(features, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
# Fitted coefficients, one per feature, in the column order passed to fit
# (experience, test score, interview score)
regression_object.coef_

array([2827.63404314, 1912.93803053, 2196.9753141 ])

In [61]:
# Fitted intercept (the constant term of the linear model)
regression_object.intercept_

17237.3303137272

In [63]:
# Predicting the salary for a candidate with 2 years of experience who scored
# 9 in the test and 6 in the interview.
# The candidate is passed as a one-row DataFrame whose columns carry the same
# names, in the same order, as the columns the model was fitted on. This ties
# each value to its feature explicitly and avoids the "X does not have valid
# feature names" warning that newer scikit-learn versions emit for a bare list.
candidate = pd.DataFrame(
    [[2, 9, 6]],
    columns=[
        "Experience_in_numbers",
        "test_score_out_of_10",
        "interview_score_out_of_10",
    ],
)
d = regression_object.predict(candidate)

In [66]:
# predict returns one value per input row; take the single prediction
d[0]

53290.892559447646