In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Deploy a ML model
import pickle
#pickle is used to save the model which we will create

In [8]:
df = pd.read_csv("Hiring.csv")
df.head()

Unnamed: 0,experience,test_score,Interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [9]:
df.isna().sum()

experience         2
test_score         1
Interview_score    0
salary             0
dtype: int64

In [10]:
#experience
df['experience'].fillna(0, inplace=True)


In [11]:
df.isna().sum()

experience         0
test_score         1
Interview_score    0
salary             0
dtype: int64

In [13]:
#Test_Score
x= df['test_score'].mean()
x

7.857142857142857

In [15]:
df['test_score'].fillna(x, inplace=True)

In [16]:
df.isna().sum()

experience         0
test_score         0
Interview_score    0
salary             0
dtype: int64

In [17]:
df.head()

Unnamed: 0,experience,test_score,Interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [18]:
X = df.iloc[ : , :-1]
X

Unnamed: 0,experience,test_score,Interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
5,three,7.0,10
6,ten,7.857143,7
7,eleven,7.0,8


In [20]:
y =df.iloc[ : ,-1]
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [21]:
#Treat X now (Experience is text column)
X

Unnamed: 0,experience,test_score,Interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
5,three,7.0,10
6,ten,7.857143,7
7,eleven,7.0,8


In [26]:
map_text_to_numbers = {
    'two' : 2,
    'three' : 3,
    'five' : 5,
    'seven' : 7,
    'ten' : 10,
    'eleven' : 11,
    0 : 0
}
map_text_to_numbers

{'two': 2, 'three': 3, 'five': 5, 'seven': 7, 'ten': 10, 'eleven': 11, 0: 0}

In [28]:
X['experience'].map(map_text_to_numbers)

0     0
1     0
2     5
3     2
4     7
5     3
6    10
7    11
Name: experience, dtype: int64

In [29]:
X['experience'] = X['experience'].map(map_text_to_numbers)
X

Unnamed: 0,experience,test_score,Interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [30]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      int64  
 1   test_score       8 non-null      float64
 2   Interview_score  8 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 320.0 bytes


In [31]:
# X is ready
# Dataset is small so train-test split is difficult
# It is redommended to do train-test split
# Exception we are doing here , Not recommended

In [32]:
#Modeliing
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [33]:
lr.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
#Prediction Phase
y_pred = lr.predict(X)
y_pred

array([52313.61238494, 45722.68644263, 58231.95591138, 63991.7318464 ,
       67429.06277517, 61080.55179794, 75922.72532666, 79307.67351488])

In [36]:
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [43]:
#Comparison
pd.DataFrame(y_pred,y)

Unnamed: 0_level_0,0
salary,Unnamed: 1_level_1
50000,52313.612385
45000,45722.686443
60000,58231.955911
65000,63991.731846
70000,67429.062775
62000,61080.551798
72000,75922.725327
80000,79307.673515


In [44]:
X.head(1)

Unnamed: 0,experience,test_score,Interview_score
0,0,8.0,9


In [60]:
#Let's do some manual testing , provide manual data for experience	test_score	Interview_score
lr.predict([[5,8.0,9]])[0]

66451.78260065857

In [48]:
from sklearn.metrics import r2_score
r2_score(y_pred,y)

0.9626511210293307

In [49]:
X

Unnamed: 0,experience,test_score,Interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [50]:
lr.predict([[5,6,7]])

array([58231.95591138])

## Model Deployment

In [51]:
# Here we will save the 'lr' model to our Hard Disk as 'model.pkl' (Pickle file format)

import pickle

pickle.dump(lr, open("model.pkl",'wb')) #wb = write bites

#### Now let's say you send the model.pkl file to deploymet team
#### The deployment team will check whether the model is valid or not


In [52]:
deployment_team = pickle.load(open('model.pkl','rb')) #rb = read bite

In [53]:
deployment_team

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [54]:
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [55]:
#Hence deployment_team and lr is same, model is ok
# Check again

In [61]:
deployment_team.predict([[5,6,7]])[0]

58231.95591138131

#### Yes the result is also same, hence Deployment team will confirm that model is ok.