### An HR company has hired you as an ML expert to build a model that predicts an employee's salary from their years of experience (`YearsExperience`)

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# Load the salary dataset from the CSV file in the working directory
csv_path = 'Salary_Data.csv'
data = pd.read_csv(csv_path)

In [28]:
# Summarize row count, column dtypes and non-null counts to check for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 2 columns):
YearsExperience    31 non-null float64
Salary             31 non-null float64
dtypes: float64(2)
memory usage: 576.0 bytes


In [29]:
# Summary statistics per column — compare min/max against the quartiles to spot extreme values
data.describe()

Unnamed: 0,YearsExperience,Salary
count,31.0,31.0
mean,8.367742,3584299000.0
std,17.233599,19956110000.0
min,1.1,37731.0
25%,3.2,56799.5
50%,4.9,66029.0
75%,8.05,103442.0
max,100.0,111111000000.0


In [38]:
# Drop every row whose YearsExperience is greater than 11 (treated as an outlier)
outlier_rows = data.index[data.YearsExperience > 11]
data = data.drop(outlier_rows)

In [39]:
# Display the frame to verify which rows remain after the filtering step
data

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [40]:
#Separate the data into features (first column) and label (second column)
features, label = (data.iloc[:, col].values for col in (0, 1))

In [41]:
# scikit-learn regressors need the feature matrix in 2-D form (n_samples, n_features).
# Print the current shapes to see whether a reshape is required.
for arr in (features, label):
    print(arr.shape)

(30,)
(30,)


In [42]:
# Turn each 1-D array into a single-column 2-D array, then confirm the new shapes
features, label = (arr.reshape(-1, 1) for arr in (features, label))
for arr in (features, label):
    print(arr.shape)

(30, 1)
(30, 1)


In [50]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
#   X_train / y_train -> used to fit the model (learning)
#   X_test  / y_test  -> used only to check accuracy/generalization (testing)
from sklearn.model_selection import train_test_split
split_options = {'test_size': 0.2, 'random_state': 30}
X_train, X_test, y_train, y_test = train_test_split(features, label, **split_options)

In [51]:
# Build a linear regression model and fit it on the training split;
# fitting is what estimates the intercept and the coefficient.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
lr  # fit() returns the estimator itself; show it as the cell output

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [52]:
# Score the model on the training split (known data) first, then on the test split (unknown data)
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(X_part, y_part))

0.9400496694274888
0.9944092048209744


In [36]:
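#If the test score is close to (or higher than) the training score, the model shows no sign of overfitting.
# Caveat: with only 6 test rows the test score is very noisy, so one split where test > train is not
# proof that the model generalizes — compare several splits or use cross-validation before concluding.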

In [49]:
#Technique to do experimental trial and error
# Try 100 different random splits and report those where the test score beats the train score.
# This shows how strongly the scores depend on which rows land in the test set.
# NOTE: choosing a seed *because* it gives the best test score leaks test information into
# model selection, so treat this scan as a diagnostic rather than a tuning step.
# The loop uses its own variable names on purpose: reusing X_train/X_test/y_train/y_test/lr
# here would overwrite the seed-30 split and fitted model that the other cells rely on,
# so a top-to-bottom run would silently deploy the model from the last seed instead.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
for seed in range(1,101):
    X_tr,X_te,y_tr,y_te = train_test_split(features,
                                           label,
                                           test_size=0.2,
                                           random_state=seed)
    trial_model = LinearRegression()
    trial_model.fit(X_tr,y_tr)
    train_score = trial_model.score(X_tr,y_tr)
    test_score = trial_model.score(X_te,y_te)
    if train_score < test_score:
        print('Train Score : {} Test Score: {} Random Seed: {}'.format(train_score,test_score,seed))

Train Score : 0.9545249190394052 Test Score: 0.9695039421049821 Random Seed: 3
Train Score : 0.9528197369259258 Test Score: 0.9631182154839475 Random Seed: 8
Train Score : 0.9494673013344646 Test Score: 0.9816423482070255 Random Seed: 10
Train Score : 0.9527636176933665 Test Score: 0.9606215790278543 Random Seed: 14
Train Score : 0.9460054870434312 Test Score: 0.9835849730044817 Random Seed: 26
Train Score : 0.9527636606684406 Test Score: 0.9636425773684423 Random Seed: 27
Train Score : 0.9400496694274888 Test Score: 0.9944092048209744 Random Seed: 30
Train Score : 0.9486350116716654 Test Score: 0.9778242092591889 Random Seed: 37
Train Score : 0.9473317052697812 Test Score: 0.9724794487377619 Random Seed: 38
Train Score : 0.9492886917497556 Test Score: 0.9928344802911048 Random Seed: 39
Train Score : 0.9491742100347064 Test Score: 0.9802519469633169 Random Seed: 41
Train Score : 0.948821675263085 Test Score: 0.9789129767378081 Random Seed: 46
Train Score : 0.9486450781125914 Test Score

In [53]:
# Deploy the model
# Serialize the fitted estimator to disk. The `with` block guarantees the file handle is
# flushed and closed even if pickling raises (a bare open() would leave it dangling).
# NOTE: only load this file back from a trusted location — unpickling can execute arbitrary code.
import pickle
with open('SalaryPredictor.model','wb') as model_file:
    pickle.dump(lr, model_file)

In [55]:
#Equation
#Salary = b0 + b1(yearsExper)
# The label was reshaped to 2-D before fitting, so intercept_ and coef_ come back as arrays.
# Formatting them directly prints brackets ("[...] + [[...]]"), so pull out the single
# scalar from each; np.ravel handles both the array and the plain-scalar case.
b0 = np.ravel(lr.intercept_)[0]
b1 = np.ravel(lr.coef_)[0]
print("The equation of regression line is Salary = {} + {} * YearsExperience".format(b0,b1))


The equation of regression line is Salary = [25566.43561641] + [[9481.03756369]] * YearsExperience


In [56]:
# Sanity check: the prediction at YearsExperience = 0 should equal the intercept (b0)
lr.predict(np.array([[0]]))

array([[25566.43561641]])