# An HR company has hired you as a Data Scientist. Your role is to create and deploy a model that predicts an employee's salary based on their years of experience.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
#Format of the program
#1. Load the data
#2. Data preprocessing, if applicable
#3. Separate the data into features and label
#4. Use a for loop to identify the ideal train/test sample that can deliver
#   a generalized model
#5. Create the model
#6. Check the model using user input
#7. Deploy the model

In [5]:
# Load the salary dataset; the relative path is resolved against the current
# working directory.
salaryData = pd.read_csv('Salary_Data.csv')

In [6]:
# Check for missing values: info() reports the non-null count and dtype of
# every column, so a count below the row total reveals missing entries.
salaryData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
YearsExperience    30 non-null float64
Salary             30 non-null float64
dtypes: float64(2)
memory usage: 560.0 bytes


In [7]:
# The whole table as a NumPy array; its shape is (rows, columns).
x = salaryData.values
x.shape

(30, 2)

In [8]:
# Build the model input and target as column vectors of shape (n_samples, 1).
# Selecting with a one-element list keeps the result two-dimensional, so no
# explicit reshape is required.
features = salaryData[["YearsExperience"]].values
label = salaryData[["Salary"]].values

In [44]:
# Confirm the label is a column vector, i.e. (n_samples, 1).
label.shape

(30, 1)

In [45]:
# Number of dimensions of the label array (2 after the reshape to a column).
label.ndim

2

In [9]:
# Split into train and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=30
)

In [10]:
# Implement linear regression.
from sklearn.linear_model import LinearRegression

# fit() estimates the equation of the line from the training split and
# returns the estimator itself, so construction and fitting can be chained.
modelSalary = LinearRegression().fit(X_train, y_train)
modelSalary

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
# Score (R^2 for a regressor) of the fitted model on the training split.
modelSalary.score(X_train,y_train)

0.9400496694274888

In [12]:
# Score (R^2 for a regressor) of the fitted model on the held-out test split.
modelSalary.score(X_test,y_test)

0.9944092048209744

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Search for train/test splits on which the model generalizes: for each seed,
# fit a fresh model and report the seeds where the test score exceeds the
# train score.
#
# The candidate splits use loop-local names (cand_*) so that this search does
# not overwrite X_train/X_test/y_train/y_test. Those belong to the split that
# modelSalary was fitted on and are still read by later evaluation cells;
# previously they stayed correct only because the last seed tried (30)
# happened to equal the seed of the original split.
# NOTE(review): choosing a seed by its test score is optimistic on a dataset
# this small; cross-validation may be a better fit -- confirm intent.
for i in range(1,31):
    cand_X_train, cand_X_test, cand_y_train, cand_y_test = train_test_split(
        features, label, test_size=0.2, random_state=i)

    # `model` keeps its original name because later cells may reference it.
    model = LinearRegression()
    model.fit(cand_X_train, cand_y_train)

    train_score = model.score(cand_X_train, cand_y_train)
    test_score = model.score(cand_X_test, cand_y_test)

    if test_score > train_score:
        print("Train: {} Test: {} RandomState:{}".format(train_score,test_score,i))
    

Train: 0.9545249190394052 Test: 0.9695039421049821 RandomState:3
Train: 0.9528197369259258 Test: 0.9631182154839475 RandomState:8
Train: 0.9494673013344644 Test: 0.9816423482070255 RandomState:10
Train: 0.9527636176933665 Test: 0.9606215790278543 RandomState:14
Train: 0.9460054870434312 Test: 0.9835849730044817 RandomState:26
Train: 0.9527636606684406 Test: 0.9636425773684422 RandomState:27
Train: 0.9400496694274888 Test: 0.9944092048209744 RandomState:30


In [14]:
from sklearn.metrics import r2_score

# R^2 on the held-out test split.
# Evaluate modelSalary (the model used for prediction and pickling) rather
# than `model`, which is merely whatever the seed-search loop fitted last.
r2_score(y_test, modelSalary.predict(X_test))
#        actual  predicted label

0.9944092048209745

In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out test split (in squared salary units).
# Evaluate modelSalary (the model used for prediction and pickling) rather
# than `model`, which is merely whatever the seed-search loop fitted last.
mean_squared_error(y_test, modelSalary.predict(X_test))

5453562.21914827

In [16]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out test split (in salary units).
# Evaluate modelSalary (the model used for prediction and pickling) rather
# than `model`, which is merely whatever the seed-search loop fitted last.
mean_absolute_error(y_test, modelSalary.predict(X_test))

1941.8047096136786

In [17]:
#Use the model
numYears = float(input("Enter Years of Experience: "))
# predict() takes a 2-D (n_samples, n_features) array, hence the nested list
# wrapping the single value.
salary = modelSalary.predict(np.array([[numYears]]))
# The prediction is a one-element array; .item() extracts the plain number so
# the message reads "72971.62" instead of "[[72971.62343485]]".
print("The predicted salary for {} years of Experience is {:.2f}".format(numYears, salary.item()))

Enter Years of Experience: 5
The predicted salary for 5.0 years of Experience is [[72971.62343485]]


In [50]:
#Deploy using Pickle
import pickle  # serializes an in-memory object to a file

# The context manager guarantees the file is flushed and closed even if
# dump() raises; the bare open() used before never closed the handle
# explicitly.
with open('SalaryPredictor.model', 'wb') as model_file:
    pickle.dump(modelSalary, model_file)

In [18]:
#Equation of line
# intercept_ and coef_ are NumPy arrays here (the model was fitted on a 2-D
# label), so formatting them directly printed "[...] + [[...]]". Flatten each
# and take its single element so the equation shows plain numbers.
print("Salary = {b} + {m}(YearsExperience)".format(
    b=np.ravel(modelSalary.intercept_)[0],
    m=np.ravel(modelSalary.coef_)[0],
))

Salary = [25566.43561641] + [[9481.03756369]](YearsExperience)


In [19]:
# Fitted intercept of the regression line (the constant term of the equation).
modelSalary.intercept_

array([25566.43561641])