# A SALARY PREDICTION MODEL PROJECT

In [38]:
#import all required python libraries
import numbers

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [39]:
# Read the raw hiring data from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="hiring.csv")

In [40]:
# Show the frame exactly as loaded, before any cleaning.
data

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [41]:
# Data cleaning, step 1: count the missing (NaN) entries in every column.
data.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [42]:
# Impute the missing values:
#   - experience: NaN is replaced with 0
#   - test_score: NaN is replaced with the mean of the column
#
# The filled columns are assigned back to the frame rather than calling
# fillna(..., inplace=True) on data[...]. That chained in-place form operates
# on an intermediate object: it is deprecated in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
data["experience"] = data["experience"].fillna(0)
data["test_score"] = data["test_score"].fillna(data["test_score"].mean())

In [43]:
# Show the frame again to check that the missing values were filled.
data

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [44]:
# Split the frame by position: the first three columns are the independent
# variables (X) and the last column is the dependent variable (y).
X = data.iloc[:, 0:3]
y = data.iloc[:, -1]


In [50]:
# Peek at the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6


In [46]:
# By convention, X holds the "features" and y is the "prediction target".

In [47]:
#Convert words to numbers in the experience column
#functions for converting words into numbers

# Lookup table from spelled-out counts to integers; built once, not on every call.
_WORD_TO_NUMBER = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
}


def data_conversion(word):
    """Convert a spelled-out number ('zero' .. 'twelve') to its integer value.

    Parameters
    ----------
    word : str or real number
        A number word; surrounding whitespace and letter case are ignored.
        Values that are already numeric (such as the 0 used to fill missing
        entries) are returned unchanged, so applying the conversion to an
        already-converted column is harmless.

    Returns
    -------
    The integer for a number word, or the input itself when it is numeric.

    Raises
    ------
    KeyError
        If ``word`` is an unknown word, NaN, or neither a string nor a number.
    """
    if isinstance(word, str):
        key = word.strip().lower()
        if key not in _WORD_TO_NUMBER:
            raise KeyError(
                f"cannot convert {word!r}: not a number word from zero to twelve"
            )
        return _WORD_TO_NUMBER[key]

    # `word == word` is False only for NaN, which must not pass through silently.
    if isinstance(word, numbers.Real) and word == word:
        return word

    raise KeyError(f"cannot convert {word!r}: expected a number word or a number")

In [51]:
# Apply the word-to-number conversion to the experience column. Without this
# step the column still holds words such as "five" and the regression cannot be
# fitted when the notebook is run top to bottom on a fresh kernel.
# The dtype guard makes the cell safe to re-run: once the column is numeric it
# is left alone. assign() builds a new frame, so `data` itself is not modified.
if not pd.api.types.is_numeric_dtype(X["experience"]):
    X = X.assign(experience=X["experience"].map(data_conversion))
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [52]:
# Put the features and the target back together, side by side, in one frame.
df = pd.concat(objs=[X, y], axis="columns")
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000


In [53]:
# Inspect the dtype of each column of the recombined frame.
df.dtypes

experience           int64
test_score         float64
interview_score      int64
salary               int64
dtype: object

In [54]:
# Write the cleaned frame to disk. The output file is named "clean_hiring_csv"
# (no file extension) and is the cleaned counterpart of "hiring.csv".
df.to_csv(path_or_buf="clean_hiring_csv")

In [63]:
# Train a linear regression on the features and report its score.

import pickle
import sklearn
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and training can be chained.
clf = LinearRegression().fit(X, y)
# The score is computed on the same X and y that were used for fitting.
clf.score(X, y)

0.9639958361860578

In [64]:
# Serialise the fitted model to "model.pkl". The with-block guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(clf, model_file)

In [65]:
# Predicted salaries for the same rows the model was fitted on.
clf.predict(X)

array([52313.61238494, 45722.68644263, 58231.95591138, 63991.7318464 ,
       67429.06277517, 61080.55179794, 75922.72532666, 79307.67351488])

In [66]:
from sklearn.metrics import mean_absolute_error

# Reload the model written to "model.pkl" by this notebook. The with-block closes
# the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Pass the sample as a one-row DataFrame with the same column names, in the same
# order, as the training features, rather than as an anonymous nested list.
# NOTE: the test and interview scores in hiring.csv lie between 6 and 10, so the
# value 80 is far outside the training range and this prediction is an
# extrapolation.
sample = pd.DataFrame(
    [[2, 80, 80]],
    columns=["experience", "test_score", "interview_score"],
)
print(model.predict(sample))

[351685.6659711]
