# Importing necessary library

In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as tts
import pickle 
import warnings
warnings.filterwarnings(action='ignore',category=FutureWarning)
%matplotlib inline


In [2]:
# reading the dataset

df = pd.read_csv("employee salary dataset.csv")
df

Unnamed: 0,experience,test_score,interview_socre,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [3]:
df.describe()

Unnamed: 0,test_score,interview_socre,salary
count,7.0,8.0,8.0
mean,7.857143,7.875,63000.0
std,1.345185,1.642081,11501.55269
min,6.0,6.0,45000.0
25%,7.0,6.75,57500.0
50%,8.0,7.5,63500.0
75%,8.5,9.25,70500.0
max,10.0,10.0,80000.0


# Identifying the missing values

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       6 non-null      object 
 1   test_score       7 non-null      float64
 2   interview_socre  8 non-null      int64  
 3   salary           8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 384.0+ bytes


# Treating the missing values:

In [5]:
# treating the missing values of numerical variables

df['test_score'].fillna(df['test_score'].mean(),inplace=True)

In [6]:
# treating the missing values of categorical variables

df['experience'].fillna(0,inplace=True)

In [7]:
# Dataset after treating the missing values
df

Unnamed: 0,experience,test_score,interview_socre,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


# Feature Transformation

In [8]:
# creating a function for feature transformation for 'expererience' categorical variable

def converter(key):
    dictionary = {"one":1,"two":2,"three":3,"four":4,"five":5,"six":6,"seven":7,"eight":8,"nine":9,"ten":10,"eleven":11,0:0}    
    return dictionary[key]

In [9]:
df['experience'] = df['experience'].apply(converter)
df['experience']

0     0
1     0
2     5
3     2
4     7
5     3
6    10
7    11
Name: experience, dtype: int64

In [10]:
# Dataset after feature transformation

df

Unnamed: 0,experience,test_score,interview_socre,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.857143,7,72000
7,11,7.0,8,80000


# Splitting Dataset into Train & Test data

In [11]:
Y = df['salary']
X = df.drop(columns ='salary')


In [12]:
X_train , X_test ,Y_train ,Y_test = tts(X ,Y,test_size=0.3)

# Building the model

In [13]:
lr = LinearRegression()

In [14]:
# Training the model 

lr.fit(X_train,Y_train)

LinearRegression()

In [15]:
# predicting the salary using training data values

train_prediction = lr.predict(X_train)
train_prediction

array([63404.28240741, 59097.24702381, 78385.98710317, 64133.72189153,
       73978.76157407])

In [16]:
# predicting the salary using test data values

test_prediction = lr.predict(X_test)
test_prediction

array([64749.49570106, 54017.99355159, 45674.0327381 ])

# Model Evaluation

In [17]:
# Model Evaluation Error Matrics\

train_mse = mse(Y_train,train_prediction)
test_mse = mse(Y_test,test_prediction)

In [18]:
# Model performance matrics

train_score = r2_score(Y_train,train_prediction)
test_score = r2_score(Y_test,test_prediction)

In [19]:
print(f"train mse {train_mse}")
print(f"test mse {test_mse}")
print(f"train score {train_score}")
print(f"test score {test_score}")

train mse 2011588.9550264515
test mse 14722129.235275844
train score 0.962582050687752
test score 0.8738103208404928


In [24]:
def data_maker(inp):
    v = inp.split()
    v1 = [int(i) for i in v]
    data = np.array(v1).reshape(1,-1)
    return data
    

# ================================================================

# >>>>>>>>>>>>>>> EMPLOYEE SALARY PREDICTOR<<<<<<<<<<<<<<<

# Enter The Following Values To Predict The Salary Of Employee :
   ------------------------------------------------------------------------------------------------------

In [21]:
# Taking the values from user

user_input = input("Enter 1)Experience 2)Test Score 3)Interview Score : ")

Enter 1)Experience 2)Test Score 3)Interview Score : 0 2 3


In [27]:
final_data = data_maker(user_input)
prediction = lr.predict(final_data)
print(f"Salary of Employee is : {prediction} $")

Salary of Employee is : [30735.10664683] $




# Pickling the Final model

In [28]:
pickle.dump(lr,open("model.pkl",'wb'))