In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

In [30]:
# Read the hiring data set from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="Data/hiring.csv")

In [31]:
# Preview up to the first 10 rows of the freshly loaded frame.
df.head(10)

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [32]:
# (rows, columns) of the loaded frame.
df.shape

(8, 4)

In [33]:
# Per-column dtype and non-null count; used to spot which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   experience                  6 non-null      object 
 1   test_score(out of 10)       7 non-null      float64
 2   interview_score(out of 10)  8 non-null      int64  
 3   salary($)                   8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 388.0+ bytes


In [34]:
# Impute missing values by assigning the filled column back onto the frame.
# `df[col].fillna(..., inplace=True)` is chained assignment through an inplace
# method: recent pandas warns about it and, with Copy-on-Write, it leaves `df`
# unchanged, so the explicit assignment is the form that always takes effect.
#
# NOTE(review): in the data displayed by this notebook every non-null
# experience word occurs exactly once, so `mode()[0]` is merely the first of
# the tied values ('eleven' in the output shown) -- confirm that this is the
# intended fill for candidates with no recorded experience.
df['experience'] = df['experience'].fillna(df['experience'].mode()[0])

# Missing test scores are replaced by the column median.
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(
    df['test_score(out of 10)'].median()
)

In [35]:
# Distinct experience words present after imputation.
df['experience'].unique()

array(['eleven', 'five', 'two', 'seven', 'three', 'ten'], dtype=object)

In [36]:
# Display the whole frame to inspect the result of the imputation step.
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,eleven,8.0,9,50000
1,eleven,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [37]:
# Lookup table translating spelled-out years of experience into integers.
experience_mapping = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
}

# Translate every experience word to its number and keep it in a new column.
df['experience_numeric'] = df['experience'].map(experience_mapping)

# `Series.map` produces NaN for any word that is absent from the table.
# Surface that here with the offending words instead of letting NaN flow
# silently into the feature matrix built later in the notebook.
unmapped_words = df.loc[
    df['experience'].notna() & df['experience_numeric'].isna(),
    'experience',
].unique()
if len(unmapped_words) > 0:
    raise ValueError(
        f"No numeric mapping for experience value(s): {list(unmapped_words)}"
    )

In [38]:
# Remove the word-based experience column and show the resulting frame.
df = df.drop(columns=["experience"])
df

Unnamed: 0,test_score(out of 10),interview_score(out of 10),salary($),experience_numeric
0,8.0,9,50000,11
1,8.0,6,45000,11
2,6.0,7,60000,5
3,10.0,10,65000,2
4,9.0,6,70000,7
5,7.0,10,62000,3
6,8.0,7,72000,10
7,7.0,8,80000,11


In [39]:
# Give the target column a plain name. `df` is rebound to the renamed copy
# rather than mutated with `inplace=True`, so the frame object displayed by the
# preceding cell is not altered behind the reader's back.
df = df.rename(columns={'salary($)': 'salary'})

In [40]:
# Reorder columns so that 'experience_numeric' comes first; the remaining
# columns keep their existing relative order.
df = df[['experience_numeric', *df.columns.drop('experience_numeric')]]

In [41]:
# Display the frame to check the new column order.
df

Unnamed: 0,experience_numeric,test_score(out of 10),interview_score(out of 10),salary
0,11,8.0,9,50000
1,11,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [42]:
# Feature matrix: the two assessment scores followed by numeric experience.
X = df.loc[
    :,
    ['test_score(out of 10)', 'interview_score(out of 10)', 'experience_numeric'],
]
# Target: the salary column.
y = df.loc[:, 'salary']

In [43]:
# Fit a StandardScaler on the feature matrix, then rebind X to the scaled values.
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)

In [44]:
# Create the linear regression estimator and fit it on the scaled features
# against salary.
reg = linear_model.LinearRegression()
reg.fit(X=X, y=y)

In [45]:
# Fitted coefficients, one per feature, in the column order used to build X
# (test score, interview score, numeric experience).
reg.coef_

array([  -58.01539941,  -166.06075392, -1120.98223549])

In [46]:
# Fitted intercept term of the regression.
reg.intercept_

63000.0

In [47]:
# Predict salaries for the same scaled rows the model was fitted on, then
# display the resulting array.
p = reg.predict(X=X)
p

array([61762.43980738, 62086.77178737, 63980.54952318, 64408.36559343,
       63305.25918232, 64240.58162591, 62295.7227835 , 61920.30969691])

In [50]:
# Report the fitted parameters and two goodness-of-fit metrics.
# NOTE(review): `p` holds predictions for the very rows the model was fitted
# on, so both metrics below are in-sample figures, not held-out performance.
print('Coefficients:', reg.coef_)
print('Intercept:', reg.intercept_)
print('Mean squared error:', mean_squared_error(y_true=y, y_pred=p))
print(
    'Coefficient of determination (R^2 score):',
    r2_score(y_true=y, y_pred=p),
)

Coefficients: [  -58.01539941  -166.06075392 -1120.98223549]
Intercept: 63000.0
Mean squared error: 114674440.56274197
Coefficient of determination (R^2 score): 0.009292090170695766
