In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the hiring dataset from 'hiring.csv' (path relative to the working
# directory) and display the resulting frame.
df = pd.read_csv('hiring.csv')
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       6 non-null      object 
 1   test_score       7 non-null      float64
 2   interview_score  8 non-null      int64  
 3   salary           8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 388.0+ bytes


In [4]:
df.isnull().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,test_score,interview_score,salary
count,7.0,8.0,8.0
mean,7.857143,7.875,63000.0
std,1.345185,1.642081,11501.55269
min,6.0,6.0,45000.0
25%,7.0,6.75,57500.0
50%,8.0,7.5,63500.0
75%,8.5,9.25,70500.0
max,10.0,10.0,80000.0


In [7]:
def convert_experience(exp):
    """Convert a years-of-experience value to an integer.

    Spelled-out numbers ('zero' through 'twenty') are matched
    case-insensitively and with surrounding whitespace ignored. Values that
    are already numeric -- ints, floats, or strings made only of decimal
    digits -- are returned as integers, so the function can safely be applied
    again to a column that has already been converted.

    Parameters
    ----------
    exp : str, number, or missing value (None / NaN / pd.NA)
        The raw experience entry.

    Returns
    -------
    int
        The number of years. Missing values and unrecognised text map to 0;
        floats are truncated toward zero.
    """
    word_to_num = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
        'thirteen': 13,
        'fourteen': 14,
        'fifteen': 15,
        'sixteen': 16,
        'seventeen': 17,
        'eighteen': 18,
        'nineteen': 19,
        'twenty': 20,
    }

    if isinstance(exp, str):
        key = exp.strip().lower()
        # Digit-only strings such as '7' are accepted as-is.
        if key.isdecimal():
            return int(key)
        # Unknown words fall back to 0, as before.
        return word_to_num.get(key, 0)

    # Missing entries are treated as "no experience".
    if pd.isna(exp):
        return 0

    # Already-numeric input (e.g. when the conversion cell is re-run) must not
    # reach the string handling above, which would fail on a number.
    try:
        return int(exp)
    except (TypeError, ValueError, OverflowError):
        return 0

In [8]:
# Replace each experience entry with the integer returned by convert_experience.
df['experience'] = df['experience'].map(convert_experience)

In [9]:
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000


In [10]:
df.isnull().sum()

experience         0
test_score         1
interview_score    0
salary             0
dtype: int64

In [11]:
df['test_score'].median()

8.0

In [12]:
df['test_score'].mean()

7.857142857142857

In [13]:
# Fill the missing test score with the column median.
# NOTE(review): the median is taken over every row, before the train/test split
# further down, so test rows contribute to the imputed value.
test_score_median = df['test_score'].median()
df['test_score'] = df['test_score'].fillna(test_score_median)

In [14]:
df.isnull().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

#### Since there is more than one input column, this is a case of multiple linear regression


In [15]:
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000


In [16]:
# The target is the salary column; every remaining column is used as a feature.
y = df['salary']
x = df.drop('salary', axis=1)

In [19]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
x_train

Unnamed: 0,experience,test_score,interview_score
1,0,8.0,6
7,11,7.0,8
3,2,10.0,10
0,0,8.0,9
5,3,7.0,10
4,7,9.0,6


In [21]:
x_test

Unnamed: 0,experience,test_score,interview_score
6,10,8.0,7
2,5,6.0,7


In [22]:
y_train

1    45000
7    80000
3    65000
0    50000
5    62000
4    70000
Name: salary, dtype: int64

In [23]:
y_test

6    72000
2    60000
Name: salary, dtype: int64

In [24]:
# Create the linear regression estimator with its default settings.
model = LinearRegression()

In [25]:
# Fit the model on the training features and training salaries.
model.fit(x_train, y_train)

In [28]:
# Predict salaries for the held-out test rows and display the raw predictions.
y_pred = model.predict(x_test)
y_pred

array([78186.17460755, 57496.5574222 ])

In [29]:
# Side-by-side view of the actual and predicted salaries for the test rows.
comparison = {'y_test': y_test, 'y_predict': y_pred}
pd.DataFrame(comparison)

Unnamed: 0,y_test,y_predict
6,72000,78186.174608
2,60000,57496.557422


In [37]:
# Predict the salary for one candidate (experience=0, test_score=8.0,
# interview_score=9). The row is wrapped in a DataFrame that carries the
# training column names because the model was fitted on a DataFrame; passing a
# bare nested list makes scikit-learn warn that X lacks valid feature names.
candidate = pd.DataFrame([[0, 8.0, 9]], columns=x_train.columns)
print(model.predict(candidate))

[51478.3806114]




In [31]:
# Score the test-set predictions. The MSE is computed once and reused for the
# RMSE instead of being recalculated.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print('RMSE:', rmse)
print('R2_score:', r2)

MSE: 22267990.507700842
RMSE: 4718.897170706397
R2_score: 0.3814447081194211
