In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression

In [23]:
# Load the hiring dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Data/hiring.csv")

In [24]:
# Preview up to the first 10 rows of the raw data.
df.head(10)

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [25]:
# (rows, columns) of the loaded frame.
df.shape

(8, 4)

In [26]:
# Per-column dtype and non-null count, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   experience                  6 non-null      object 
 1   test_score(out of 10)       7 non-null      float64
 2   interview_score(out of 10)  8 non-null      int64  
 3   salary($)                   8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 388.0+ bytes


In [27]:
# Impute missing values by assigning the filled Series back to the frame.
# Calling `df[col].fillna(..., inplace=True)` is a chained-assignment inplace call:
# pandas >= 2.2 emits a FutureWarning for it, and under Copy-on-Write (the default
# in pandas 3.0) it modifies a temporary object and leaves `df` unchanged.

# experience: fill with the mode. Every label occurs once in this data, so
# mode()[0] is simply the first of the tied values ('eleven').
# NOTE(review): confirm that imputing 'eleven' is intended -- a missing
# experience value may be meant to represent no experience.
df['experience'] = df['experience'].fillna(df['experience'].mode()[0])

# test score: fill with the column median.
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(
    df['test_score(out of 10)'].median()
)

In [28]:
# List the distinct experience labels present after imputation.
df['experience'].unique()

array(['eleven', 'five', 'two', 'seven', 'three', 'ten'], dtype=object)

In [29]:
# Show the whole frame after imputation to check that no missing values remain.
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,eleven,8.0,9,50000
1,eleven,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [30]:
# Lookup table from spelled-out years of experience to integers.
experience = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
}

# Map the categories to numeric values and create a new column
df['experience_numeric'] = df['experience'].map(experience)

# Series.map returns NaN for any label that is not a key of the dict. Fail here,
# naming the offending labels, instead of letting NaN reach the model later.
# Rows whose experience is itself missing are not flagged by this check.
unmapped = df.loc[
    df['experience'].notna() & df['experience_numeric'].isna(), 'experience'
].unique()
if len(unmapped) > 0:
    raise ValueError(f"No numeric mapping for experience value(s): {list(unmapped)}")

In [31]:
# The word-valued column is now represented by experience_numeric, so remove it.
df = df.drop(columns=['experience'])
df

Unnamed: 0,test_score(out of 10),interview_score(out of 10),salary($),experience_numeric
0,8.0,9,50000,11
1,8.0,6,45000,11
2,6.0,7,60000,5
3,10.0,10,65000,2
4,9.0,6,70000,7
5,7.0,10,62000,3
6,8.0,7,72000,10
7,7.0,8,80000,11


In [32]:
# Give the target column a plain name. Reassign the result rather than using
# inplace=True, so the cell follows the same rebind-the-frame pattern as the
# drop above and does not mutate an object that earlier cells displayed.
df = df.rename(columns={'salary($)': 'salary'})

In [12]:
# Show the frame again to confirm the salary column was renamed.
df

Unnamed: 0,test_score(out of 10),interview_score(out of 10),salary,experience_numeric
0,8.0,9,50000,11
1,8.0,6,45000,11
2,6.0,7,60000,5
3,10.0,10,65000,2
4,9.0,6,70000,7
5,7.0,10,62000,3
6,8.0,7,72000,10
7,7.0,8,80000,11


In [33]:
# Features: the two scores plus numeric experience. Target: salary.
X = df.loc[:, ['test_score(out of 10)', 'interview_score(out of 10)', 'experience_numeric']]
y = df.loc[:, 'salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [35]:
# Fit a linear regression of salary on the scaled training features.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [36]:
# Predict salaries for the held-out test rows (X_test was scaled with the scaler fitted on the training rows)
y_pred = reg.predict(X_test)

In [37]:
# Report the fitted parameters and the hold-out metrics.
# NOTE(review): the split holds out 20% of 8 rows, so these metrics rest on
# very few test samples and should be read with caution.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ('Coefficients:', reg.coef_),
    ('Intercept:', reg.intercept_),
    ('Mean squared error:', mse),
    ('Coefficient of determination (R^2 score):', r2),
):
    print(label, value)

Coefficients: [ 1849.83948177 -3640.95895894  1169.22461843]
Intercept: 66166.66666666667
Mean squared error: 383940292.82259625
Coefficient of determination (R^2 score): -4.314052495814481
