In [85]:
import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

#Import data
# The workbook location can be overridden by setting the ABSENTEEISM_DATA_PATH
# environment variable; when it is unset the original local path is used, so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'ABSENTEEISM_DATA_PATH',
    '/Users/jason/Documents/Machine Learning/Absenteeism_at_work.xls',
)
data = pd.read_excel(DATA_PATH)

In [86]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(740, 21)

In [87]:
# Preview the first rows to sanity-check column names and values
data.head()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,2


In [88]:
# Separate the predictors from the regression target
target_col = 'Absenteeism time in hours'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features with statistics learned from the training rows only,
# then apply that same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [89]:
#Applying KNN
# Baseline model: 5-nearest-neighbour regressor trained on all standardized features.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

#Predicting and evaluating the model
y_pred = knn.predict(X_test_scaled)
# The metric functions are reached through the imported `metrics` module: the
# bare names mean_squared_error / mean_absolute_error are not imported by the
# import cell and would raise NameError on a fresh kernel.
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 132.7140540540541
Root Mean Squared Error (RMSE): 11.520158595004414
Mean Absolute Error (MAE): 5.205405405405406


In [90]:
#Estimate feature importance to improve model
# Drop-one-feature ablation: for every column, refit the scaler and the KNN
# model without that column and record how much the test RMSE changes relative
# to the full-feature model. A positive score means the error rose when the
# feature was removed; a negative score means the error fell.
# NOTE(review): the scores are computed on the test split, and that same split
# is used afterwards to evaluate the selected features, so that later
# evaluation is optimistic. Consider ranking on a validation split or with
# cross-validation on the training data instead.

# Recompute the full-feature baseline from y_pred instead of reading the global
# `rmse`: a later cell rebinds `rmse` to the top-10 model's error, which would
# silently change these scores if this cell were re-run afterwards.
baseline_rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))

importance_scores = {}

#Iterating over all the features
for col in X_train.columns:
    # Drop the feature
    X_train_temp = X_train.drop(col, axis=1)
    X_test_temp = X_test.drop(col, axis=1)

    # Scale the features (fresh scaler, fitted on the reduced training set only)
    scaler_temp = StandardScaler()
    X_train_temp_scaled = scaler_temp.fit_transform(X_train_temp)
    X_test_temp_scaled = scaler_temp.transform(X_test_temp)

    # Predictions
    knn_temp = KNeighborsRegressor(n_neighbors=5)
    knn_temp.fit(X_train_temp_scaled, y_train)
    y_pred_temp = knn_temp.predict(X_test_temp_scaled)

    # Calculate the error; the metric is called via the imported `metrics`
    # module because the bare name mean_squared_error is not imported
    mse_temp = metrics.mean_squared_error(y_test, y_pred_temp)
    rmse_temp = sqrt(mse_temp)

    # Importance score - increase in error when the feature is dropped
    # (compared to the full-feature RMSE)
    error_increase = rmse_temp - baseline_rmse
    importance_scores[col] = error_increase

#Sorting features by importance scores (largest error increase first)
sorted_importance = sorted(importance_scores.items(), key=lambda x: x[1], reverse=True)

sorted_importance



[('Month of absence', 0.537843121922208),
 ('Disciplinary failure', 0.4355818559141227),
 ('Education', 0.2090956611452679),
 ('Social smoker', 0.1256735094613468),
 ('Hit target', 0.12163470978120827),
 ('Distance from Residence to Work', 0.0683259035386552),
 ('Service time', 0.02875032736991301),
 ('Reason for absence', -0.1048180394186975),
 ('Transportation expense', -0.16037890547721823),
 ('Pet', -0.19311609080712877),
 ('Weight', -0.20882727564963943),
 ('Height', -0.21835295117869613),
 ('Son', -0.2190703910041183),
 ('ID', -0.4119804535798881),
 ('Seasons', -0.42824530015124296),
 ('Social drinker', -0.5380724445402763),
 ('Day of the week', -0.5735683570427383),
 ('Body mass index', -0.7113158687417585),
 ('Work load Average/day ', -0.7286955025560271),
 ('Age', -1.142570998442082)]

In [91]:
# Keep the ten highest-ranked (feature, score) pairs. sorted_importance already
# holds the items of importance_scores ordered by descending score, so the
# slice equals re-sorting and slicing. The cut is by rank only, so entries with
# negative scores can be included.
top_10_features = sorted_importance[:10]
print(top_10_features)

[('Month of absence', 0.537843121922208), ('Disciplinary failure', 0.4355818559141227), ('Education', 0.2090956611452679), ('Social smoker', 0.1256735094613468), ('Hit target', 0.12163470978120827), ('Distance from Residence to Work', 0.0683259035386552), ('Service time', 0.02875032736991301), ('Reason for absence', -0.1048180394186975), ('Transportation expense', -0.16037890547721823), ('Pet', -0.19311609080712877)]


In [92]:
# Pull just the column names out of the (feature, score) pairs, keeping rank order
top_10_features_nm = [feature_name for feature_name, _score in top_10_features]
print(top_10_features_nm)

['Month of absence', 'Disciplinary failure', 'Education', 'Social smoker', 'Hit target', 'Distance from Residence to Work', 'Service time', 'Reason for absence', 'Transportation expense', 'Pet']


In [93]:
# Restrict both splits to the ten selected columns (all rows kept)
X_train_top = X_train.loc[:, top_10_features_nm]
X_test_top = X_test.loc[:, top_10_features_nm]

In [94]:
# Standardize the reduced feature set: learn the statistics from the training
# rows only, then apply them to both splits
scaler = StandardScaler().fit(X_train_top)
X_train_scaled_top = scaler.transform(X_train_top)
X_test_scaled_top = scaler.transform(X_test_top)

In [95]:
#Re-fitting the model
# Same 5-neighbour regressor, now trained on the standardized top-10 features only.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled_top, y_train)

#Predicting and evaluating the model again
y_pred_top = knn.predict(X_test_scaled_top)
# The metric functions are reached through the imported `metrics` module: the
# bare names mean_squared_error / mean_absolute_error are not imported by the
# import cell and would raise NameError on a fresh kernel.
# Note: these assignments rebind mse / rmse / mae from the full-feature evaluation.
mse = metrics.mean_squared_error(y_test, y_pred_top)
rmse = sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_top)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
#Error is less than before

Mean Squared Error (MSE): 97.61567567567567
Root Mean Squared Error (RMSE): 9.880064558274691
Mean Absolute Error (MAE): 4.481081081081081


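# The results show that training on only the top 10 ranked features reduces the test error (RMSE from 11.52 to 9.88, MAE from 5.21 to 4.48), i.e. the model's predictions improve. Because the features were ranked using the same test set, this improvement is likely optimistic and should be confirmed on separate held-out data.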