In [1]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data =pd.read_csv('maintenance_servicehours.csv')
display(data.head())

Unnamed: 0,BoostPressure,EngineFuelRate,EngineLoad,EngineOilPressure,EngineRpm,GroundSpeed,HaulDistance,Payload,TankFuelLevel,GearSelect,ServiceHours
0,3.12,88.19,31.7,587.67,1360.65,0.0,0.0,0.0,33.24,0.0,541.002751
1,8.23,109.86,34.6,692.82,1731.02,0.0,0.0,0.0,33.24,0.0,665.708336
2,3.05,64.07,22.3,572.3,1265.98,0.0,0.0,0.0,33.24,0.0,614.675853
3,12.03,104.47,33.5,685.22,1862.32,0.0,0.0,0.0,33.24,0.0,398.043887
4,10.08,71.29,21.4,612.62,1482.57,0.0,0.0,0.0,33.24,0.0,548.413577


In [3]:
data.shape

(11500, 11)

In [4]:
# X/y split
y = data['ServiceHours']
X = data.drop(['ServiceHours'], axis=1)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

X_train_df = pd.DataFrame(X_train, columns=X.columns)

X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Prediction with Random forest

In [6]:
model = RandomForestRegressor()

grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 7, 11],
    'min_samples_split': [5, 10, 15]
}

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=5)


In [7]:
# Fit the grid search to the train data
grid_search.fit(X_train, y_train)

In [8]:
# best combination of hyperparameters
grid_search.best_params_

{'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}

In [9]:
# in grid search you are more likely to get really good results in your training set, even with CV
print("The best R2 for the best hyperparameters is {:.2f}".format(grid_search.best_score_))

The best R2 for the best hyperparameters is -0.00


In [14]:
from sklearn.model_selection import cross_validate

# applying CV on the TRAIN set.
results = cross_validate(grid_search, X_train, y_train, cv = 5)

In [15]:
# Displaying the results
results

{'fit_time': array([309.38619113, 309.35134697, 310.68474364, 311.03194427,
        316.32396913]),
 'score_time': array([0.00982594, 0.01464868, 0.00821614, 0.00979757, 0.01405931]),
 'test_score': array([-0.00049702, -0.00489605, -0.00074057, -0.00132709, -0.00293413])}

In [16]:
print(results['test_score'])
print("The average R2 over the folds is: {:.2f}".format(results['test_score'].mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(results['test_score'].std()))

[-0.00049702 -0.00489605 -0.00074057 -0.00132709 -0.00293413]
The average R2 over the folds is: -0.00
The standard deviation of R2 over the folds is: 0.00


In [17]:
# Fit the grid search to the test data
grid_search.fit(X_test, y_test)

In [19]:
# applying CV on the TEST set.
results = cross_validate(grid_search, X_test, y_test, cv = 5)

In [20]:
# results for the test dataset
results

{'fit_time': array([86.62606478, 86.08385849, 82.96865773, 83.6960988 , 83.66655254]),
 'score_time': array([0.00426245, 0.00851965, 0.00364184, 0.0066874 , 0.00752759]),
 'test_score': array([-0.00844337, -0.0049997 , -0.00807567, -0.01053685, -0.00155247])}

In [21]:
# results summary
print(results['test_score'])
print("The average R2 over the folds is: {:.2f}".format(results['test_score'].mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(results['test_score'].std()))

[-0.00844337 -0.0049997  -0.00807567 -0.01053685 -0.00155247]
The average R2 over the folds is: -0.01
The standard deviation of R2 over the folds is: 0.00


In [22]:
print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(grid_search.score(X_train, y_train)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(grid_search.score(X_test, y_test)))

The Accuracy for the Random Forest in the TRAIN set is -0.01
The Accuracy for the Random Forest in the TEST  set is 0.10


# Predictions using KNN-Regressor

In [14]:
# Apply KNN regression: GridsearchCV: n_neighbours=[ 2, 3,´4, 5 ], weights=["uniform", "distance"], cv=5,
knn_regressor = KNeighborsRegressor(n_neighbors=1)
knn_regressor.fit(X_train_df, y_train)
predictions = knn_regressor.predict(X_test)
 
# Evaluate the model
print('R2 Score:', knn_regressor.score(X_test, y_test))

Score: -0.9643423758821434


In [15]:
#Measuring Goodness of fit in Training data
from sklearn import metrics
print('R2 Value:',metrics.r2_score(y_train, knn_regressor.predict(X_train_df)))
 
#Measuring accuracy on Testing Data
print('Accuracy',100- (np.mean(np.abs((y_test - predictions) / y_test)) * 100))
 


R2 Value: 1.0
Accuracy -37.720771160710655
