# **Predictive Maintenance using Random Forest Classifier**

**Project Overview**

This project applies a supervised machine learning approach to predict machinery maintenance needs using sensor data. The goal is to identify whether a machine will require maintenance (binary classification) based on its operational features.

We use a Random Forest Classifier and fine-tune it using GridSearchCV to achieve high performance.



In [29]:
#importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns

In [30]:
#loading the data and reading it and making the target variable

df_raw = pd.read_csv('/content/Machine Downtime (1)copy.csv')
df = df_raw.copy()
df = df.drop(['Assembly_Line_No','Date'],axis = 1)
y = df.pop('Downtime')

display(y.value_counts())

Unnamed: 0_level_0,count
Downtime,Unnamed: 1_level_1
Machine_Failure,1265
No_Machine_Failure,1235


# **Preproccesing**

In [31]:
df.head()

Unnamed: 0,Machine_ID,Hydraulic_Pressure(bar),Coolant_Pressure(bar),Air_System_Pressure(bar),Coolant_Temperature,Hydraulic_Oil_Temperature(?C),Spindle_Bearing_Temperature(?C),Spindle_Vibration(?m),Tool_Vibration(?m),Spindle_Speed(RPM),Voltage(volts),Torque(Nm),Cutting(kN)
0,Makino-L1-Unit1-2013,71.04,6.933725,6.284965,25.6,46.0,33.4,1.291,26.492,25892.0,335.0,24.055326,3.58
1,Makino-L1-Unit1-2013,125.33,4.936892,6.196733,35.3,47.4,34.6,1.382,25.274,19856.0,368.0,14.20289,2.68
2,Makino-L3-Unit1-2015,71.12,6.839413,6.655448,13.1,40.7,33.0,1.319,30.608,19851.0,325.0,24.049267,3.55
3,Makino-L2-Unit1-2015,139.34,4.574382,6.560394,24.4,44.2,40.6,0.618,30.791,18461.0,360.0,25.860029,3.55
4,Makino-L1-Unit1-2013,60.51,6.893182,6.141238,4.1,47.3,31.4,0.983,25.516,26526.0,354.0,25.515874,3.55


In [32]:
df.columns
df.shape

(2500, 13)

In [33]:
machine_means = df.groupby('Machine_ID').mean()
display(machine_means)

Unnamed: 0_level_0,Hydraulic_Pressure(bar),Coolant_Pressure(bar),Air_System_Pressure(bar),Coolant_Temperature,Hydraulic_Oil_Temperature(?C),Spindle_Bearing_Temperature(?C),Spindle_Vibration(?m),Tool_Vibration(?m),Spindle_Speed(RPM),Voltage(volts),Torque(Nm),Cutting(kN)
Machine_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Makino-L1-Unit1-2013,100.220548,4.959537,6.501796,18.811137,47.562025,35.018234,0.995954,25.313381,20358.289805,349.413517,24.947256,2.774227
Makino-L2-Unit1-2015,103.403586,4.931144,6.509159,18.224658,47.740722,34.97311,1.020964,25.411773,20284.630597,347.386335,25.213688,2.79364
Makino-L3-Unit1-2015,100.714398,4.949339,6.486905,18.621744,47.557512,35.202211,1.012083,25.516793,20175.889841,350.139706,25.563235,2.780515


# **Imputation**


In [34]:
missing_values_by_machine = df.groupby('Machine_ID').apply(lambda x: x.isnull().sum())
display(missing_values_by_machine)

  missing_values_by_machine = df.groupby('Machine_ID').apply(lambda x: x.isnull().sum())


Unnamed: 0_level_0,Machine_ID,Hydraulic_Pressure(bar),Coolant_Pressure(bar),Air_System_Pressure(bar),Coolant_Temperature,Hydraulic_Oil_Temperature(?C),Spindle_Bearing_Temperature(?C),Spindle_Vibration(?m),Tool_Vibration(?m),Spindle_Speed(RPM),Voltage(volts),Torque(Nm),Cutting(kN)
Machine_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Makino-L1-Unit1-2013,0,4,5,7,3,5,2,6,6,1,1,8,1
Makino-L2-Unit1-2015,0,5,10,8,5,5,1,2,5,4,3,6,3
Makino-L3-Unit1-2015,0,1,4,2,4,6,4,3,0,1,2,7,3


In [35]:
for col in df.columns:
    if col != 'Machine_ID':
        for machine_id in df['Machine_ID'].unique():
            mean_value = machine_means.loc[machine_id, col]
            df.loc[df['Machine_ID'] == machine_id, col] = df.loc[df['Machine_ID'] == machine_id, col].fillna(mean_value)
display(df.isnull().sum())

Unnamed: 0,0
Machine_ID,0
Hydraulic_Pressure(bar),0
Coolant_Pressure(bar),0
Air_System_Pressure(bar),0
Coolant_Temperature,0
Hydraulic_Oil_Temperature(?C),0
Spindle_Bearing_Temperature(?C),0
Spindle_Vibration(?m),0
Tool_Vibration(?m),0
Spindle_Speed(RPM),0


In [36]:
df = df.drop('Machine_ID',axis = 1)

In [37]:
y = (y == "Machine_Failure").astype(int)
display(y.value_counts())

Unnamed: 0_level_0,count
Downtime,Unnamed: 1_level_1
1,1265
0,1235


In [38]:
from sklearn.model_selection import train_test_split
x = df
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)


# **Random Forest and Tuning**

In [39]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
print("CV Accuracy: ", scores.mean())


CV Accuracy:  0.985


In [40]:
from sklearn.metrics import classification_report, roc_auc_score

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       247
           1       0.98      1.00      0.99       253

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500

AUC: 0.9998639804131795


In [41]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
print("CV Accuracy:", scores.mean(), "+/-", scores.std())

CV Accuracy: 0.985 +/- 0.004472135954999571


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100,200],
    'max_depth': [10, 20],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=2
)

# Fit the grid search to your data
grid_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters: ", grid_search.best_params_)

# Best estimator accuracy
print("Best CV Score: ", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Score:  0.9875


In [43]:
best_model = grid_search.best_estimator_

# Evaluate on test set
test_accuracy = best_model.score(x_test, y_test)
print("Test Accuracy with best model: ", test_accuracy)

Test Accuracy with best model:  0.988


In [44]:
#checking for overfitting
train_acc = best_model.score(x_train, y_train)
print("Train Accuracy:", train_acc)



Train Accuracy: 0.9995


In [45]:
#Evaluation of model
from sklearn.metrics import confusion_matrix, classification_report

y_pred = best_model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[243   4]
 [  2 251]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       247
           1       0.98      0.99      0.99       253

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500



In [46]:
import joblib
joblib.dump(best_model, "random_forest_maintenance_model.pkl")


['random_forest_maintenance_model.pkl']

 Model used: Random Forest Classifier
- Accuracy achieved: ~98.8% on test data
- Balanced precision/recall scores
- Hyperparameter tuning improved the baseline accuracy from ~98.5% to ~98.8%