In [1]:
# Importing Libraries
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
# Data pre-processing: load both CSV files, clean them, and join them per equipment.


def _load_clean(csv_path, date_name):
    """Read `csv_path`, drop incomplete and duplicated rows, and parse its 'Date' column.

    The parsed 'Date' column is returned under the name `date_name` so that the
    two date columns stay distinguishable after the frames are merged.
    """
    return (
        pd.read_csv(csv_path)
        .dropna()
        .drop_duplicates()
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .rename(columns={'Date': date_name})
    )


sensor_df = _load_clean('sensor_data!.csv', 'Sensor_Date')
failure_df = _load_clean('failure_data!.csv', 'Failure_Date')

# Keep only equipment that appears in both frames.
# NOTE(review): the join key is Equipment_ID alone, so if an ID occurs on several
# rows of both frames, every sensor row is paired with every failure row of that
# ID -- confirm this row multiplication is intended.
merged_data = pd.merge(sensor_df, failure_df, how='inner', on='Equipment_ID')


In [4]:
# One-hot encoding: each listed column is replaced by its indicator columns
categorical_columns = [
    'Equipment_ID',
    'Maintenance_Type',
    'Issue_Description',
]
merged_data_encoded = pd.get_dummies(data=merged_data, columns=categorical_columns)

In [5]:
# Replace the two date columns with numerical year / month / day features
sensor_dates = pd.to_datetime(merged_data_encoded['Sensor_Date'])
failure_dates = pd.to_datetime(merged_data_encoded['Failure_Date'])

merged_data_encoded = (
    merged_data_encoded
    .assign(
        Sensor_Year=sensor_dates.dt.year,
        Sensor_Month=sensor_dates.dt.month,
        Sensor_Day=sensor_dates.dt.day,
        Failure_Year=failure_dates.dt.year,
        Failure_Month=failure_dates.dt.month,
        Failure_Day=failure_dates.dt.day,
    )
    # The raw date columns are dropped once their parts have been extracted
    .drop(columns=['Sensor_Date', 'Failure_Date'])
)


In [6]:

# Separate the label column ('Target') from the feature columns
y = merged_data_encoded['Target']
X = merged_data_encoded.drop(columns='Target')

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Train a 60-tree random forest on the training portion
rf_classifier = RandomForestClassifier(n_estimators=60, random_state=0).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", classification_rep, sep="\n")
print("Confusion Matrix:", confusion_mat, sep="\n")


Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

     Failure       0.83      0.83      0.83    762649
 Maintenance       0.83      0.83      0.83    748193

    accuracy                           0.83   1510842
   macro avg       0.83      0.83      0.83   1510842
weighted avg       0.83      0.83      0.83   1510842

Confusion Matrix:
[[634607 128042]
 [130153 618040]]


In [7]:
# Cross-validated accuracy of a depth-limited random forest.
# RandomForestClassifier and cross_val_score are already imported in the
# notebook's first cell, so they are not re-imported here.
#
# NOTE(review): this rebinds `rf_classifier`, so the model fitted in the previous
# cell is no longer reachable under that name -- confirm nothing later needs it.
# NOTE(review): max_depth=7 is set here but not on the classifier used for the
# hold-out evaluation, so these scores are not directly comparable to that accuracy.
rf_classifier = RandomForestClassifier(n_estimators=60, max_depth=7, random_state=0)

# 5-fold cross-validation over all rows of X / y; change `cv` to use a different number of folds
scores = cross_val_score(rf_classifier, X, y, cv=5)

# Per-fold scores followed by their mean
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())


Cross-Validation Scores: [0.58094427 0.61929838 0.61013925 0.61008431 0.61212622]
Mean Accuracy: 0.6065184843947945
