In [21]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [22]:
# Read cleaned_data.csv (resolved relative to the working directory) into a DataFrame
df = pd.read_csv("cleaned_data.csv")

In [23]:
# Features: every column except the label ('Failure_status') and 'date'
X = df.drop(columns=['Failure_status', 'date'])
# Target: the 'Failure_status' column on its own
y = df['Failure_status']

In [24]:
# Preview the first five target values (head() defaults to n=5)
y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Failure_status, dtype: float64

In [25]:
# Preview the last five target values (tail() defaults to n=5)
y.tail()

2895    1.0
2896    0.0
2897    1.0
2898    0.0
2899    1.0
Name: Failure_status, dtype: float64

# Building ML models

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [27]:
# Map the target values to integer class indices.
# The encoder learns its classes from the training labels only and the same
# mapping is then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [28]:
# Candidate models to compare. Each entry pairs a display name with an
# estimator and the hyper-parameter grid that GridSearchCV explores for it.
# Estimators that use randomness get a fixed random_state so that a
# Restart & Run All reproduces the same scores.
classifiers = [
    {'name': 'Random Forest', 'model': RandomForestClassifier(random_state=42), 'params': {'n_estimators': [10, 50, 100]}},
    {'name': 'SVM', 'model': SVC(), 'params': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}},
    {'name': 'KNN', 'model': KNeighborsClassifier(), 'params': {'n_neighbors': [3, 5, 7]}},
    {'name': 'Decision Tree', 'model': DecisionTreeClassifier(random_state=42), 'params': {'max_depth': [None, 5, 10]}},
    # Empty grid: only the default configuration is cross-validated
    {'name': 'Naive Bayes', 'model': GaussianNB(), 'params': {}},
    {'name': 'AdaBoost', 'model': AdaBoostClassifier(random_state=42), 'params': {'n_estimators': [50, 100, 200]}},
    {'name': 'Ridge Regression', 'model': RidgeClassifier(), 'params': {'alpha': [0.1, 1, 10]}},
    # NOTE(review): Lasso is a regressor, not a classifier. Its predictions are
    # continuous, and a GridSearchCV without an explicit `scoring` ranks its
    # alphas with the estimator's own score (R^2 for regressors) rather than
    # accuracy, so its tuning is not directly comparable to the other entries.
    {'name': 'Lasso Regression', 'model': Lasso(), 'params': {'alpha': [0.1, 1, 10]}},
    # max_iter raised above the default of 100: the recorded run emitted
    # "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings for this model.
    # If the warning persists, scale the features as the warning suggests.
    {'name': 'Logistic Regression', 'model': LogisticRegression(max_iter=1000), 'params': {'C': [0.1, 1, 10]}},
    {'name': 'MLP', 'model': MLPClassifier(random_state=42), 'params': {'hidden_layer_sizes': [(50,), (100,), (50, 50)]}}
]

In [29]:
# Initialize a dictionary to store the results
results = {}

In [30]:
#Evaluate the models
def evaluate_classification(y_true, y_pred):
    """Score predictions against the true labels.

    Predictions are first binarised: values strictly greater than 0.5 become
    1, everything else becomes 0.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like supporting ``>`` and ``.astype``
        Model outputs to be thresholded.

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` is the text produced by
        ``classification_report``.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return (
        accuracy_score(y_true, hard_labels),
        classification_report(y_true, hard_labels),
    )


# Tune, fit and score every candidate model, recording the outcome in `results`
for classifier in classifiers:
    # 5-fold grid search over this model's parameter grid. With the default
    # refit=True, the search finishes by refitting the best parameter
    # combination on the whole training set.
    model = GridSearchCV(classifier['model'], classifier['params'], cv=5)
    model.fit(X_train, y_train)
    best_params = model.best_params_

    # Reuse the estimator the search already refitted instead of training the
    # same configuration a second time. This also leaves the template
    # estimator stored in `classifiers` untouched.
    best_model = model.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Accuracy and per-class report on both splits
    train_accuracy, train_classification_report = evaluate_classification(y_train, y_train_pred)
    test_accuracy, test_classification_report = evaluate_classification(y_test, y_test_pred)

    results[classifier['name']] = {
        'best_params': best_params,
        'train_accuracy': train_accuracy,
        'train_classification_report': train_classification_report,
        'test_accuracy': test_accuracy,
        'test_classification_report': test_classification_report
    }

    print(f"Train Accuracy for {classifier['name']}: {train_accuracy}")
    print(f"Train Classification Report for {classifier['name']}:\n{train_classification_report}")
    print(f"Test Accuracy for {classifier['name']}: {test_accuracy}")
    print(f"Test Classification Report for {classifier['name']}:\n{test_classification_report}")
    print()

Train Accuracy for Random Forest: 1.0
Train Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
           1       1.00      1.00      1.00      1916

    accuracy                           1.00      2320
   macro avg       1.00      1.00      1.00      2320
weighted avg       1.00      1.00      1.00      2320

Test Accuracy for Random Forest: 1.0
Test Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00       484

    accuracy                           1.00       580
   macro avg       1.00      1.00      1.00       580
weighted avg       1.00      1.00      1.00       580


Train Accuracy for SVM: 1.0
Train Classification Report for SVM:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Train Accuracy for Logistic Regression: 1.0
Train Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
           1       1.00      1.00      1.00      1916

    accuracy                           1.00      2320
   macro avg       1.00      1.00      1.00      2320
weighted avg       1.00      1.00      1.00      2320

Test Accuracy for Logistic Regression: 0.9982758620689656
Test Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        96
           1       1.00      1.00      1.00       484

    accuracy                           1.00       580
   macro avg       1.00      0.99      1.00       580
weighted avg       1.00      1.00      1.00       580


Train Accuracy for MLP: 1.0
Train Classification Report for MLP:
              precision    recall  f1-score   support

           0      

Saving the best model - Naive Bayes

In [31]:
# Re-split X and y, this time holding out 15% for testing (same seed as before).
# The labels returned here are the raw values from `y`; no label encoding is
# applied to them in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.15,
)

In [32]:

# GaussianNB is searched over an empty grid, so GridSearchCV cross-validates
# the single default configuration (5 folds) and, with the default refit=True,
# refits it on all of X_train.
model = GridSearchCV(GaussianNB(), {}, cv=5)
model.fit(X_train, y_train)
best_params = model.best_params_

# Evaluate the estimator the search already refitted rather than training a
# second, separate GaussianNB: the accuracies below then describe the same
# fitted estimator that `model` wraps.
best_model = model.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
train_acc = accuracy_score(y_train, y_train_pred)
print("Test accuracy: ", test_acc,"\n")
print("Train accuracy: ",train_acc)

Test accuracy:  0.9977011494252873 

Train accuracy:  0.9947261663286004


In [33]:
# Persist the fitted search object to disk with pickle.
# NOTE(review): `model` is the GridSearchCV wrapper, not the bare estimator.
# Only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('trained_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)