In [21]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter
from imblearn.over_sampling import SMOTE

In [22]:
# import the dataset
df = pd.read_csv('data_a.csv')

In [23]:
df['stabf'] = df['stabf'].replace({'unstable': 0, 'stable': 1})

y = df['stabf'].values
# X = df.drop('stabf', axis=1).values
X = df.drop(['p1', 'p2', 'p3', 'p4', 'stab', 'stabf'], axis=1).values

print("Original class distribution:", Counter(y))
X, y = SMOTE(random_state=42).fit_resample(X, y)
X = np.array(X)
y = np.array(y)
print("After SMOTE class distribution:", Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

Original class distribution: Counter({0: 38280, 1: 21720})
After SMOTE class distribution: Counter({0: 38280, 1: 38280})


In [24]:
# Different models for comparision LR, DT, SVM, XGBoost, RF, and KNN
models = [
    (LogisticRegression(), "LogisticRegression"),
    (DecisionTreeClassifier(), "DecisionTree"),
    (make_pipeline(StandardScaler(), SVC(kernel='linear')), "SVM"),
    (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "XGBoost"),
    (RandomForestClassifier(), "Random Forest"),
    (make_pipeline(StandardScaler(), KNeighborsClassifier()), "KNN")
]

In [25]:
# to store the results of each model
results = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}
table = {}

In [26]:
# Function to do the evaluation of each model
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name=None):
    if not model_name:
        model_name = model.__class__.__name__
        if hasattr(model, 'steps'):  # Check if model is a pipeline
            # Set model_name to the last step of the pipeline
            model_name = model.steps[-1][1].__class__.__name__
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Calculating metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    
    # store metrics in the results dictionary
    results['Model'].append(model_name)
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1 Score'].append(f1)
    
    # Adding metrics to the table
    table[model_name] = (accuracy, precision, recall, f1)

In [27]:
for model, model_name in models:
    evaluate_model(model, X_train, y_train, X_test, y_test, model_name=model_name)

In [28]:
# Add the proposed model metrics from the main experiment
results['Model'].append('Proposed')
results['Accuracy'].append(0.995)
results['Precision'].append(0.995)
results['Recall'].append(0.995)
results['F1 Score'].append(0.995)

table['LSTM'] = (0.9955, 0.9955, 0.9955, 0.9955)

In [30]:
metrics_df = pd.DataFrame(table, index=["Accuracy", "Precision", "Recall", "F1 Score"])
metrics_df

Unnamed: 0,LogisticRegression,DecisionTree,SVM,XGBoost,Random Forest,KNN,LSTM
Accuracy,0.803357,0.91745,0.80401,0.98413,0.961272,0.946904,0.9955
Precision,0.803393,0.917456,0.804042,0.984143,0.961385,0.947249,0.9955
Recall,0.803357,0.91745,0.80401,0.98413,0.961272,0.946904,0.9955
F1 Score,0.803333,0.917448,0.803987,0.98413,0.961267,0.946886,0.9955


In [31]:
fig = make_subplots(rows=1, cols=4, subplot_titles=("Accuracy", "Precision", "Recall", "F1 Score"))

def format_metrics(metrics):
    return [f"{m:.3f}" for m in metrics]

# Add bar plots for each metric with exact values displayed
fig.add_trace(go.Bar(x=results['Model'], y=results['Accuracy'], name='Accuracy', text=format_metrics(results['Accuracy']), textposition='outside'), row=1, col=1)
fig.add_trace(go.Bar(x=results['Model'], y=results['Precision'], name='Precision', text=format_metrics(results['Precision']), textposition='outside'), row=1, col=2)
fig.add_trace(go.Bar(x=results['Model'], y=results['Recall'], name='Recall', text=format_metrics(results['Recall']), textposition='outside'), row=1, col=3)
fig.add_trace(go.Bar(x=results['Model'], y=results['F1 Score'], name='F1 Score', text=format_metrics(results['F1 Score']), textposition='outside'), row=1, col=4)

fig.update_layout(
                  title_text="Model Performance Comparison Across Metrics", 
                  height=500, 
                  width=1200,
                  showlegend=False,
                  uniformtext_minsize=8, 
                  uniformtext_mode='hide',
                  bargap=0.15) 

# Customize axis labels and set the y-axis range to fit your data if necessary
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=1, col=2)
fig.update_xaxes(title_text="Model", row=1, col=3)
fig.update_xaxes(title_text="Model", row=1, col=4)

fig.update_yaxes(title_text="Score", range=[0, 1.1], row=1, col=1)
fig.update_yaxes(range=[0, 1.1], row=1, col=2) 
fig.update_yaxes(range=[0, 1.1], row=1, col=3)
fig.update_yaxes(range=[0, 1.1], row=1, col=4)

fig.show()

In [37]:
# Sort the results for each metric
sorted_accuracy_results = sorted(zip(results['Model'], results['Accuracy']), key=lambda x: x[1])
sorted_precision_results = sorted(zip(results['Model'], results['Precision']), key=lambda x: x[1])
sorted_recall_results = sorted(zip(results['Model'], results['Recall']), key=lambda x: x[1])
sorted_f1_score_results = sorted(zip(results['Model'], results['F1 Score']), key=lambda x: x[1])

# Unzip the sorted results
sorted_accuracy_models, sorted_accuracy_values = zip(*sorted_accuracy_results)
sorted_precision_models, sorted_precision_values = zip(*sorted_precision_results)
sorted_recall_models, sorted_recall_values = zip(*sorted_recall_results)
sorted_f1_score_models, sorted_f1_score_values = zip(*sorted_f1_score_results)

# Create a subplots figure
fig = make_subplots(rows=1, cols=4, subplot_titles=("Accuracy", "Precision", "Recall", "F1 Score"))

# Add bar plots for each metric with exact values displayed
fig.add_trace(go.Bar(x=sorted_accuracy_models, y=sorted_accuracy_values, name='Accuracy', text=format_metrics(sorted_accuracy_values), textposition='outside'), row=1, col=1)
fig.add_trace(go.Bar(x=sorted_precision_models, y=sorted_precision_values, name='Precision', text=format_metrics(sorted_precision_values), textposition='outside'), row=1, col=2)
fig.add_trace(go.Bar(x=sorted_recall_models, y=sorted_recall_values, name='Recall', text=format_metrics(sorted_recall_values), textposition='outside'), row=1, col=3)
fig.add_trace(go.Bar(x=sorted_f1_score_models, y=sorted_f1_score_values, name='F1 Score', text=format_metrics(sorted_f1_score_values), textposition='outside'), row=1, col=4)

# Update layout and axis labels
fig.update_layout(title_text="",
                  height=500,
                  width=1200,
                  showlegend=False,
                  uniformtext_minsize=8,
                  uniformtext_mode='hide',
                  bargap=0.15)

# Customize axis labels and set the y-axis range to fit your data if necessary
fig.update_xaxes(title_text="", row=1, col=1)
fig.update_xaxes(title_text="", row=1, col=2)
fig.update_xaxes(title_text="", row=1, col=3)
fig.update_xaxes(title_text="", row=1, col=4)

fig.update_yaxes(title_text="Score", range=[0, 1.1], row=1, col=1)
fig.update_yaxes(range=[0, 1.1], row=1, col=2)
fig.update_yaxes(range=[0, 1.1], row=1, col=3)
fig.update_yaxes(range=[0, 1.1], row=1, col=4)

fig.show()



In [36]:
import plotly.io as pio
pio.write_image(fig, 'fig_ml_comparision.pdf')