In [1]:
import os

# Native OpenMP/BLAS runtimes generally read OMP_NUM_THREADS once, when they are
# first initialised. The cap is therefore exported BEFORE numpy, scikit-learn
# and TensorFlow are imported; assigning it after those imports (as this cell
# previously did) is not guaranteed to have any effect.
os.environ["OMP_NUM_THREADS"] = '1'

import json
from typing import Optional

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from scipy.io import arff
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Report the runtime so the notebook's outputs can be tied to an environment
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available", len(tf.config.list_physical_devices('GPU')))

TensorFlow version: 2.10.0
Num GPUs Available 1


In [14]:
# Directory prefix under which the saved models and history files live
path = "models/"

# File-name stems of the two models being compared; "_model.h5" and
# "_history.json" are appended to these when the artefacts are loaded.
model1_name, model2_name = "best_relu", "grid_search"

In [15]:
# Load the .arff file into a DataFrame
data, meta = arff.loadarff('dataset/Dry_Bean_Dataset.arff')
df = pd.DataFrame(data)

# The 'Class' values arrive as byte strings; decode them to readable text
df['Class'] = df['Class'].str.decode('utf-8')

# Basic data info. DataFrame.info() prints its report itself and returns None,
# so it is called directly: wrapping it in print() appended a stray "None"
# line to the cell output.
df.info()  # check data types and ensure correct loading
print(df['Class'].value_counts())  # check class distribution

# Display the first few rows to understand feature values and ranges
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  float64
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRation     13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  float64
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
 16  Class            13611 non-null  object 
dtypes: float64(16), object(1)

In [16]:
# Drop the four ShapeFactor features. Reassigning the result (instead of
# inplace=True) together with errors="ignore" keeps this cell idempotent:
# re-running it no longer raises KeyError once the columns are already gone.
df = df.drop(
    columns=['ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4'],
    errors="ignore",
)

In [18]:
# Mean Imputation
def mean_imputation(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values with the mean of each numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame in which missing values in numeric columns are replaced
        by that column's mean. Non-numeric columns are returned unchanged,
        because no mean is computed for them.
    """
    # numeric_only=True: pandas >= 2.0 raises TypeError when asked for the mean
    # of an object (e.g. string label) column instead of skipping it.
    column_means = df.mean(numeric_only=True)
    # fillna returns a new object, so the caller's frame is left untouched.
    return df.fillna(column_means)

# Median Imputation
def median_imputation(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values with the median of each numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame in which missing values in numeric columns are replaced
        by that column's median. Non-numeric columns are returned unchanged,
        because no median is computed for them.
    """
    # numeric_only=True: pandas >= 2.0 raises TypeError when asked for the
    # median of an object (e.g. string label) column instead of skipping it.
    column_medians = df.median(numeric_only=True)
    # fillna returns a new object, so the caller's frame is left untouched.
    return df.fillna(column_medians)

# Mode Imputation
def mode_imputation(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values with the mode (most frequent value) of each column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which each column's missing values are replaced by
        the first mode of that column. A column with no non-missing values has
        no mode and is left as it is.
    """
    imputed_df = df.copy()
    for column in imputed_df.columns:
        modes = imputed_df[column].mode()
        # An all-missing column yields an empty mode Series; indexing it would
        # raise, so such columns are skipped.
        if modes.empty:
            continue
        # Assign the filled column back explicitly. Calling fillna(inplace=True)
        # on the imputed_df[column] selection is chained assignment, which does
        # not update the frame under pandas Copy-on-Write.
        imputed_df[column] = imputed_df[column].fillna(modes.iloc[0])
    return imputed_df

# K-Nearest Neighbors (KNN) Imputation
def knn_imputation(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame:
    """Impute missing values using K-Nearest Neighbors.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame passed to ``KNNImputer.fit_transform``.
        NOTE(review): presumably this must contain numeric columns only, so a
        string label column would need to be dropped first -- confirm.
    n_neighbors : int, default 5
        Number of neighbours forwarded to ``KNNImputer``.

    Returns
    -------
    pd.DataFrame
        The imputed values, carrying the column labels and the row index of
        ``df``.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_array = imputer.fit_transform(df)
    # Keep the caller's index: rebuilding the frame with a fresh RangeIndex
    # would misalign it against the original rows/target whenever df has been
    # filtered or otherwise carries a non-default index.
    return pd.DataFrame(imputed_array, columns=df.columns, index=df.index)

# Constant Imputation
def constant_imputation(df: pd.DataFrame, fill_value: Optional[float] = 0) -> pd.DataFrame:
    """Replace every missing value with one constant.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    fill_value : float, default 0
        Value written wherever ``df`` has a missing entry.

    Returns
    -------
    pd.DataFrame
        A new frame with the missing entries filled.
    """
    # fillna already returns a new frame, so no explicit copy is needed first.
    return df.fillna(value=fill_value)

def drop_na(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the rows that contain any missing value.

    The input frame is not modified; surviving rows keep their original index
    labels.
    """
    # dropna returns a new frame, so the caller's data is left untouched.
    return df.dropna(axis="index", how="any")

# Working copy of the loaded frame. None of the imputation helpers above is
# applied here (the df.info() output earlier reports no missing values).
imputed_df = df.copy(deep=True)

In [19]:
# Encode the target variable as integer labels. The encoded labels live in
# their own Series instead of being written back into imputed_df['Class'], so
# re-running this cell does not re-fit the encoder on already-encoded integers
# (which would replace the class names in label_encoder.classes_ with numbers
# and strip the names from the reports built from them later).
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(imputed_df['Class']),
    index=imputed_df.index,
    name='Class',
)

# Separate the feature columns from the target
X = imputed_df.drop(columns=['Class'])

# Split into training and test sets (80-20 split), stratified on the target
# and seeded so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the feature columns (fit on train only, transform both splits)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Restore the two saved models from their .h5 files
model1 = load_model(f"{path}{model1_name}_model.h5")
model2 = load_model(f"{path}{model2_name}_model.h5")

# Read the training history stored as JSON next to each model
with open(f"{path}{model1_name}_history.json", 'r') as history_file:
    model1_history = json.load(history_file)

with open(f"{path}{model2_name}_history.json", 'r') as history_file:
    model2_history = json.load(history_file)

In [21]:
# Predicted label per test sample: index of the largest value in each row of
# the model's output
y_pred1 = np.argmax(model1.predict(X_test), axis=1)
y_pred2 = np.argmax(model2.predict(X_test), axis=1)

# Classification reports as dictionaries, labelled with the encoder's classes
report_kwargs = dict(output_dict=True, target_names=label_encoder.classes_)
report1 = classification_report(y_test, y_pred1, **report_kwargs)
report2 = classification_report(y_test, y_pred2, **report_kwargs)



In [22]:
# Convert classification reports to DataFrames (one row per report entry),
# then drop the last three rows and the last column by position.
# NOTE(review): with sklearn's report layout these are presumably the
# 'accuracy' / 'macro avg' / 'weighted avg' rows and the 'support' column,
# leaving per-class precision, recall and f1-score -- confirm.
report1_df = pd.DataFrame(report1).transpose().iloc[:-3, :-1]
report2_df = pd.DataFrame(report2).transpose().iloc[:-3, :-1]

# Signed difference: positive cells mean Model 1 scored higher than Model 2
difference_df = report1_df - report2_df

# Cell labels formatted to three decimal places. Series.map is applied column
# by column instead of DataFrame.applymap, which is deprecated in recent
# pandas releases; the resulting frame of strings is the same.
formatted_text = difference_df.apply(lambda col: col.map(lambda x: f"{x:.3f}"))

# Heatmap of the metric differences. The diverging scale is centred on 0 so
# the neutral colour means "no difference" and the two hues indicate which
# model is ahead; without the midpoint the centre follows the data range.
fig = px.imshow(
    difference_df,
    text_auto=False,  # cell text is supplied manually below
    color_continuous_scale="RdBu",
    color_continuous_midpoint=0,
    title="Difference in Classification Metrics (Model1 - Model2)"
)

# Add the formatted text to the cells and set its font size
fig.update_traces(
    text=formatted_text.values,
    texttemplate="%{text}",
    textfont_size=12,
)

# Update layout for readability
fig.update_layout(
    xaxis_title="Metrics",
    yaxis_title="Class",
    width=800,
    height=600,
    font=dict(size=12)
)

# Rotate x-axis labels for better readability
fig.update_xaxes(tickangle=-45)

fig.show()

In [None]:
# One grouped bar chart per model showing its absolute per-class metrics
# (the columns kept in report1_df / report2_df), one bar colour per metric.
for model_no, report_df in ((1, report1_df), (2, report2_df)):
    value_col = f"Model{model_no}"
    # Long format: one row per (class, metric) pair, as px.bar expects
    long_df = report_df.reset_index().melt(
        id_vars='index', var_name='Metric', value_name=value_col
    )
    fig = px.bar(
        long_df,
        x='index',
        y=value_col,
        color='Metric',
        barmode='group',
        title=f"Classification Metrics for Model {model_no}",
    )
    fig.show()

In [24]:
# Class names in encoder order; used to label both axes so the matrices are
# readable without looking up which integer stands for which class.
class_names = list(label_encoder.classes_)
# Passing the full label set pins each matrix to len(class_names) rows/columns
# even if some class never occurs in y_test or in a model's predictions.
class_ids = np.arange(len(class_names))

# Confusion matrices (rows: true label, columns: predicted label)
conf_matrix1 = confusion_matrix(y_test, y_pred1, labels=class_ids)
conf_matrix2 = confusion_matrix(y_test, y_pred2, labels=class_ids)

# Plot confusion matrices
fig1 = px.imshow(conf_matrix1, x=class_names, y=class_names, text_auto=True,
                 color_continuous_scale="Blues",
                 title="Confusion Matrix for Model 1",
                 labels=dict(x="Predicted Label", y="True Label"))
fig2 = px.imshow(conf_matrix2, x=class_names, y=class_names, text_auto=True,
                 color_continuous_scale="Blues",
                 title="Confusion Matrix for Model 2",
                 labels=dict(x="Predicted Label", y="True Label"))

fig1.show()
fig2.show()

In [25]:
# Compare the two models' validation curves taken from the loaded histories
# (model1_history / model2_history): one figure for accuracy, one for loss.
for metric_label in ('Accuracy', 'Loss'):
    history_key = f"val_{metric_label.lower()}"  # 'val_accuracy' / 'val_loss'
    fig = go.Figure()
    for model_no, history in ((1, model1_history), (2, model2_history)):
        fig.add_trace(go.Scatter(
            y=history[history_key],
            mode='lines',
            name=f'Model {model_no} Validation {metric_label}',
        ))
    fig.update_layout(
        title=f'Validation {metric_label} Comparison',
        xaxis_title='Epoch',
        yaxis_title=metric_label,
    )
    fig.show()