In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib

In [18]:
# Location of the combined feature table.
# NOTE(review): this is an absolute, machine-specific Windows path; repoint it
# (ideally to a path relative to the project) before running elsewhere.
DATA_PATH = r"E:\FYP\Cardi 2\Cardiovascular-Detection-using-ECG-images\combined_data.csv"

result_df = pd.read_csv(DATA_PATH)

# Features and target: every column except the last is a feature,
# the last column is the label.
X = result_df.iloc[:, :-1]
y = result_df.iloc[:, -1]  # integer column selection always yields a 1-D Series,
                           # so no flattening step is required

# One-hot encode the non-numeric feature columns (numeric columns pass through).
# The resulting column set depends on the category values present in this CSV,
# so any data scored later must be expanded to the same columns.
X = pd.get_dummies(X)

In [19]:
import joblib
from sklearn.preprocessing import LabelEncoder

# File that receives the fitted encoder.
ENCODER_PATH = "label_encoder.pkl"

# Learn the set of class labels from the target and map each one to an integer code.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Persist the fitted encoder for deployment.
joblib.dump(label_encoder, ENCODER_PATH)

['label_encoder.pkl']

In [20]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 40% of the rows for testing; stratifying on the encoded labels keeps
# the class proportions the same in both partitions.
split_options = dict(test_size=0.4, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Standardize features: the scaling statistics are learned from the training
# rows only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

# Apply PCA: a fractional n_components keeps as many components as are needed
# to explain that share (95%) of the variance. Fitted on the training rows only.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Persist both fitted transformers for deployment.
joblib.dump(scaler, "scaler.pkl")
joblib.dump(pca, "pca.pkl")

['pca.pkl']

In [21]:
# Baseline model: a random forest wrapped in a single-step Pipeline, which is
# why the hyperparameters below are addressed with the 'classifier__' prefix.
rf_pipeline = Pipeline(
    steps=[('classifier', RandomForestClassifier(random_state=42))]
)

# Search space: 2 x 3 x 2 = 12 candidate settings.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
}

# Exhaustive search with 3-fold cross-validation on all available cores.
# NOTE(review): X_train_pca was produced by a scaler and PCA fitted on the whole
# training set, so each validation fold has already influenced the
# preprocessing; the cross-validation scores may be slightly optimistic.
grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_pca, y_train)

# Keep the estimator refitted with the best-scoring setting.
best_model = grid.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [22]:
# Predict on the held-out, PCA-transformed test partition.
y_pred = best_model.predict(X_test_pca)

# Each entry pairs a heading with the metric function reported under it;
# every metric is computed from the same (y_test, y_pred) pair.
metric_reports = (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
)
for heading, metric in metric_reports:
    print(heading, metric(y_test, y_pred))

# Hyperparameter setting selected by the grid search.
print("\nBest Hyperparameters:", grid.best_params_)

Accuracy: 0.9997927890592623

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1212
           1       1.00      1.00      1.00       894
           2       1.00      1.00      1.00      1243
           3       1.00      1.00      1.00      1477

    accuracy                           1.00      4826
   macro avg       1.00      1.00      1.00      4826
weighted avg       1.00      1.00      1.00      4826


Confusion Matrix:
 [[1211    0    1    0]
 [   0  894    0    0]
 [   0    0 1243    0]
 [   0    0    0 1477]]

Best Hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}


In [23]:
# Persist the tuned estimator for deployment. It was trained on PCA-transformed
# features, so it must be fed inputs that went through the same scaler and PCA.
MODEL_PATH = "best_ecg_model.pkl"
joblib.dump(best_model, MODEL_PATH)

['best_ecg_model.pkl']