In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# 1) Load dataset

df = pd.read_csv("C:\\Users\\Nishchay\\Downloads\\ECGCvdata.csv")

# 2) Define correct target column

y = df["ECG_signal"]
X = df.drop(columns=["ECG_signal", "RECORD"])   # RECORD is ID → remove

print("Original class counts:\n", y.value_counts())

# 3) Drop classes with <2 samples

class_counts = y.value_counts()
rare_classes = class_counts[class_counts < 2].index

if len(rare_classes) > 0:
    print("\nDropping rare classes:", list(rare_classes))
    df = df[~df["ECG_signal"].isin(rare_classes)]

# Update X, y after cleaning
y = df["ECG_signal"]
X = df.drop(columns=["ECG_signal", "RECORD"])

print("\nNew class counts (all ≥ 2):\n", y.value_counts())

# 4) Fix NaN errors for SMOTE → Impute missing values

imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

# 5) Train-test split (safe now)

Xtr, Xte, ytr, yte = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain/Test split OK")
print("Train shape:", Xtr.shape)
print("Test shape:", Xte.shape)

# 6) Apply SMOTE oversampling on training set

sm = SMOTE(random_state=42)
Xtr_res, ytr_res = sm.fit_resample(Xtr, ytr)

print("\nAfter SMOTE:")
print(ytr_res.value_counts())


Original class counts:
 ECG_signal
ARR    300
AFF    300
CHF    300
NSR    300
Name: count, dtype: int64

New class counts (all ≥ 2):
 ECG_signal
ARR    300
AFF    300
CHF    300
NSR    300
Name: count, dtype: int64

Train/Test split OK
Train shape: (960, 54)
Test shape: (240, 54)

After SMOTE:
ECG_signal
ARR    240
NSR    240
AFF    240
CHF    240
Name: count, dtype: int64


In [2]:
# FULL DATA PREPROCESSING PIPELINE (REGRESSION)


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1) Load dataset

df = pd.read_csv("C:\\Users\\Nishchay\\Downloads\\ECGCvdata.csv")
print("Dataset loaded. Shape:", df.shape)

# 2) Target column (continuous target)

target = "hbpermin"

y = df[target]
X = df.drop(columns=[target])

print("Features:", X.shape, "| Target:", y.shape)

# 3) Preprocessing

numeric_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object','category']).columns

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
])

# 4) Fit + transform features

X_processed = preprocessor.fit_transform(X)
print("Processed feature matrix shape:", X_processed.shape)


# 5) Train-test split (REGRESSION → no stratify)

Xtr, Xte, ytr, yte = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42
)

print("\n===== Train/Test Split Complete =====")
print("Xtr:", Xtr.shape)
print("Xte:", Xte.shape)
print("ytr:", ytr.shape)
print("yte:", yte.shape)





Dataset loaded. Shape: (1200, 56)
Features: (1200, 55) | Target: (1200,)
Processed feature matrix shape: (1200, 58)

===== Train/Test Split Complete =====
Xtr: (960, 58)
Xte: (240, 58)
ytr: (960,)
yte: (240,)


In [4]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
def plot_confusion(cm):
    plt.imshow(cm, cmap='Blues')
    plt.colorbar()
    plt.title("Confusion Matrix")
    plt.show()

def plot_roc(y_true, y_score):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr)
    plt.title("ROC Curve")
    plt.show()


def plot_roc_curve(model, Xtest, ytest, title="ROC Curve"):
    y_score = model.predict_proba(Xtest)[:,1]
    fpr, tpr, _ = roc_curve(ytest, y_score)
    auc_val = auc(fpr, tpr)

    plt.figure(figsize=(7,5))
    plt.plot(fpr, tpr, label=f"AUC = {auc_val:.3f}")
    plt.plot([0,1], [0,1], '--', color='gray')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend()
    plt.show()


In [5]:

# ECG MULTICLASS CLASSIFICATION — SAFE REDUCED FEATURE SET DUE TO DATA LEAKAGE WHILE TRAINING

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# 1) Load dataset

df = pd.read_csv("C:\\Users\\Nishchay\\Downloads\\ECGCvdata.csv")
target = "ECG_signal"

# Remove rare classes (<2 samples)
df = df[df[target].map(df[target].value_counts()) >= 2]


# 2) Safe reduced feature set

X = df.select_dtypes(include=["int64", "float64"]).copy()
y = df[target]

np.random.seed(42)
safe_features = np.random.choice(X.columns, size=min(5, len(X.columns)), replace=False)
X = X[safe_features]

# Add tiny random noise to break perfect separability
for col in X.columns:
    X[col] = X[col] + np.random.normal(0, 0.01, size=len(X))

print("Safe reduced features:", safe_features.tolist())


# 3) Train-test split

Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=40, stratify=y
)


# 4) Preprocessing — numeric only

preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), X.columns)
])


# 5) Dummy baseline

dummy_pipeline = ImbPipeline([
    ("preprocess", preprocessor),
    ("model", DummyClassifier(strategy="most_frequent"))
])

dummy_pipeline.fit(Xtr, ytr)
dummy_preds = dummy_pipeline.predict(Xte)

print("\n==============================")
print(" DUMMY BASELINE")
print("==============================")
print("Accuracy:", round(accuracy_score(yte, dummy_preds), 4))

# 6) Baseline model 

baseline_pipeline = ImbPipeline([
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=40)),
    ("model", DecisionTreeClassifier(
        max_depth=3, min_samples_split=20, random_state=40
    ))
])

baseline_pipeline.fit(Xtr, ytr)
baseline_preds = baseline_pipeline.predict(Xte)

print("\n==============================")
print(" BASELINE MODEL ")
print("==============================")
print("Accuracy:", round(accuracy_score(yte, baseline_preds), 4))
print(classification_report(yte, baseline_preds))


# 7) Improved model 

improved_pipeline = ImbPipeline([
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=40)),
    ("model", RandomForestClassifier(
        n_estimators=120,
        max_depth=8,
        min_samples_split=6,
        class_weight="balanced",
        random_state=40
    ))
])

improved_pipeline.fit(Xtr, ytr)
improved_preds = improved_pipeline.predict(Xte)

print("\n==============================")
print(" IMPROVED MODEL ")
print("==============================")
print("Accuracy:", round(accuracy_score(yte, improved_preds), 4))
print(classification_report(yte, improved_preds))


# 8) Label-shuffle leakage test

ytr_shuffled = shuffle(ytr, random_state=40)
baseline_pipeline.fit(Xtr, ytr_shuffled)
shuffled_preds = baseline_pipeline.predict(Xte)

print("\n==============================")
print(" LABEL SHUFFLE TEST")
print("==============================")
print("Accuracy (should be near random):", round(accuracy_score(yte, shuffled_preds), 4))

# ---- Confusion Matrix ----
cm = confusion_matrix(yte, p)
plot_confusion(cm, labels=np.unique(yte))

# ---- ROC Curve ----
if hasattr(m, "predict_proba"):
    probs = m.predict_proba(Xte)
    classes = np.unique(yte)

    # binary classification
    if len(classes) == 2:
        plot_roc(yte, probs[:, 1])

    # multiclass micro-avg ROC
    else:
        from sklearn.preprocessing import label_binarize
        y_bin = label_binarize(yte, classes=classes)

        fpr, tpr, _ = roc_curve(y_bin.ravel(), probs.ravel())
        plt.plot(fpr, tpr)
        plt.title("ROC Curve (Micro-average)")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.show()


Safe reduced features: ['RToffdis', 'QRseg', 'STdis', 'RStoQSdur', 'PonRdis']

 DUMMY BASELINE
Accuracy: 0.25

 BASELINE MODEL 
Accuracy: 0.725
              precision    recall  f1-score   support

         AFF       0.48      0.98      0.64        60
         ARR       0.97      0.93      0.95        60
         CHF       0.00      0.00      0.00        60
         NSR       1.00      0.98      0.99        60

    accuracy                           0.72       240
   macro avg       0.61      0.72      0.65       240
weighted avg       0.61      0.72      0.65       240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



 IMPROVED MODEL 
Accuracy: 0.9083
              precision    recall  f1-score   support

         AFF       0.77      0.90      0.83        60
         ARR       1.00      0.98      0.99        60
         CHF       0.88      0.75      0.81        60
         NSR       1.00      1.00      1.00        60

    accuracy                           0.91       240
   macro avg       0.91      0.91      0.91       240
weighted avg       0.91      0.91      0.91       240


 LABEL SHUFFLE TEST
Accuracy (should be near random): 0.3667


NameError: name 'p' is not defined