In [1]:
!pip install numpy pandas scikit-learn xgboost lightgbm catboost



In [2]:
import pandas as pd
import ssl
repo_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data"
ssl._create_default_https_context = ssl._create_unverified_context
df = pd.read_csv(repo_url, delimiter=',', header=None,)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# --- Assume df is already loaded ---

# Separate features and labels
df_data = df.iloc[:, :-1]
df_class = df.iloc[:, -1]

# Replace '?' with NaN and remove columns with >40% missing
df_data = df_data.replace('?', np.nan)
thresh = len(df_data) * 0.4
df_data.dropna(thresh=thresh, axis=1, inplace=True)

# Impute remaining missing values using median
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
df_data = pd.DataFrame(imp_mean.fit_transform(df_data))

# Encode class labels to zero-based indices
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_class)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df_data, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Standard scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compute class weights
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

# Define all models
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='rbf', class_weight=class_weight_dict, probability=True),
    "Random Forest": RandomForestClassifier(class_weight=class_weight_dict, random_state=42),
    "Logistic Regression": LogisticRegression(class_weight=class_weight_dict, max_iter=1000, random_state=42),
    "XGBoost": xgb.XGBClassifier(scale_pos_weight=class_weight_dict.get(1, 1), use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LightGBM": lgb.LGBMClassifier(class_weight=class_weight_dict, random_state=42),
    "CatBoost": CatBoostClassifier(class_weights=list(class_weight_dict.values()), verbose=0, random_state=42)
}

# Train & evaluate each model
for name, model in models.items():
    if name in ["KNN", "SVM", "Logistic Regression"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"\n{name} Results:")
    print("Accuracy:", acc)
    print("Classification Report:\n", report)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



KNN Results:
Accuracy: 0.5824175824175825
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.96      0.72        49
           1       0.00      0.00      0.00         9
           2       1.00      1.00      1.00         3
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       1.00      0.10      0.18        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.58        91
   macro avg       0.38      0.24      0.26        91
weighted avg       0.51      0.58      0.47        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM Results:
Accuracy: 0.5274725274725275
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.67      0.72        49
           1       0.26      0.78      0.39         9
           2       0.50      0.33      0.40         3
           3       0.75      1.00      0.86         3
           4       0.00      0.00      0.00         3
           5       0.20      0.20      0.20         5
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.60      0.30      0.40        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.53        91
   macro avg       0.26      0.27      0.25        91
weighted avg       0.56      0.53      0.52        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Random Forest Results:
Accuracy: 0.6923076923076923
Classification Report:
               precision    recall  f1-score   support

           0       0.65      1.00      0.79        49
           1       1.00      0.56      0.71         9
           2       0.60      1.00      0.75         3
           3       1.00      0.67      0.80         3
           4       0.00      0.00      0.00         3
           5       1.00      0.20      0.33         5
           6       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         2
           9       1.00      0.10      0.18        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.69        91
   macro avg       0.52      0.38      0.38        91
weighted avg       0.69      0.69      0.61        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression Results:
Accuracy: 0.6703296703296703
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.78      0.81        49
           1       0.54      0.78      0.64         9
           2       0.75      1.00      0.86         3
           3       0.67      0.67      0.67         3
           4       0.33      0.33      0.33         3
           5       0.25      0.20      0.22         5
           6       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       0.70      0.70      0.70        10
          10       0.00      0.00      0.00         1
          11       0.50      1.00      0.67         1
          12       0.00      0.00      0.00         4

    accuracy                           0.67        91
   macro avg       0.47      0.50      0.46        91
weighted avg       0.68      0.67      0.67        91



Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



XGBoost Results:
Accuracy: 0.7472527472527473
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.92      0.84        49
           1       0.78      0.78      0.78         9
           2       0.60      1.00      0.75         3
           3       0.60      1.00      0.75         3
           4       0.00      0.00      0.00         3
           5       0.80      0.80      0.80         5
           6       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         2
           9       0.80      0.40      0.53        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.75        91
   macro avg       0.45      0.49      0.45        91
weighted avg       0.69      0.75      0.70        91

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



LightGBM Results:
Accuracy: 0.6703296703296703
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.86      0.79        49
           1       0.55      0.67      0.60         9
           2       0.60      1.00      0.75         3
           3       0.25      0.33      0.29         3
           4       0.00      0.00      0.00         3
           5       0.80      0.80      0.80         5
           6       0.00      0.00      0.00         1
           8       0.67      1.00      0.80         2
           9       0.60      0.30      0.40        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.67        91
   macro avg       0.35      0.41      0.37        91
weighted avg       0.60      0.67      0.63        91


CatBoost Results:
Accuracy: 0.7582417582417582
Classificatio

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
from sklearn.decomposition import PCA

# Reduce dimensions using PCA (retain 95% variance)
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"\n🔻 PCA reduced from {X_train_scaled.shape[1]} to {X_train_pca.shape[1]} components")

print("\n🧠 Training with PCA-transformed data:")

for name, model in models.items():
    if name in ["KNN", "SVM", "Logistic Regression"]:
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
    else:
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"\n{name} (with PCA) Results:")
    print("Accuracy:", acc)
    print("Classification Report:\n", report)


🔻 PCA reduced from 278 to 93 components

🧠 Training with PCA-transformed data:

KNN (with PCA) Results:
Accuracy: 0.5824175824175825
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.96      0.72        49
           1       0.00      0.00      0.00         9
           2       1.00      1.00      1.00         3
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       1.00      0.10      0.18        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.58        91
   macro avg       0.38      0.24      0.26        91
weighted avg       0.51      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM (with PCA) Results:
Accuracy: 0.5494505494505495
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.69      0.73        49
           1       0.35      0.78      0.48         9
           2       0.67      0.67      0.67         3
           3       0.75      1.00      0.86         3
           4       0.00      0.00      0.00         3
           5       0.14      0.20      0.17         5
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.60      0.30      0.40        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.55        91
   macro avg       0.27      0.30      0.28        91
weighted avg       0.57      0.55      0.54        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Random Forest (with PCA) Results:
Accuracy: 0.5604395604395604
Classification Report:
               precision    recall  f1-score   support

           0       0.55      1.00      0.71        49
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       1.00      0.10      0.18        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.56        91
   macro avg       0.21      0.13      0.13        91
weighted avg       0.43      0.56      0.42        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression (with PCA) Results:
Accuracy: 0.6373626373626373
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.73      0.80        49
           1       0.64      0.78      0.70         9
           2       0.75      1.00      0.86         3
           3       0.67      0.67      0.67         3
           4       0.25      0.33      0.29         3
           5       0.40      0.40      0.40         5
           6       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       0.56      0.50      0.53        10
          10       0.00      0.00      0.00         1
          11       0.50      1.00      0.67         1
          12       0.00      0.00      0.00         4

    accuracy                           0.64        91
   macro avg       0.47      0.49      0.46        91
weighted avg       0.70      0.64      0.66        91



Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



XGBoost (with PCA) Results:
Accuracy: 0.6593406593406593
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.94      0.77        49
           1       0.50      0.33      0.40         9
           2       0.75      1.00      0.86         3
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         2
           9       0.83      0.50      0.62        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.66        91
   macro avg       0.40      0.34      0.35        91
weighted avg       0.57      0.66      0.59        91

[LightGBM] [Info] Auto-choosing col-wise multi-thre

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



CatBoost (with PCA) Results:
Accuracy: 0.6703296703296703
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.94      0.79        49
           1       0.50      0.67      0.57         9
           2       0.67      0.67      0.67         3
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         2
           9       0.80      0.40      0.53        10
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         4

    accuracy                           0.67        91
   macro avg       0.39      0.33      0.34        91
weighted avg       0.58      0.67      0.60        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
