In [2]:
import pandas as pd
import numpy as np

train_beats = pd.read_csv("train_beats.csv")
train_patients = pd.read_csv("train_patients.csv")

test_beats = pd.read_csv("test_beats.csv")
test_patients = pd.read_csv("test_patients.csv")

print(train_beats.shape, train_patients.shape)
print(test_beats.shape, test_patients.shape)

(30991, 277) (200351, 277)
(37441, 277) (14481, 277)


In [3]:
train = np.genfromtxt("train_patients.csv", delimiter=",")
test  = np.genfromtxt("test_patients.csv", delimiter=",")

X_train = train[:, :-2]
y_train = train[:, -2]

X_test = test[:, :-2]
y_test = test[:, -2]

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Train
logreg.fit(X_train, y_train)

# Predict
y_pred = logreg.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Logistic Regression Accuracy: 0.8245408092804861

Classification Report:
               precision    recall  f1-score   support

         1.0       0.99      0.77      0.86      9363
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.57      0.95      0.71      1087
         5.0       0.00      0.00      0.00         0
         6.0       0.36      0.88      0.51       387
         7.0       0.73      0.81      0.77      1138
         8.0       0.97      0.99      0.98      2507

    accuracy                           0.82     14482
   macro avg       0.45      0.55      0.48     14482
weighted avg       0.92      0.82      0.86     14482


Confusion Matrix:
 [[7180   61   37  756  455  582  284    8]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   9    2    1 1032    5   18    9   11]
 [   0    0    0    0    0    0    0    0]
 [  21    4    0   11    2  340    9    

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.svm import SVC
# Initialize SVM classifier
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Train
svm_model.fit(X_train, y_train)

# Predict
y_pred = svm_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Compute permutation importance
result = permutation_importance(
    svm_model,              # trained SVM model
    X_test_scaled,          # test features (scaled)
    y_test,                 # true labels
    n_repeats=10,           # number of shuffling rounds
    random_state=42,
    scoring='accuracy'      # can also use 'f1_macro', etc.
)

# Get feature importance
importances = result.importances_mean
std = result.importances_std
feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]

# Sort features by importance
sorted_idx = importances.argsort()[::-1]

# Print feature importances
print("Feature importances (permutation):")
for i in sorted_idx:
    print(f"{feature_names[i]}: {importances[i]:.4f} Â± {std[i]:.4f}")

# Optional: plot the importances
plt.figure(figsize=(10,6))
plt.bar(range(len(importances)), importances[sorted_idx], yerr=std[sorted_idx])
plt.xticks(range(len(importances)), [feature_names[i] for i in sorted_idx], rotation=90)
plt.ylabel("Permutation Importance")
plt.title("Feature Importance (Permutation)")
plt.show()


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


models = {
    "LR": LogisticRegression(max_iter=500),
    "RF": RandomForestClassifier(n_estimators=300),
    "SVM": SVC(probability=True)
}

results = {}

for name, model in models.items():
    if name in ["LR", "SVM"]:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
    else:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name} accuracy: {acc:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR accuracy: 0.8227


KeyboardInterrupt: 

In [None]:
from sklearn.inspection import permutation_importance
for name, model in models.items():

    print("\n### Permutation Importance for", name)

    if name in ["LR", "SVM"]:
        r = permutation_importance(model, X_test_scaled, y_test, n_repeats=10)
        feature_names = X_train.columns
    else:
        r = permutation_importance(model, X_test, y_test, n_repeats=10)
        feature_names = X_train.columns

    sorted_idx = r.importances_mean.argsort()

    plt.figure(figsize=(6,8))
    plt.barh(feature_names[sorted_idx[-15:]], r.importances_mean[sorted_idx[-15:]])
    plt.title(f"Top 15 Permutation Importance: {name}")
    plt.show()

In [None]:
import shap
explainer = shap.TreeExplainer(models["XGB"])
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test)


In [None]:
explainer = shap.LinearExplainer(models["LR"], X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)


In [None]:
from lime.lime_tabular import LimeTabularExplainer

explainer = LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns,
    class_names=list(set(y_train)),
    mode='classification'
)

# explain one sample
exp = explainer.explain_instance(
    data_row=X_test.iloc[5],
    predict_fn=models["RF"].predict_proba
)

exp.show_in_notebook()


In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)


In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=len(set(y_train)))
clusters = kmeans.fit_predict(X_train)


In [None]:
import matplotlib.pyplot as plt

plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap='viridis')
plt.title("K-means Clusters (PCA-reduced)")
plt.show()
