In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from scipy import stats
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
train_df = pd.read_csv("NHANES_Data_P1_MI_train.csv")
test_df = pd.read_csv("NHANES_Data_P1_MI_test.csv")
train_df

Unnamed: 0,ID,Sex,Age,Race,Income,Edu,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,CurrentSmoker,Insurance,Diabetes,MI
0,1,2,22,2,2.24,4.0,103.333333,70.666667,82.000000,23.8,52.0,,,130.0,123.841493,1,Medicaid,2,2.0
1,2,2,63,2,1.57,2.0,138.666667,74.666667,56.000000,27.4,65.0,103.0,108.0,194.0,83.002277,1,Private,2,2.0
2,3,2,18,2,0.10,,102.000000,59.666667,73.333333,18.5,45.0,,,132.0,135.154945,2,OtherGov,2,
3,4,2,74,7,5.00,4.0,,,,31.3,91.0,73.0,128.0,236.0,83.840405,2,Medicare,2,2.0
4,5,1,80,2,0.94,1.0,126.666667,62.000000,78.000000,29.4,,,,,,2,Medicare,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14465,14466,1,19,6,,,120.000000,61.333333,61.333333,26.0,58.0,,,176.0,124.511808,2,Private,2,
14466,14467,2,18,2,1.55,,104.666667,80.666667,86.000000,22.3,52.0,,,130.0,140.000000,2,Medicaid,2,
14467,14468,1,19,3,1.19,,127.333333,58.666667,88.000000,26.7,43.0,,,152.0,127.472242,2,OtherGov,2,
14468,14469,1,19,3,1.08,,106.333333,70.000000,,33.2,,,,,,2,Medicaid,2,


In [3]:
test_df

Unnamed: 0,ID,Cycle,Sex,Age,Race,Income,Edu,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,CurrentSmoker,Insurance,Diabetes
0,101,20172020,2,72,3,2.80,5,131.666667,59.666667,50.333333,31.5,58.0,,,208.0,68.024512,2,Medicare,1
1,102,20172020,2,39,2,2.90,3,121.666667,79.333333,88.333333,30.2,88.0,,,226.0,99.375157,2,Private,2
2,103,20172020,1,63,3,1.44,3,,,,25.4,51.0,179.0,146.0,229.0,97.988195,1,Medicare,2
3,104,2015,2,73,2,0.76,5,138.000000,52.666667,74.000000,30.3,48.0,,,219.0,33.441709,2,Medicaid,2
4,105,20172020,1,62,4,0.88,4,141.000000,90.333333,84.666667,28.6,61.0,73.0,84.0,161.0,92.840474,2,Uninsured,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,326,20172020,2,65,7,,4,,,,31.9,66.0,70.0,134.0,216.0,70.105080,1,Medicaid,1
226,327,20172020,1,66,4,1.91,4,118.666667,73.333333,91.000000,34.7,52.0,85.0,99.0,169.0,77.400865,2,Medicare,2
227,328,20172020,1,78,4,5.00,4,124.333333,65.000000,74.666667,27.0,44.0,66.0,78.0,136.0,87.419789,2,Medicare,1
228,329,20172020,1,48,3,0.87,4,122.000000,83.000000,73.000000,25.5,47.0,,,215.0,91.737171,2,Uninsured,2


In [4]:
# Clean data
train_df = train_df.drop(columns=['ID', 'Insurance', 'Edu','Income', 'Race' ], errors='ignore')
test_df = test_df.drop(columns=['ID', 'Insurance','Cycle','Edu','Income', 'Race'], errors='ignore')


In [5]:
#Drop rows with no predictions
train_df = train_df.dropna(subset=['MI'])

In [6]:
train_df

Unnamed: 0,Sex,Age,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,CurrentSmoker,Diabetes,MI
0,2,22,103.333333,70.666667,82.000000,23.8,52.0,,,130.0,123.841493,1,2,2.0
1,2,63,138.666667,74.666667,56.000000,27.4,65.0,103.0,108.0,194.0,83.002277,1,2,2.0
3,2,74,,,,31.3,91.0,73.0,128.0,236.0,83.840405,2,2,2.0
4,1,80,126.666667,62.000000,78.000000,29.4,,,,,,2,2,2.0
5,1,80,151.333333,50.000000,56.000000,29.1,60.0,163.0,96.0,189.0,74.298243,2,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14452,1,33,103.333333,76.000000,66.000000,26.6,56.0,,,205.0,89.920462,2,2,2.0
14453,2,58,,,,28.9,48.0,118.0,153.0,225.0,108.422494,2,1,2.0
14454,1,37,124.333333,73.333333,63.000000,32.0,,,,,,1,2,2.0
14455,1,59,126.000000,76.000000,54.333333,27.8,32.0,160.0,117.0,177.0,101.190176,1,2,2.0


In [7]:
test_df

Unnamed: 0,Sex,Age,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,CurrentSmoker,Diabetes
0,2,72,131.666667,59.666667,50.333333,31.5,58.0,,,208.0,68.024512,2,1
1,2,39,121.666667,79.333333,88.333333,30.2,88.0,,,226.0,99.375157,2,2
2,1,63,,,,25.4,51.0,179.0,146.0,229.0,97.988195,1,2
3,2,73,138.000000,52.666667,74.000000,30.3,48.0,,,219.0,33.441709,2,2
4,1,62,141.000000,90.333333,84.666667,28.6,61.0,73.0,84.0,161.0,92.840474,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2,65,,,,31.9,66.0,70.0,134.0,216.0,70.105080,1,1
226,1,66,118.666667,73.333333,91.000000,34.7,52.0,85.0,99.0,169.0,77.400865,2,2
227,1,78,124.333333,65.000000,74.666667,27.0,44.0,66.0,78.0,136.0,87.419789,2,1
228,1,48,122.000000,83.000000,73.000000,25.5,47.0,,,215.0,91.737171,2,2


In [8]:
# Fix class labels (2.0 -> 0, 1.0 -> 1)
train_df['MI'] = (train_df['MI'] == 1.0).astype(int)
train_df['MI']

0        0
1        0
3        0
4        0
5        0
        ..
14452    0
14453    0
14454    0
14455    0
14456    0
Name: MI, Length: 13768, dtype: int64

In [9]:
#Count distribution
print(f"Class distribution after fixing labels:")
print(train_df['MI'].value_counts())


Class distribution after fixing labels:
MI
0    13213
1      555
Name: count, dtype: int64


In [10]:
def create_medical_features(train_df):
    
    train_df_new = train_df.copy()
    
    # Ratios
    train_df_new['BP_Difference'] = train_df_new['Systolic'] - train_df_new['Diastolic']
    train_df_new['Cholesterol_Ratio'] = train_df_new['TCHOL'] / (train_df_new['HDL'] + 1e-8)
    train_df_new['LDL_HDL_Ratio'] = train_df_new['LDL'] / (train_df_new['HDL'] + 1e-8)
    
    # Age interactions
    train_df_new['Age_Systolic_Interaction'] = train_df_new['Age'] * train_df_new['Systolic']
    train_df_new['Age_BMI_Interaction'] = train_df_new['Age'] * train_df_new['BMI']
    
    # Categories
    train_df_new['Age_Senior'] = (train_df_new['Age'] >= 65).astype(int)
    train_df_new['BMI_Obese'] = (train_df_new['BMI'] >= 30).astype(int)
    train_df_new['BP_High'] = (train_df_new['Systolic'] >= 140).astype(int)
    
    # Cholesterol categories 
    train_df_new['TCHOL_High'] = (train_df_new['TCHOL'] >= 200).astype(int)
    train_df_new['HDL_Low'] = (train_df_new['HDL'] < 40).astype(int)
    train_df_new['LDL_High'] = (train_df_new['LDL'] >= 160).astype(int)
    
    # Lipid ratio categories
    train_df_new['Cholesterol_Ratio_High'] = (train_df_new['Cholesterol_Ratio'] >= 5).astype(int)
    train_df_new['LDL_HDL_Ratio_High'] = (train_df_new['LDL_HDL_Ratio'] >= 3).astype(int)
    
    return train_df_new

In [11]:
# Apply feature engineering
train_df_enhanced = create_medical_features(train_df)
test_df_enhanced = create_medical_features(test_df)

X_train = train_df_enhanced.drop(columns=['MI'])
y_train = train_df_enhanced['MI']
X_test = test_df_enhanced

In [12]:
X_train.columns

Index(['Sex', 'Age', 'Systolic', 'Diastolic', 'Pulse', 'BMI', 'HDL', 'Trig',
       'LDL', 'TCHOL', 'eGFR', 'CurrentSmoker', 'Diabetes', 'BP_Difference',
       'Cholesterol_Ratio', 'LDL_HDL_Ratio', 'Age_Systolic_Interaction',
       'Age_BMI_Interaction', 'Age_Senior', 'BMI_Obese', 'BP_High',
       'TCHOL_High', 'HDL_Low', 'LDL_High', 'Cholesterol_Ratio_High',
       'LDL_HDL_Ratio_High'],
      dtype='object')

In [13]:
y_train

0        0
1        0
3        0
4        0
5        0
        ..
14452    0
14453    0
14454    0
14455    0
14456    0
Name: MI, Length: 13768, dtype: int64

In [14]:
X_test

Unnamed: 0,Sex,Age,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,...,Age_Systolic_Interaction,Age_BMI_Interaction,Age_Senior,BMI_Obese,BP_High,TCHOL_High,HDL_Low,LDL_High,Cholesterol_Ratio_High,LDL_HDL_Ratio_High
0,2,72,131.666667,59.666667,50.333333,31.5,58.0,,,208.0,...,9480.000002,2268.0,1,1,0,1,0,0,0,0
1,2,39,121.666667,79.333333,88.333333,30.2,88.0,,,226.0,...,4745.000001,1177.8,0,1,0,1,0,0,0,0
2,1,63,,,,25.4,51.0,179.0,146.0,229.0,...,,1600.2,0,0,0,1,0,0,0,0
3,2,73,138.000000,52.666667,74.000000,30.3,48.0,,,219.0,...,10074.000000,2211.9,1,1,0,1,0,0,0,0
4,1,62,141.000000,90.333333,84.666667,28.6,61.0,73.0,84.0,161.0,...,8742.000000,1773.2,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2,65,,,,31.9,66.0,70.0,134.0,216.0,...,,2073.5,1,1,0,1,0,0,0,0
226,1,66,118.666667,73.333333,91.000000,34.7,52.0,85.0,99.0,169.0,...,7832.000002,2290.2,1,1,0,0,0,0,0,0
227,1,78,124.333333,65.000000,74.666667,27.0,44.0,66.0,78.0,136.0,...,9697.999997,2106.0,1,0,0,0,0,0,0,0
228,1,48,122.000000,83.000000,73.000000,25.5,47.0,,,215.0,...,5856.000000,1224.0,0,0,0,1,0,0,0,0


In [15]:
#Pipeline with imputer, scaler, variance threshold, univariate analysis
preprocessor = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler()),
    ("var_thresh", VarianceThreshold(threshold=0.00)),
    ("feature_sel", SelectKBest(score_func=f_classif, k=15))  
])

In [16]:
#Apply the preprocessing 
# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)

In [17]:
X_train_processed[:,:]

array([[ 0.96137858, -1.60924117, -1.18034312, ..., -0.46456947,
        -0.70118303, -0.44713564],
       [ 0.96137858,  0.72202709,  0.67965831, ..., -0.46456947,
        -0.70118303, -0.44713564],
       [ 0.96137858,  1.34748931,  2.39577282, ..., -0.46456947,
         1.42616116, -0.44713564],
       ...,
       [-1.04017295, -0.75633814, -0.07487057, ..., -0.46456947,
        -0.70118303, -0.44713564],
       [-1.04017295,  0.49458629,  0.01286534, ..., -0.46456947,
        -0.70118303,  2.23645784],
       [-1.04017295, -0.69947794, -0.53109734, ..., -0.46456947,
        -0.70118303,  2.23645784]], shape=(13768, 15))

In [18]:
# Create a 100/100 balanced test split frpm the train.csv file 
import numpy as np

def create_fixed_balanced_test_split(X, y, n_per_class=100, random_state=42, pos_label=1):
    rng = np.random.RandomState(random_state)
    y_series = y if hasattr(y, "iloc") else pd.Series(y)
    y_array = y_series.to_numpy()

    mi_idx = np.where(y_array == pos_label)[0]
    nomi_idx = np.where(y_array != pos_label)[0]

    n_each = min(n_per_class, len(mi_idx), len(nomi_idx))
    if n_each < n_per_class:
        print(f"Warning: not enough samples; using n_per_class={n_each}")

    test_mi_idx = rng.choice(mi_idx, size=n_each, replace=False)
    test_nomi_idx = rng.choice(nomi_idx, size=n_each, replace=False)
    test_idx = np.concatenate([test_mi_idx, test_nomi_idx])
    rng.shuffle(test_idx)

    all_idx = np.arange(len(y_array))
    train_mask = np.ones(len(y_array), dtype=bool)
    train_mask[test_idx] = False
    train_idx = all_idx[train_mask]

    X_train_fixed, X_test_fixed = X[train_idx], X[test_idx]
    y_train_fixed, y_test_fixed = y_series.iloc[train_idx], y_series.iloc[test_idx]

    print("Fixed balanced test split created:")
    print(f"  Train: {len(train_idx)} | Test: {len(test_idx)} (exact {2*n_each})")
    print(f"  Test MI: {y_test_fixed.sum()} | Test No-MI: {len(y_test_fixed) - y_test_fixed.sum()}")
    print(f"  Test MI rate: {y_test_fixed.mean():.4f} ({y_test_fixed.mean()*100:.2f}%)")
    return X_train_fixed, X_test_fixed, y_train_fixed, y_test_fixed

# Use on your processed features and labels
X_train_fixed, X_test_fixed, y_train_fixed, y_test_fixed = create_fixed_balanced_test_split(
    X_train_processed, y_train, n_per_class=100, random_state=42
)


print("\nTraining split class distribution:")
print(y_train_fixed.value_counts())
print("\nTest split class distribution:")
print(y_test_fixed.value_counts())

Fixed balanced test split created:
  Train: 13568 | Test: 200 (exact 200)
  Test MI: 100 | Test No-MI: 100
  Test MI rate: 0.5000 (50.00%)

Training split class distribution:
MI
0    13113
1      455
Name: count, dtype: int64

Test split class distribution:
MI
1    100
0    100
Name: count, dtype: int64


In [19]:


# Undersample majority to match minority (1:1)
equal_under = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
X_res_eq, y_res_eq = equal_under.fit_resample(X_train_fixed, y_train_fixed)

print(f"After undersampling: {X_res_eq.shape[0]} rows | MI rate={y_res_eq.mean():.3f}")


After undersampling: 910 rows | MI rate=0.500


In [20]:
# Train KNN
knn_eq = KNeighborsClassifier(n_neighbors=13, weights='distance', p=2)
knn_eq.fit(X_res_eq, y_res_eq)

In [21]:
# Evaluation utilities
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from scipy import stats
import numpy as np
import pandas as pd

def evaluate_split(name, y_true, y_pred, y_proba):
    # Overall accuracy
    acc = accuracy_score(y_true, y_pred)

    # Per-class metrics: index 0 -> MI(1), index 1 -> NoMI(0)
    # Order labels=[1,0] so we print MI first, then No-MI
    prec_arr, rec_arr, f1_arr, support = precision_recall_fscore_support(
        y_true, y_pred, labels=[1, 0], average=None, zero_division=0
    )
    prec_mi, prec_nomi   = prec_arr[0], prec_arr[1]
    rec_mi,  rec_nomi    = rec_arr[0],  rec_arr[1]
    f1_mi,   f1_nomi     = f1_arr[0],   f1_arr[1]
    sup_mi,  sup_nomi    = support[0],  support[1]

    # Confusion matrix (rows=actual, cols=pred) in the same label order [1,0]
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    # cm = [[TP, FN],
    #       [FP, TN]] with this ordering (actual 1 first row, actual 0 second row)
    TP, FN = cm[0, 0], cm[0, 1]
    FP, TN = cm[1, 0], cm[1, 1]

    # Per-class accuracy equals recall per class
    acc_mi   = TP / (TP + FN) if (TP + FN) > 0 else 0.0    # same as rec_mi (sensitivity)
    acc_nomi = TN / (TN + FP) if (TN + FP) > 0 else 0.0    # same as rec_nomi (specificity)

    # KL divergence between true and predicted Bernoulli distributions
    actual_pi = y_true.mean()
    pred_pi = y_proba.mean()
    actual_dist = np.array([1 - actual_pi, actual_pi])
    pred_dist   = np.array([1 - pred_pi,   pred_pi])
    kl = stats.entropy(actual_dist, pred_dist)

    print(f"\n=== {name} ===")
    print(f"Overall Accuracy: {acc:.4f}")
    print(f"Per-class Accuracy :  MI: {acc_mi:.4f} | No-MI: {acc_nomi:.4f}")
    print(f"MI(1)   — Precision: {prec_mi:.4f}  Recall: {rec_mi:.4f}  F1: {f1_mi:.4f}  Support: {sup_mi}")
    print(f"NoMI(0) — Precision: {prec_nomi:.4f}  Recall: {rec_nomi:.4f}  F1: {f1_nomi:.4f}  Support: {sup_nomi}")
    print("Confusion matrix (rows=[MI, NoMI], cols=[MI, NoMI]):\n", cm)
    print(f"KL Divergence (true vs predicted class distribution): {kl:.4f}")


    print("\nClassification report (sklearn default order [0,1]):\n",
          classification_report(y_true, y_pred, digits=4, zero_division=0))

    return {
        "overall_accuracy": acc,
        "acc_mi": acc_mi,
        "acc_nomi": acc_nomi,
        "precision_mi": prec_mi,
        "recall_mi": rec_mi,
        "f1_mi": f1_mi,
        "precision_nomi": prec_nomi,
        "recall_nomi": rec_nomi,
        "f1_nomi": f1_nomi,
        "kl_div": kl,
        "cm": cm,
    }

#Evaluate 1:1 undersampling KNN (knn_eq) on your fixed balanced test split
y_pred_eq  = knn_eq.predict(X_test_fixed)
y_proba_eq = knn_eq.predict_proba(X_test_fixed)[:, 1]
metrics_eq = evaluate_split("Undersampling KNN (k=9, distance, p=2)", y_test_fixed, y_pred_eq, y_proba_eq)


=== Undersampling KNN (k=9, distance, p=2) ===
Overall Accuracy: 0.7800
Per-class Accuracy :  MI: 0.7700 | No-MI: 0.7900
MI(1)   — Precision: 0.7857  Recall: 0.7700  F1: 0.7778  Support: 100
NoMI(0) — Precision: 0.7745  Recall: 0.7900  F1: 0.7822  Support: 100
Confusion matrix (rows=[MI, NoMI], cols=[MI, NoMI]):
 [[77 23]
 [21 79]]
KL Divergence (true vs predicted class distribution): 0.0032

Classification report (sklearn default order [0,1]):
               precision    recall  f1-score   support

           0     0.7745    0.7900    0.7822       100
           1     0.7857    0.7700    0.7778       100

    accuracy                         0.7800       200
   macro avg     0.7801    0.7800    0.7800       200
weighted avg     0.7801    0.7800    0.7800       200



In [22]:
# Predict on the provided test.csv (already preprocessed to X_test_processed)
# Choose which trained model to use; pick knn_cs if you want more MI emphasis
final_model = knn_eq

# Probabilities and predicted labels at threshold 0.5
test_proba = final_model.predict_proba(X_test_processed)[:, 1]
test_pred  = (test_proba >= 0.5).astype(int)

# Distribution on the unlabeled test set
pred_mi = int(test_pred.sum())
pred_nomi = int(len(test_pred) - pred_mi)
pred_rate = pred_mi / len(test_pred)
print("\n=== NHANES_Data_P1_MI_test.csv Prediction Distribution ===")
print(f"Total: {len(test_pred)} | MI: {pred_mi} | No-MI: {pred_nomi} | MI rate: {pred_rate:.4f}")

# Create submission file
submission_df = pd.DataFrame({
    "ID": np.arange(1, len(test_proba) + 1),
    "pred_probability": test_proba
})
submission_df.to_csv("MI_predictions_submission.csv", index=False)
print("Saved: MI_predictions_submission.csv")


# Round and save with 4 decimals
submission_df['pred_probability'] = submission_df['pred_probability'].round(4)
submission_df.to_csv('MI_predictions_submission.csv', index=False, float_format='%.4f')


=== NHANES_Data_P1_MI_test.csv Prediction Distribution ===
Total: 230 | MI: 103 | No-MI: 127 | MI rate: 0.4478
Saved: MI_predictions_submission.csv
