## **Waleed Umar (SP25-RAI-021)**

# **Paper Information**

**The paper selected was "Long-Term Coronary Artery Disease Risk Prediction with Machine Learning Models", published on 20 January 2023, in the HEC W Category ranked journal, Sensors.**

The paper is linked [here](https://www.mdpi.com/1424-8220/23/3/1193).

# **Restructuring the Code**

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

df = pd.read_csv('coronary_prediction.csv')

def impute_static_values(df):
    df['education'] = df['education'].fillna(1)
    df['totChol'] = df['totChol'].fillna(236)
    df['BMI'] = df['BMI'].fillna(25.8)
    df['heartRate'] = df['heartRate'].fillna(75)
    return df[['education', 'totChol', 'BMI', 'heartRate']]

def impute_cigs_per_day(df):
    df['cigsPerDay'] = df['cigsPerDay'].fillna(9999)
    df.loc[df['cigsPerDay'] == 9999, 'cigsPerDay'] = df['currentSmoker'].apply(lambda x: 18 if x == 1 else 0)
    return df[['cigsPerDay']]

def impute_bpmeds(df):
    df['BPMeds'] = df['BPMeds'].fillna(9999)
    df.loc[df['BPMeds'] == 9999, 'BPMeds'] = df['prevalentHyp'].apply(lambda x: 1 if x == 1 else 0)
    return df[['BPMeds']]

def impute_glucose(df):
    df['glucose'] = df['glucose'].fillna(9999)
    df.loc[df['glucose'] == 9999, 'glucose'] = df['diabetes'].apply(lambda x: 170 if x == 1 else 79)
    return df[['glucose']]

preprocessor = ColumnTransformer(
    transformers=[
        ('static', FunctionTransformer(impute_static_values), ['education', 'totChol', 'BMI', 'heartRate']),
        ('cigs_per_day', FunctionTransformer(impute_cigs_per_day), ['cigsPerDay', 'currentSmoker']),
        ('bpmeds', FunctionTransformer(impute_bpmeds), ['BPMeds', 'prevalentHyp']),
        ('glucose', FunctionTransformer(impute_glucose), ['glucose', 'diabetes'])
    ],
    remainder='passthrough'  
)

models = {
    "3-NN": KNeighborsClassifier(n_neighbors=3, algorithm='auto', metric='euclidean'),
    "Bagging": BaggingClassifier(RandomForestClassifier(n_estimators=500, random_state=42), n_estimators=10, random_state=42),
    "Decision Tree (J48)": DecisionTreeClassifier(random_state=42, criterion='gini', splitter='best', max_depth=None),
    "Logistic Regression (LR)": LogisticRegression(C=1e8, solver='lbfgs', max_iter=3000, random_state=42),
    "MLP (Multilayer Perceptron)": MLPClassifier(learning_rate_init=0.1, momentum=0.2, max_iter=200, random_state=42),
    "Naive Bayes (NB)": GaussianNB(),
    "Random Forest (RF)": RandomForestClassifier(n_estimators=500, oob_score=True, random_state=42, max_samples=1.0),
     "Stacking": StackingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('nb', GaussianNB())
    ], final_estimator=LogisticRegression(C=1e8, solver='lbfgs', random_state=42)),
    "Voting": VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('nb', GaussianNB())
    ], voting='soft')
   
}

def evaluate_metrics(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None
    }
    return metrics

results = {"Original Data": {}, "SMOTE Data": {}}

X = df.drop('TenYearCHD', axis=1)  
y = df['TenYearCHD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# FOR SMOTE ON WHOLE DATA
X_processed = preprocessor.fit_transform(X)

smote = SMOTE(k_neighbors=5, random_state=42)
X_smote, y_smote = smote.fit_resample(X_processed, y)

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)
    
for name, model in models.items():
    # Original data pipeline
    original_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('classifier', model)
    ])
    results["Original Data"][name] = evaluate_metrics(original_pipeline, X_train, X_test, y_train, y_test)
    
   # for smote stuff
    results["SMOTE Data"][name] = evaluate_metrics(model, X_train_smote, X_test_smote, y_train_smote, y_test_smote)

# Converting the results to DataFrames
original_results_df = pd.DataFrame(results["Original Data"]).T
smote_results_df = pd.DataFrame(results["SMOTE Data"]).T


print("Original Data Results:")
print(original_results_df)
print("\nSMOTE Data Results:")
print(smote_results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Original Data Results:
                             Accuracy  Precision    Recall       AUC
3-NN                         0.812500   0.213115  0.104839  0.559426
Bagging                      0.853774   0.500000  0.072581  0.691956
Decision Tree (J48)          0.737028   0.163265  0.193548  0.511829
Logistic Regression (LR)     0.859670   0.666667  0.080645  0.704732
MLP (Multilayer Perceptron)  0.853774   0.000000  0.000000  0.500000
Naive Bayes (NB)             0.824292   0.313433  0.169355  0.711326
Random Forest (RF)           0.856132   0.555556  0.080645  0.685534
Stacking                     0.853774   0.500000  0.096774  0.684816
Voting                       0.824292   0.295082  0.145161  0.691120

SMOTE Data Results:
                             Accuracy  Precision    Recall       AUC
3-NN                         0.814325   0.723586  0.988338  0.907344
Bagging                      0.901947   0.913505  0.877551  0.962532
Decision Tree (J48)          0.840751   0.815172  0.861516 