In [12]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2

# Load data
df = pd.read_excel('Final.xlsx')

# Feature selection using information gain and chi-squared
X = df.drop('AACVPR_Risk_Category', axis=1)
y = df['AACVPR_Risk_Category']

# Information Gain (mutual information)
k_best = SelectKBest(score_func=mutual_info_classif, k='all')
k_best.fit(X, y)
scores = k_best.scores_
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': scores})

# Conditional Probability (chi-squared)
k_best_chi2 = SelectKBest(score_func=chi2, k='all')
k_best_chi2.fit(X, y)
chi2_scores = k_best_chi2.scores_
chi2_feature_scores = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_scores})

# Combine scores
combined_scores = pd.merge(feature_scores, chi2_feature_scores, on='Feature')
combined_scores = combined_scores.sort_values(by=['Score', 'Chi2 Score'], ascending=False)

# Display the top 10 features
top_k_features = combined_scores.head(10)
print(top_k_features)


                              Feature     Score  Chi2 Score
99                           Pre_METs  0.278396    7.208142
25                     Post_Peak_METs  0.223601    2.505397
100                    Pre_METs_range  0.193704    8.764590
82           Post_Medication_ACEI/ARB  0.110691    3.107889
63                   Past_CV_Prev PCI  0.108471    0.716558
94                           Post_BMI  0.069824    0.310394
27   Exercise_frequency_sessions_week  0.068787    0.986590
101              Post_Peak_METs_range  0.066431    1.795332
88      CR_Medication_Beta-antagonist  0.065765    1.986789
95                      Pre_BMI_range  0.063151    5.208528


In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=22)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200,300],
    'max_depth': [None, 10, 20, 30,40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with default cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, scoring='accuracy', cv=10)

# Perform grid search on the training dataset
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Predict on the test set
y_pred_test = best_rf_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters:", best_params)
print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.67      0.80      0.73        10
           1       0.88      0.44      0.58        16
           2       0.53      0.89      0.67         9

    accuracy                           0.66        35
   macro avg       0.69      0.71      0.66        35
weighted avg       0.73      0.66      0.65        35



In [14]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=22)

# Define the parameter grid for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 500, 1000],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(logreg_model, param_grid, scoring='accuracy', cv=5)

# Perform grid search on the training dataset
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_logreg_model = grid_search.best_estimator_

# Predict on the test set
y_pred_test = best_logreg_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters:", best_params)
print("Logistic Regression - Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 0.1, 'max_iter': 100}
Logistic Regression - Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.53      0.80      0.64        10
           1       1.00      0.06      0.12        16
           2       0.47      1.00      0.64         9

    accuracy                           0.51        35
   macro avg       0.67      0.62      0.47        35
weighted avg       0.73      0.51      0.40        35



In [15]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the MLPClassifier (Neural Network)
mlp_classifier = MLPClassifier(random_state=22, max_iter=5000)  # You might need to adjust max_iter based on convergence

# Define the parameter grid for grid search
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(mlp_classifier, param_grid, scoring='accuracy', cv=10)

# Perform grid search on the training dataset
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params_nn = grid_search.best_params_
best_nn_model = grid_search.best_estimator_

# Predict on the test set
y_pred_test_nn = best_nn_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters for Neural Network:", best_params_nn)
print("Neural Network - Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test_nn))


Best Parameters for Neural Network: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50)}
Neural Network - Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        10
           1       0.67      0.50      0.57        16
           2       0.31      0.44      0.36         9

    accuracy                           0.54        35
   macro avg       0.56      0.55      0.55        35
weighted avg       0.58      0.54      0.55        35



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load data
df = pd.read_excel('Final.xlsx')

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the Multinomial Naive Bayes classifier
naive_bayes = MultinomialNB()

# Define the parameter grid for grid search
param_grid = {
    'alpha': [0.0001,0.001,0.1, 0.5, 1.0, 2.0,3.0,4.0,5.0],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(naive_bayes, param_grid, scoring='accuracy', cv=10)

# Perform grid search on the training dataset
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_naive_bayes_model = grid_search.best_estimator_

# Predict on the test set
y_pred_test = best_naive_bayes_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters:", best_params)
print("Naive Bayes - Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


Best Parameters: {'alpha': 3.0}
Naive Bayes - Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.46      0.60      0.52        10
           1       0.50      0.06      0.11        16
           2       0.35      0.78      0.48         9

    accuracy                           0.40        35
   macro avg       0.44      0.48      0.37        35
weighted avg       0.45      0.40      0.32        35



In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the XGBClassifier
xgb_classifier = XGBClassifier(random_state=22)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(xgb_classifier, param_grid, scoring='accuracy', cv=10)

# Perform grid search on the training dataset
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params_xgb = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# Predict on the test set
y_pred_test_xgb = best_xgb_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters for XGBoost:", best_params_xgb)
print("XGBoost - Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test_xgb))


Best Parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50}
XGBoost - Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.57      0.25      0.35        16
           2       0.43      0.67      0.52         9

    accuracy                           0.54        35
   macro avg       0.55      0.61      0.54        35
weighted avg       0.56      0.54      0.51        35



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

# Select the top features based on the previous feature selection process
selected_features = ['Pre_METs', 'Post_Peak_METs', 'Pre_METs_range', 'Gender', 'Pre_Peak_Heart_Rate_range',
                      'Post_BMI', 'Exercise_frequency_mins_week', 'Pre_Medication_ACEI/ARB',
                      'Admission_Diagnosis_Other cardiothoracic procedures', 'Pre_Medication_Statin']

X = df[selected_features]
y = df['AACVPR_Risk_Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Initialize the LGBMClassifier
lgbm_classifier = LGBMClassifier(random_state=22)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_samples': [1, 3, 5],
}

# Initialize GridSearchCV
grid_search_lgbm = GridSearchCV(lgbm_classifier, param_grid, scoring='accuracy', cv=10)

# Perform grid search on the training dataset
grid_search_lgbm.fit(X_train, y_train)

# Get the best parameters and best model
best_params_lgbm = grid_search_lgbm.best_params_
best_lgbm_model = grid_search_lgbm.best_estimator_

# Predict on the test set
y_pred_test_lgbm = best_lgbm_model.predict(X_test)

# Display classification report for the test set
print("Best Parameters for LightGBM:", best_params_lgbm)
print("LightGBM - Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test_lgbm))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 9
[LightGBM] [Info] Start training from score -1.156720
[LightGBM] [Info] Start training from score -1.209364
[LightGBM] [Info] Start training from score -0.949081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 122
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 9
[LightGBM] [Info] Start training from score -1.156720
[LightGBM] [Info] Sta