In [None]:
#@title Imports { display-mode: "form" }
import itertools
import os

import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, roc_curve, auc
from tqdm import tqdm

In [None]:
#@title Metric function { display-mode: "form" }
def getResult(y_test, y_pred):
  """Score predictions against the true labels.

  Parameters
  ----------
  y_test : array-like
      True labels; passed to scikit-learn as ``y_true``.
  y_pred : array-like
      Predicted labels (or scores); passed to scikit-learn as the
      prediction / score argument.

  Returns
  -------
  tuple
      ``(auc_score, acc_score)``: the area under the ROC curve built from
      ``roc_curve(y_test, y_pred)`` and ``accuracy_score(y_test, y_pred)``.
  """
  # No confusion matrix is computed here: its cells were never used, and
  # unpacking it into four names raised ValueError whenever the matrix was
  # not exactly 2x2.
  fpr, tpr, _ = roc_curve(y_test, y_pred)
  auc_score = auc(fpr, tpr)
  acc_score = accuracy_score(y_test, y_pred)
  return auc_score, acc_score

In [None]:
#@title Utility methods { display-mode: "form" }

def getEnsembledResultMulti(models, ground_truth, stratergy="AVG", model_1_weight=1, model_2_weight=1):
  """Combine several models' predictions and score the ensemble.

  Parameters
  ----------
  models : list of pandas.DataFrame
      One frame per ensemble member. "AVG" and "HARD" read each frame's
      ``y_pred`` column; "MAX_PROB" and "AVG_PROB" read ``y_prob_1``.
  ground_truth : pandas.DataFrame
      Frame whose ``HeartDisease`` column holds the true labels.
  stratergy : str
      How the per-model columns are combined row by row:
      "AVG" / "AVG_PROB" take the mean, "MAX_PROB" the maximum and
      "HARD" the most frequent value.
  model_1_weight, model_2_weight
      Accepted for backward compatibility; not used.

  Returns
  -------
  tuple
      ``(auc, accuracy)`` as returned by ``getResult``.

  Raises
  ------
  ValueError
      If ``stratergy`` is not one of the four supported names, if
      ``models`` is empty, or if the combined prediction contains NaN.
  """
  # strategy name -> (column read from every model frame, row-wise reducer)
  reducers = {
      "AVG": ("y_pred", lambda votes: votes.mean(axis=1)),
      "MAX_PROB": ("y_prob_1", lambda votes: votes.max(axis=1)),
      "AVG_PROB": ("y_prob_1", lambda votes: votes.mean(axis=1)),
      # mode(axis=1) returns a DataFrame with one column per tied value,
      # sorted ascending; column 0 resolves a tie to the smallest label.
      "HARD": ("y_pred", lambda votes: votes.mode(axis=1).iloc[:, 0]),
  }
  if stratergy not in reducers:
    raise ValueError(
        "Unknown ensembling strategy %r; expected one of %s"
        % (stratergy, sorted(reducers)))
  if len(models) == 0:
    raise ValueError("At least one model is required for ensembling")

  source_column, reduce_rows = reducers[stratergy]
  y_test = ground_truth["HeartDisease"]

  # One column per member, named "1", "2", ...; column assignment aligns
  # every later member on the first member's index.
  votes = pd.DataFrame()
  for position, member in enumerate(models, start=1):
    votes[str(position)] = member[source_column]

  combined = reduce_rows(votes)
  if combined.isna().any():
    raise ValueError("Combined prediction contains NaN; check the model prediction files")

  # Binarise only the combined column: > 0.5 becomes 1, everything else 0.
  y_pred = (combined > 0.5).astype(int)

  # getResult expects (truth, prediction), in that order. Note that the AUC
  # is therefore computed from the thresholded 0/1 labels.
  auc_score, acc_score = getResult(y_test, y_pred)
  return auc_score, acc_score

In [None]:
#@title Read Predictions data { display-mode: "form" }
# Mount Google Drive and load every saved prediction file, grouped by model family.
drive.mount('/content/drive/')
path = '/content/drive/Shareddrives/DA224-O/project/Result_excels/'
ground_truth_path = '/content/drive/Shareddrives/DA224-O/project/data/testset_groundtruth.csv'
# os.listdir returns entries in arbitrary order; sorting keeps the variant
# order (and hence the order rows are scored in) stable between runs.
files = sorted(os.listdir(path))
ground_truth = pd.read_csv(ground_truth_path)

# model family (text before the first "_" in the file name)
#   -> list of [predictions DataFrame, variant label]
model_data = {}
for name in files:
  file_path = os.path.join(path, name)
  # Anything that is not a regular file (e.g. a sub-directory) cannot be
  # read as a prediction table, so skip it instead of crashing in read_csv.
  if not os.path.isfile(file_path):
    continue

  splits = name.split("_")
  model_name = splits[0]

  # The variant label starts with the family name ...
  variant_name = model_name + "_"

  # ... svc files always contribute their second "_" piece, TabNet files
  # only when that piece does not start with "0." ...
  if model_name == "svc" or (model_name == "TabNet" and not splits[1].startswith("0.")):
    variant_name += splits[1] + "_"

  # ... and the last "_" piece, cut at its first ".", is appended unless
  # that piece is exactly ".csv".
  if splits[-1] != ".csv":
    variant_name += splits[-1].split(".")[0] + "_"

  model_predictions = pd.read_csv(file_path)
  model_data.setdefault(model_name, []).append([model_predictions, variant_name])

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
#@title Model Variants & Strategies { display-mode: "form" }
# Names of every model family that can take part in an ensemble.
models = [
    "TabNet",
    "AdaBoostClassifier",
    "CatBoost",
    "LGBMClassifier",
    "XGBClassifier",
    "DecisionTreeClassifier",
    "ExtraTreesClassifier",
    "KNeighborsClassifier",
    "LogisticRegression",
    "RandomForestClassifier",
    "GradientBoostingClassifier",
    "HistGradientBoostingClassifier",
    "svc",
]

# Ensembling strategies, in the order they are evaluated.
Strategies = ["AVG", "MAX_PROB", "AVG_PROB", "HARD"]

# Model combinations to evaluate: pairs first, then triples, then each
# model on its own. Every entry is a list of family names.
models_variants = (
    # TabNet paired with every other family
    [["TabNet", partner] for partner in (
        "AdaBoostClassifier",
        "CatBoost",
        "LGBMClassifier",
        "XGBClassifier",
        "DecisionTreeClassifier",
        "ExtraTreesClassifier",
        "KNeighborsClassifier",
        "svc",
        "RandomForestClassifier",
        "GradientBoostingClassifier",
        "HistGradientBoostingClassifier",
        "LogisticRegression",
    )]
    # LGBMClassifier paired with a selection of families
    + [["LGBMClassifier", partner] for partner in (
        "AdaBoostClassifier",
        "CatBoost",
        "XGBClassifier",
        "DecisionTreeClassifier",
        "ExtraTreesClassifier",
        "KNeighborsClassifier",
        "RandomForestClassifier",
    )]
    # three-model ensembles
    + [
        ["TabNet", "CatBoost", "LGBMClassifier"],
        ["TabNet", "HistGradientBoostingClassifier", "XGBClassifier"],
        ["TabNet", "HistGradientBoostingClassifier", "svc"],
        ["HistGradientBoostingClassifier", "svc", "KNeighborsClassifier"],
        ["KNeighborsClassifier", "LogisticRegression", "svc"],
    ]
    # individual models
    + [[solo] for solo in (
        "LGBMClassifier",
        "TabNet",
        "XGBClassifier",
        "DecisionTreeClassifier",
        "ExtraTreesClassifier",
        "KNeighborsClassifier",
        "CatBoost",
        "AdaBoostClassifier",
        "RandomForestClassifier",
        "HistGradientBoostingClassifier",
        "svc",
        "GradientBoostingClassifier",
        "LogisticRegression",
    )]
)

In [None]:
#@title Run Model Ensembling  { display-mode: "form" }
# Score every requested model combination and collect one row per
# (variant combination, strategy) in `data`, column by column.
data = {"Models Ensembled" : [], "Ensembling Strategy" : [], "AUC":[], "Accuracy":[]}
for global_variant in tqdm(models_variants):

  print('Running --> ', global_variant)

  # Nothing to score for an empty combination.
  if not global_variant:
    continue

  # Each family may have several saved variants (Ex: svc_lin, svc_rbf).
  # product() walks every way of picking one variant per family, in the
  # same order nested for-loops over the families would, and works for any
  # number of families.
  variant_pools = [model_data[family] for family in global_variant]
  for combination in itertools.product(*variant_pools):
    # Each entry is [predictions DataFrame, variant label].
    member_frames = [entry[0] for entry in combination]
    models_name = " + ".join(entry[1] for entry in combination)

    # A single model has nothing to combine: it is scored once through
    # "MAX_PROB" (its own thresholded y_prob_1) and the row is labelled
    # with the strategy that was actually applied.
    if len(member_frames) == 1:
      strategies_to_run = ["MAX_PROB"]
    else:
      strategies_to_run = Strategies

    for st in strategies_to_run:
      score = getEnsembledResultMulti(member_frames, ground_truth, st)
      data["Models Ensembled"].append(models_name)
      data["Ensembling Strategy"].append(st)
      data["AUC"].append(score[0])
      data["Accuracy"].append(score[1])

  0%|          | 0/37 [00:00<?, ?it/s]

Running -->  ['TabNet', 'AdaBoostClassifier']


  3%|▎         | 1/37 [00:02<01:47,  2.98s/it]

Running -->  ['TabNet', 'CatBoost']


  5%|▌         | 2/37 [00:06<01:48,  3.09s/it]

Running -->  ['TabNet', 'LGBMClassifier']


  8%|▊         | 3/37 [00:09<01:52,  3.31s/it]

Running -->  ['TabNet', 'XGBClassifier']


 11%|█         | 4/37 [00:12<01:39,  3.01s/it]

Running -->  ['TabNet', 'DecisionTreeClassifier']


 14%|█▎        | 5/37 [00:15<01:33,  2.93s/it]

Running -->  ['TabNet', 'ExtraTreesClassifier']


 16%|█▌        | 6/37 [00:17<01:29,  2.88s/it]

Running -->  ['TabNet', 'KNeighborsClassifier']


 19%|█▉        | 7/37 [00:19<01:11,  2.40s/it]

Running -->  ['TabNet', 'svc']


 22%|██▏       | 8/37 [00:26<01:51,  3.85s/it]

Running -->  ['TabNet', 'RandomForestClassifier']


 24%|██▍       | 9/37 [00:28<01:38,  3.52s/it]

Running -->  ['TabNet', 'GradientBoostingClassifier']


 27%|██▋       | 10/37 [00:31<01:28,  3.28s/it]

Running -->  ['TabNet', 'HistGradientBoostingClassifier']


 30%|██▉       | 11/37 [00:34<01:21,  3.13s/it]

Running -->  ['TabNet', 'LogisticRegression']


 32%|███▏      | 12/37 [00:35<01:04,  2.59s/it]

Running -->  ['LGBMClassifier', 'AdaBoostClassifier']


 35%|███▌      | 13/37 [00:36<00:45,  1.91s/it]

Running -->  ['LGBMClassifier', 'CatBoost']


 38%|███▊      | 14/37 [00:36<00:33,  1.44s/it]

Running -->  ['LGBMClassifier', 'XGBClassifier']


 41%|████      | 15/37 [00:36<00:24,  1.11s/it]

Running -->  ['LGBMClassifier', 'DecisionTreeClassifier']


 43%|████▎     | 16/37 [00:37<00:20,  1.01it/s]

Running -->  ['LGBMClassifier', 'ExtraTreesClassifier']


 46%|████▌     | 17/37 [00:38<00:18,  1.11it/s]

Running -->  ['LGBMClassifier', 'KNeighborsClassifier']


 49%|████▊     | 18/37 [00:38<00:13,  1.36it/s]

Running -->  ['LGBMClassifier', 'RandomForestClassifier']


 51%|█████▏    | 19/37 [00:39<00:13,  1.38it/s]

Running -->  ['TabNet', 'CatBoost', 'LGBMClassifier']


 54%|█████▍    | 20/37 [00:40<00:15,  1.08it/s]

Running -->  ['TabNet', 'HistGradientBoostingClassifier', 'XGBClassifier']


 57%|█████▋    | 21/37 [00:43<00:23,  1.49s/it]

Running -->  ['TabNet', 'HistGradientBoostingClassifier', 'svc']


 59%|█████▉    | 22/37 [00:57<01:18,  5.23s/it]

Running -->  ['HistGradientBoostingClassifier', 'svc', 'KNeighborsClassifier']


 62%|██████▏   | 23/37 [01:00<01:05,  4.68s/it]

Running -->  ['KNeighborsClassifier', 'LogisticRegression', 'svc']


 65%|██████▍   | 24/37 [01:02<00:49,  3.80s/it]

Running -->  ['LGBMClassifier']
Running -->  ['TabNet']


 70%|███████   | 26/37 [01:03<00:23,  2.15s/it]

Running -->  ['XGBClassifier']
Running -->  ['DecisionTreeClassifier']


 78%|███████▊  | 29/37 [01:03<00:08,  1.09s/it]

Running -->  ['ExtraTreesClassifier']
Running -->  ['KNeighborsClassifier']


 86%|████████▋ | 32/37 [01:03<00:02,  1.84it/s]

Running -->  ['CatBoost']
Running -->  ['AdaBoostClassifier']
Running -->  ['RandomForestClassifier']


 92%|█████████▏| 34/37 [01:04<00:01,  2.55it/s]

Running -->  ['HistGradientBoostingClassifier']
Running -->  ['svc']


 97%|█████████▋| 36/37 [01:04<00:00,  2.87it/s]

Running -->  ['GradientBoostingClassifier']
Running -->  ['LogisticRegression']


100%|██████████| 37/37 [01:04<00:00,  1.76s/it]


In [None]:
#@title Build & Save Ensemble Report  { display-mode: "form" }
# Turn the collected scores into a leaderboard: drop rows with a missing
# value, then rank by AUC and break ties by Accuracy, best first.
Report = (
    pd.DataFrame(data)
    .dropna()
    .sort_values(by=['AUC', 'Accuracy'], ascending=False, ignore_index=True)
)
# Persist the leaderboard (the row index is written as the first column).
Report.to_csv("EnsembleReport_CourseProject_V3.csv")

In [None]:
#@title Show Performance Plot  { display-mode: "form" }
# Draw both metrics on one explicitly shared axes. Each metric is sorted
# ascending on its own, so the x-axis is a rank position within that
# metric rather than a common ensemble id.
ReportAsc = Report.sort_values(by=['Accuracy'], ascending=True, ignore_index=True)
ax = ReportAsc['Accuracy'].plot(label='Accuracy')
ReportAsc = Report.sort_values(by=['AUC'], ascending=True, ignore_index=True)
ReportAsc['AUC'].plot(ax=ax, label='AUC')

# Label the figure so the two curves can be told apart.
ax.set_xlabel('Rank (each metric sorted ascending independently)')
ax.set_ylabel('Score')
ax.set_title('Ensemble performance')
ax.legend();