# Note:
- 3-year survival binary classification using candidate and donor features (scaled). After all encoding, every feature whose Max Gini Impurity is below 5% is excluded.
- Remove NaNs from numerical features

## Import Libraries

In [None]:
# path to user functions
import sys  
sys.path.append('../Src/')

from platform import python_version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import importlib 
from sklearn.preprocessing import OrdinalEncoder

# import user functions
import UserUtilityFunctions as uf
import UserStatisticalFunctions as usf
import UserVisualization as uv
import UserMetricsFunctions as umf
import UserFeatureSelection as ufs
import HyperParameters as parms

# set seaborn theme
sns.set_theme()

# initializing variables
RANDOM_STATE = 1776

# report the library and interpreter versions used for this run
print(f"Numpy Version: {np.__version__}")
print(f"Pandas Version: {pd.__version__}")
print(f"Seaborn Version: {sns.__version__}")
print(f"Matplotlib Version: {plt.matplotlib.__version__}")
print(f"Python Version: {python_version()}")

# adjust pandas display options to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# adjust pandas display options to ensure full display of content
pd.set_option('display.max_colwidth', None)

## Import Dataset

In [None]:
# import data
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project's own pipeline
df = pd.read_pickle("../Data/6Clean_ML_Heart_Orig.pkl")
# feature-list frames: each one names its features in a `column` column
# (later cells read them with `.column.to_list()` or filter them with colstr='column')
df_can = pd.read_pickle("../Data/6Clean_ML_CAN_Orig.pkl")
df_don = pd.read_pickle("../Data/6Clean_ML_DON_Orig.pkl")
df_both = pd.read_pickle("../Data/6Clean_ML_BOTH_Orig.pkl")
df_ordinal = pd.read_pickle("../Data/6Clean_ML_ordinal_Orig.pkl")
df_nominal = pd.read_pickle("../Data/6Clean_ML_nominal_Orig.pkl")
df_numeric = pd.read_pickle("../Data/6Clean_ML_numeric_Orig.pkl")
df_label = pd.read_pickle("../Data/6Clean_ML_label_Orig.pkl")
# print shape
print(f"Heart Dataset Rows: {df.shape[0]:,} & Columns: {df.shape[1]:,}")
# for the feature-list frames the row count (shape[0]) is reported as the number of features
print(f"Candidate Features: {df_can.shape[0]:,}")
print(f"Donor Features: {df_don.shape[0]:,}")
print(f"Both Features: {df_both.shape[0]:,}")
print(f"Ordinal Features: {df_ordinal.shape[0]:,}")
print(f"Nominal Features: {df_nominal.shape[0]:,}")
print(f"Numeric Features: {df_numeric.shape[0]:,}")
print(f"Label Features: {df_label.shape[0]:,}")

### Sanity Check

In [None]:
# gather every categorized feature name (ordinal + nominal + numeric)
allCat = [
    *df_ordinal.column.to_list(),
    *df_nominal.column.to_list(),
    *df_numeric.column.to_list(),
]

# sanity check: compare the categorized feature names against the dataset's columns
uf.symmetric_difference(set(allCat), set(df.columns.to_list()))

## Remove Unwanted Features

### User Function(s)

In [None]:
def HouseKeeping(data, removeColLst, dataOrdinal, dataNominal, dataCan, dataDon, dataBoth, dataNumeric, display=True):
    """
    Remove unwanted features from the dataset and from every feature-list DataFrame.

    Parameters
    ----------
    data : DataFrame holding the dataset; the columns named in removeColLst are removed from it.
    removeColLst : list of feature (column) names to remove.
    dataOrdinal, dataNominal, dataCan, dataDon, dataBoth, dataNumeric :
        feature-list DataFrames filtered on their 'column' field.
    display : forwarded to the helper functions (presumably toggles their printed report).

    Returns
    -------
    tuple, in this order:
        (data, dataOrdinal, dataNominal, dataCan, dataDon, dataBoth, dataNumeric)
    """
    # remove DataFrame data (house keeping); `string` is only the label handed to the helper
    dataOrdinal = uf.remove_row_using_mask(dataOrdinal, removeColLst, colstr='column', string='df_ordinal', display=display)
    dataNominal = uf.remove_row_using_mask(dataNominal, removeColLst, colstr='column', string='df_nominal', display=display)
    dataNumeric = uf.remove_row_using_mask(dataNumeric, removeColLst, colstr='column', string='df_numeric', display=display)
    dataCan = uf.remove_row_using_mask(dataCan, removeColLst, colstr='column', string='df_can', display=display)
    dataDon = uf.remove_row_using_mask(dataDon, removeColLst, colstr='column', string='df_don', display=display)
    # fix: filter dataBoth itself; passing dataNominal here replaced the "both" list with the nominal list
    dataBoth = uf.remove_row_using_mask(dataBoth, removeColLst, colstr='column', string='df_both', display=display)

    # remove features
    data = uf.remove_column(data, removeColLst, display=display)

    return data, dataOrdinal, dataNominal, dataCan, dataDon, dataBoth, dataNumeric


def find_feature(string, column_list):
    """
    Return the entries of column_list that contain `string` as a substring.

    The original order is preserved; an empty list is returned when nothing matches.
    """
    return [name for name in column_list if string in name]

### Copy DataFrame 
- `df_copy`

In [None]:
# keep original DataFrame
df_copy = df.copy()

## Label Plot: Survival

In [None]:
# remove NaN: with its default arguments dropna() drops every row that has a NaN in ANY column
df = df.dropna()
# NOTE(review): remove_cat_zero_count presumably drops category levels left with zero rows — confirm in UserUtilityFunctions
df = uf.remove_cat_zero_count(df).copy()
# print
print(f"Heart Dataset Rows: {df.shape[0]:,} & Columns: {df.shape[1]:,}")

# plot the counts of the 'Survival' label
uv.plot_count(df, ['Survival'], fig_size=(10, 4))

#### Unwanted Labels

In [None]:
# remove unwanted labels
removeCols = df_label.column.to_list()

## Ordinal Plots & Engineering

### Ordinal: Heatmap

In [None]:
# compute the Spearman (rank) correlation matrix of the ordinal features
corr_matrix = df[df_ordinal.column.to_list()].corr(method='spearman')
# create a mask for the UPPER triangle; np.triu with the default k=0 also marks the diagonal
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  

# display options
plt.figure(figsize=(40,40))
plt.title("Ordinal Correlation Bottom Half Heatmap", fontsize=30)
plt.xticks(fontsize=20)  # font size for x-ticks
plt.yticks(fontsize=20)  # font size for y-ticks
# plot the BOTTOM half: strict lower triangle only (cells where mask is True are not drawn, diagonal included)
sns.heatmap(corr_matrix,  
            mask=mask,  # hide the upper triangle and the diagonal
            annot=True,  
            cmap='coolwarm',  
            vmin=-1, vmax=1,  
            linewidths=0.5, fmt=".2f", cbar=False)  
plt.show()

### Ordinal Display
- The ordinal feature pairs listed below have Spearman correlations that indicate moderately strong to very strong monotonic relationships:
    - Very Strong Associations (|r| ≥ 0.7)
    - Moderate-Strong Associations (0.5 ≤ |r| < 0.7)

In [None]:
# threshold value (absolute correlation cutoff)
threshold = 0.5
# identify the strict upper triangle of the correlation matrix (k=1 excludes the diagonal) so each pair appears once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# find feature columns whose absolute correlation with any preceding column is greater than `threshold`
# (despite the name, nothing is dropped here; the list is only displayed)
to_drop = [column for column in upper.columns if any(np.abs(upper[column]) > threshold)]

# display
print(f"Strong correlation: {sorted(to_drop)}\n")
# ordinal correlation: re-check
print(f"Ordinal Correlation Re-Check with Threshold of {threshold * 100}% or greater:")
usf.correlation_with_threshold(df[df_ordinal.column.to_list()], method='spearman', threshold=threshold)

### Engineer: MismatchLevel

In [None]:
features = uf.get_feature_list(df, 'MismatchLevel')

In [None]:
# ordinal correlation
usf.correlation_with_threshold(df[features], method='spearman', threshold=threshold)

In [None]:
# append to removeCols: the individual MismatchLevel features are queued for removal once their sum exists
removeCols.extend(features)

# combine: row-wise sum of the MismatchLevel features selected in the previous cell
df['MismatchLevel_Addition'] = df[features].sum(axis=1)

# update DataFrame: add the engineered feature name to the ordinal and "both" feature lists
df_ordinal = uf.insert_into_dataframe(df_ordinal, ['MismatchLevel_Addition'])
df_both = uf.insert_into_dataframe(df_both, ['MismatchLevel_Addition'])
# display
uf.category_contingency_survival(df, 'MismatchLevel_Addition')

### Engineer: Inotropes

In [None]:
features = uf.get_feature_list(df, 'Inotropes')

In [None]:
# ordinal correlation
usf.correlation_with_threshold(df[features], method='spearman', threshold=threshold)

In [None]:
# append to removeCols: the individual Inotropes features are queued for removal once their sum exists
removeCols.extend(features)

# combine: row-wise sum of the Inotropes features selected in the previous cell
df['Inotropes_Yes_Count_Addition_CAN'] = df[features].sum(axis=1)

# update DataFrame: add the engineered feature name to the ordinal and candidate feature lists
df_ordinal = uf.insert_into_dataframe(df_ordinal, ['Inotropes_Yes_Count_Addition_CAN'])
df_can = uf.insert_into_dataframe(df_can, ['Inotropes_Yes_Count_Addition_CAN'])
# display
uf.category_contingency_survival(df, 'Inotropes_Yes_Count_Addition_CAN')

### Ordinal Remove

In [None]:
# add to removeCols
removeCols.extend(['PreviousTransplantNumber_CAN','FunctionalStatusTransplant_CAN'])
# remove the queued features from df and from every feature-list frame
df, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric = HouseKeeping(df, removeCols, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric, display=True)
# reset removeCols so the next section starts with an empty queue
removeCols = []

### Ordinal Association Testing

In [None]:
print(f"Ordinal Correlation Re-Check with Threshold of {threshold * 100}% or greater:")
usf.correlation_with_threshold(df[df_ordinal.column.to_list()], method='spearman', threshold=threshold)

In [None]:
mwutDF = usf.mannwhitneyu_combinations(df, df_ordinal.column.to_list())

# display p_value >= 0.05
mwutDF[(mwutDF.p_value >= 0.05)]

##### Associations exist between some ordinal features, but the associated features represent different things.

### Ordinal Datatype Update

In [None]:
# change datatype to category
df = uf.convert_to_category(df, df_ordinal.column.to_list(), ordered=True)

## Nominal Plots & Engineering

### Nominal Chi2 Test & Cramer V

In [None]:
# hypothesis testing
nominalDF = usf.multi_independence_category(df, df_nominal.column.to_list())
# display
nominalDF[(nominalDF.p_value <= 0.05) & (nominalDF.cramer_v > .5)]

In [None]:
# display
features = uf.get_feature_list(df, 'NonHeartBeating_DON|CardiacArrest_DON')

In [None]:
for feature in features:
    print(f"{uf.category_contingency_survival(df, feature).to_string()} \t{feature}")
    print("\n")

### Nominal Remove

In [None]:
# add to removeCols
removeCols.extend(['NonHeartBeating_DON']) # mostly No
#remove features
df, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric = HouseKeeping(df, removeCols, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric, display=True)
# initialize removeCols
removeCols = []

### Nominal Datatype Update

In [None]:
# change datatype to category
df = uf.convert_to_category(df, df_nominal.column.to_list())

### Numeric Plot & Engineering

In [None]:
# correlation
usf.correlation_with_threshold(df[df_numeric.column.to_list()], method='pearson', threshold=0.5)

#### Engineer: HeightCm_Addition & WeightKg_Addition

In [None]:
features = uf.get_feature_list(df, 'HeightCm|WeightKg')

In [None]:
# append to removeCols
removeCols.extend(features)

# combine
# NOTE(review): `features` was selected with the pattern 'HeightCm|WeightKg', so this row-wise sum
# adds height (cm) and weight (kg) columns into one value named 'Total_Mass'. The section heading
# names separate HeightCm_Addition / WeightKg_Addition features — confirm which is intended.
df['Total_Mass'] = df[features].sum(axis=1)

# update DataFrame: add the engineered feature name to the numeric and "both" feature lists
df_numeric = uf.insert_into_dataframe(df_numeric, ['Total_Mass'])
df_both = uf.insert_into_dataframe(df_both, ['Total_Mass'])

#### Numeric Remove

In [None]:
# append to removeCols
removeCols.extend(['DistanceFromDonorHospitaltoTXCenter']) # distance dependent on IschemicTimeHour_DON
#remove features
df, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric = HouseKeeping(df, removeCols, df_ordinal, df_nominal, df_can, df_don, df_both, df_numeric, display=True)
# initialize removeCols
removeCols = []

#### Numeric Heatmap

In [None]:
# compute the Pearson correlation matrix of the numeric features
corr_matrix = df[df_numeric.column.to_list()].corr(method='pearson')
# create a mask for the UPPER triangle; np.triu with the default k=0 also marks the diagonal
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  

# display options
plt.figure(figsize=(40,40))
plt.title("Correlation Bottom Half Heatmap", fontsize=30)
plt.xticks(fontsize=20)  # font size for x-ticks
plt.yticks(fontsize=20)  # font size for y-ticks
# plot the BOTTOM half: strict lower triangle only (cells where mask is True are not drawn, diagonal included)
sns.heatmap(corr_matrix,  
            mask=mask,  # hide the upper triangle and the diagonal
            annot=True,  
            cmap='coolwarm',  
            vmin=-1, vmax=1,  
            linewidths=0.5, fmt=".2f", cbar=False)  
plt.show()

### Dummy Encoding

In [None]:
# shape
print(f"Shape before encoding: {df.shape}")
# encode only the nominal features; drop_first=True drops the first level of each one (k-1 dummies per feature)
df_dummy = pd.get_dummies(df, columns=df_nominal.column.to_list(), drop_first=True)
# print shape
print(f"Shape after encoding: {df_dummy.shape}")
# add columns to list
dummyCols = df_dummy.columns.tolist()
# remove label so dummyCols holds predictors only (list.remove raises ValueError if 'Survival' is absent)
dummyCols.remove('Survival')

## Display GINI Information: Feature Selection

In [None]:
# flag features whose GiniPercent is below the cutoff
percent = 5
holdDF = ufs.compute_entropy_gini_impurity(df_dummy[dummyCols])
removeCols = holdDF.ColumnName[holdDF.GiniPercent < percent].to_list()
print(f"Number of Features to be removed: {len(removeCols)}")
# preview at most 20 rows: capping n avoids the ValueError sample() raises when fewer
# rows exist than requested, and the seed makes the preview reproducible between runs
holdDF.sample(n=min(20, len(holdDF)), random_state=RANDOM_STATE)

In [None]:
# features to be removed
flaggedDF = holdDF[holdDF.ColumnName.isin(removeCols)]
# preview at most 10 of them: capping n avoids the ValueError sample() raises when fewer
# than 10 features were flagged, and the seed makes the preview reproducible
flaggedDF.sample(n=min(10, len(flaggedDF)), random_state=RANDOM_STATE)

## Split Dataset

In [None]:
from sklearn.model_selection import train_test_split

# encoded DataFrame: drop the features flagged by the Gini filter
df_dummy = df_dummy.drop(columns=removeCols)
# print shape
print(f"Encode & Remove Features with GINI: Heart Dataset Rows: {df_dummy.shape[0]:,} & Columns: {df_dummy.shape[1]:,}")

# house keeping: ordinal + numeric feature names that survived the Gini filter (the columns scaled later)
ordinalNumericCols = list((set(df_ordinal.column.to_list()) | set(df_numeric.column.to_list())) - set(removeCols))

# split X & y
X = df_dummy.drop(columns = 'Survival')
y = df_dummy.Survival

# mapping values: 'Dead' becomes the positive class (1), 'Living' the negative class (0)
y = y.map({'Dead': True, 'Living': False}).astype(int)
# print
print(f"Dead: {True} & Living: {False}")

# split the dataset into 80% training and 20% testing; stratify=y keeps the class proportions in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

## Scale

In [None]:
from sklearn.preprocessing import MinMaxScaler

# initialize MinMaxScaler
scale = MinMaxScaler()

# fit on the training split only, so the test split does not influence the scaling range
fitScale = scale.fit(X_train[ordinalNumericCols])

# transform both splits with the scaler fitted on the training split
X_train[ordinalNumericCols] = fitScale.transform(X_train[ordinalNumericCols])
X_test[ordinalNumericCols] = fitScale.transform(X_test[ordinalNumericCols])

In [None]:
# message for dataframe
msg = f"2011-2016: Orginal Features: Grid Search & CV Scoring F1 - Features where Max Gini Impurity < {percent}% removed"

In [None]:
from sklearn.metrics import matthews_corrcoef, make_scorer

# define MCC scorer
# NOTE(review): mcc_scorer is not referenced by any grid search visible in this notebook (they all pass 'f1') — confirm it is still needed
mcc_scorer = make_scorer(matthews_corrcoef)

## Machine Learning

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# function from sklearn.utils.class_weight computes the weights for each class to handle imbalanced datasets
from sklearn.utils.class_weight import compute_class_weight

# the 'balanced' mode adjusts weights inversely proportional to class frequencies in the input data for Random Forest
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# map each class label to its weight; this dict is reused by later cells
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# grid search hyperparameters for the Random Forest Classifier (the class weights are handed to the grid builder)
parameters = parms.rfc_parms(class_weight_dict)

# base constructor arguments: only the seed is fixed here
rfc_param = {'random_state': RANDOM_STATE}

# instantiate Random Forest Classifier
Model = RandomForestClassifier(**rfc_param)

# CV with grid search for Hyper-Parameter tuning
rfc_fit = umf.stratified_grid(Model, parameters, X_train, y_train, seed=RANDOM_STATE, n_jobs=-1, n_split=5, score = 'f1')

# best model
Model = rfc_fit.best_estimator_

In [None]:
print(Model)

#### Train

In [None]:
# initialize variables
algorithm = 'RandomForestClassifier'
desc = f'{algorithm} - {msg}'
model = 'rfc_fit'
data_type = 'Training'

# display
# metric_df=None: presumably starts a new metrics table; later cells pass df_classfication back in
# NOTE: this rebinds the global `threshold`, which earlier cells use as the correlation cutoff (0.5);
# re-running those correlation cells after this one would pick up the value returned here instead
df_classfication, threshold = umf.classification_main(algorithm, model, desc, Model, X_train, y_train, data_type, metric_df=None, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variable
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, Model, X_test, y_test, data_type,  metric_df=df_classfication, train_threshold=None)
df_classfication

#### Feature Importance

In [None]:
df_rfc = umf.plot_feature_importance(Model, pd.DataFrame(X_train, columns=X_train.columns.to_list()),figsize=(20,50))

In [None]:
df_rfc.sort_values(by='Feature Importance Score', ascending=False).head(20)

### LogisticRegression

In [None]:
# import library
from sklearn.linear_model import LogisticRegression

# define the hyperparameter grid to search
parameters = parms.lrc_parms(class_weight_dict)

# Base Model: 'saga' is the scikit-learn solver that supports the 'elasticnet' penalty
# NOTE(review): elasticnet also needs an l1_ratio; presumably the grid from parms.lrc_parms supplies it — confirm
lrc_param = {'max_iter':10000, 'random_state': RANDOM_STATE, 'solver': 'saga', 'penalty': 'elasticnet'}

# instantiate LogisticRegression Classifier
Model = LogisticRegression(**lrc_param)

# CV with grid search for Hyper-Parameter tuning
lrc_fit = umf.stratified_grid(Model, parameters, X_train, y_train, seed=RANDOM_STATE, n_jobs=-1, n_split=5, score='f1')

# best model
Model = lrc_fit.best_estimator_

In [None]:
print(Model)

#### Train

In [None]:
# initialize variables
algorithm = 'LogisticRegression'
desc = f'{algorithm} - {msg}'
model = 'lrc_fit'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, Model, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variable
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, Model, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Feature Importance

In [None]:
df_lrc = umf.logistic_feature_importance(Model, figsize=(30,30), fontsize=6)

In [None]:
df_lrc.sort_values(by='Coefficient', ascending=False).head(20)

### XGBClassifier

#### Explanation of the Gamma Parameter
- In XGBoost, the regularization term that controls the minimum loss reduction required to make a split at a node is known as the "gamma" parameter. This parameter plays a crucial role in the decision-making process of the algorithm during the tree construction phase.
- Definition: The gamma parameter specifies the minimum loss reduction required to make a further partition on a leaf node of the tree. If the reduction in loss from making a split is less than gamma, then that split will not be made.
- Purpose: By setting a higher value for gamma, you can prevent overfitting by making the model more conservative. It effectively controls how complex the model can become by limiting the number of splits. A lower gamma allows more splits and can lead to a more complex model.
- Impact on Model Complexity:
    - High Gamma Value: Results in fewer splits, leading to simpler trees that may underfit the data.
    - Low Gamma Value: Allows more splits, potentially leading to more complex trees that may overfit the training data.

The gamma parameter is essential for controlling the complexity of models built using XGBoost by regulating how much improvement in loss is necessary for splitting nodes. Adjusting this parameter can help balance between bias and variance, thus improving model performance on unseen data.

In [None]:
# import library
from xgboost import XGBClassifier

# calculate the scale_pos_weight (negatives / positives) to adjust for class imbalance
# helps to counteract the imbalance by giving more weight to the minority class (usually the positive class in binary classification).
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# define the hyperparameter grid to search
parameters = parms.xgbc_parms(scale_pos_weight)

# Base Model
# NOTE: the original dict contained `early_stopping_rounds=50` written with keyword syntax, which is a
# SyntaxError inside a dict literal. It is left out rather than converted to a key, because early
# stopping needs an eval_set at fit time and none of the fit calls in this notebook
# (stratified_grid(...) or the plain .fit(X_train, y_train) used later) provides one.
xgb_param = {'random_state': RANDOM_STATE, 'tree_method': 'hist'}

# instantiate XGB Classifier
Model = XGBClassifier(**xgb_param)

# CV with grid search for Hyper-Parameter tuning
xgb_fit = umf.stratified_grid(Model, parameters, X_train, y_train, seed=RANDOM_STATE, n_jobs=-1, n_split=5, score='f1')

# best model
Model = xgb_fit.best_estimator_

In [None]:
print(Model)

#### Train

In [None]:
# initialize variables
algorithm = 'XGBClassifier'
desc = f'{algorithm} - {msg}'
model = 'xgb_fit'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, Model, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variable
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, Model, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Feature Importance

In [None]:
df_xgb = umf.plot_feature_importance(Model, pd.DataFrame(X_train, columns=X_train.columns.to_list()),figsize=(20,50))

In [None]:
df_xgb.sort_values(by = 'Feature Importance Score', ascending=False).head(20)

### KNN
- `K-Nearest Neighbors (KNN)`
    - Type: Instance-based learning (lazy learning)
    - Mechanism: Classifies a data point based on the majority class among its k-nearest neighbors.
    - Pros: Simple, intuitive, no training phase.
    - Cons: Computationally expensive during prediction, sensitive to irrelevant features and the choice of k.

In [None]:
# import library
from sklearn.neighbors import KNeighborsClassifier

# hyper-parameter grid explored by the search
parameters = parms.knn_parms()

# baseline KNN estimator
Model = KNeighborsClassifier(n_jobs=-1)

# stratified CV grid search scored on F1
knn_fit = umf.stratified_grid(
    Model,
    parameters,
    X_train,
    y_train,
    seed=RANDOM_STATE,
    n_jobs=-1,
    n_split=5,
    score='f1',
)

# keep the best estimator found by the search
Model = knn_fit.best_estimator_

In [None]:
print(Model)

#### Train

In [None]:
# initialize variables
algorithm = 'KNN'
desc = f'{algorithm} - {msg}'
model = 'knn_fit'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, Model, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variable
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, Model, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

### AdaBoostClassifier

In [None]:
# import library
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# define the parameter grid for GridSearchCV
parameters = parms.ada_parms()

# base estimator: a depth-1 decision tree (decision stump)
estimator = DecisionTreeClassifier(max_depth=1)

# create an AdaBoost model
# NOTE(review): the `algorithm` argument is deprecated in recent scikit-learn releases — confirm against the installed version
Model = AdaBoostClassifier(estimator=estimator, algorithm='SAMME', random_state=RANDOM_STATE)

# CV with grid search for Hyper-Parameter tuning
ada_fit = umf.stratified_grid(Model, parameters, X_train, y_train, seed=RANDOM_STATE, n_jobs=-1, n_split=5, score='f1')

# best model
Model = ada_fit.best_estimator_

In [None]:
print(Model)

#### Train

In [None]:
# initialize variables
algorithm = 'AdaBoost'
desc = f'{algorithm} - {msg}'
model = 'ada_fit'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, Model, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variable
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, Model, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Feature Importance

In [None]:
df_ada = umf.plot_feature_importance(Model, pd.DataFrame(X_train, columns=X_train.columns.to_list()),figsize=(20,50))

In [None]:
df_ada.sort_values(by = 'Feature Importance Score', ascending=False).head(20)

### Stacking Classifier
- Both AdaBoost and XGBoost are boosting algorithms, but they have key differences:
    - AdaBoost: Focuses on reducing bias by reweighting misclassified samples, works well with weak learners.
        - AdaBoost → Works well with simple decision stumps, but is sensitive to noisy data and outliers.
    - XGBoost: A gradient boosting algorithm that minimizes loss efficiently using second-order derivatives and feature importance techniques.
        - XGBoost → Captures complex patterns, handles missing values, and optimizes performance.
    - Random Forest is a bagging-based algorithm that reduces variance and is more robust to noise.

In [None]:
# import library
from sklearn.ensemble import StackingClassifier

# best models from the grid searches (already fitted by the search's refit step)
AdaModel = ada_fit.best_estimator_
XGBModel = xgb_fit.best_estimator_

# No explicit AdaModel.fit / XGBModel.fit here: StackingClassifier.fit trains its own clones
# of the base estimators, so refitting them beforehand only repeated the training work.

# create meta-model
meta_model = LogisticRegression(random_state=RANDOM_STATE)

# create stacking ensemble: the meta-model is trained on 5-fold cross-validated base predictions
stacking_ensemble = StackingClassifier(
    estimators=[('adaboost', AdaModel), ('xgboost', XGBModel)],
    final_estimator=meta_model,
    cv=5
)

# fit the stacking ensemble (fit returns the fitted ensemble itself)
stack_AdaXgb = stacking_ensemble.fit(X_train, y_train)

#### Train

In [None]:
# initialize variables
algorithm = 'Stacking(AdaBoost & XGBoost)'
desc = f'{algorithm} - {msg}'
model = 'stack_AdaXgb'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, stack_AdaXgb, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variables
# same 'Validation/Test' label as the single-model test cells, so the DataType column has one spelling
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, stack_AdaXgb, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

In [None]:
# best models from the grid searches (already fitted by the search's refit step)
RFCModel = rfc_fit.best_estimator_
XGBModel = xgb_fit.best_estimator_

# No explicit RFCModel.fit / XGBModel.fit here: StackingClassifier.fit trains its own clones
# of the base estimators, so refitting them beforehand only repeated the training work.

# create meta-model
meta_model = LogisticRegression(random_state=RANDOM_STATE)

# create stacking ensemble: the meta-model is trained on 5-fold cross-validated base predictions
stacking_ensemble = StackingClassifier(
    estimators=[('randomforest', RFCModel), ('xgboost', XGBModel)],
    final_estimator=meta_model,
    cv=5
)

# fit the stacking ensemble (fit returns the fitted ensemble itself)
stack_RfcXgb = stacking_ensemble.fit(X_train, y_train)

#### Train

In [None]:
# initialize variables
algorithm = 'Stacking(RandomForest & XGBoost)'
desc = f'{algorithm} - {msg}'
# fix: label the row with this stack's own name (was the copy-pasted 'stack_AdaXgb')
model = 'stack_RfcXgb'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, stack_RfcXgb, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variables
# same 'Validation/Test' label as the single-model test cells, so the DataType column has one spelling
data_type = 'Validation/Test'

# display
# train_threshold=None matches every other Validation/Test cell, so all rows of the metrics table
# are computed the same way and stay comparable.
# NOTE(review): if the training threshold is meant to be applied to the test split, pass it in
# all test cells rather than only in this one.
df_classfication, _ = umf.classification_main(algorithm, model, desc, stack_RfcXgb, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Hyper-Parameter Grid Search

In [None]:
# parameter search: same call as the standalone LogisticRegression grid, which passes the class weights
parameters = parms.lrc_parms(class_weight_dict)

# Base Model
lrc_param = {'max_iter':10000, 'class_weight': class_weight_dict, 'random_state': RANDOM_STATE}

# create meta-model
meta_model = LogisticRegression(**lrc_param)

# CV with grid search for Hyper-Parameter tuning
# NOTE(review): this tunes the meta-model on the raw X_train features, while inside the stack it is
# trained on the base models' predictions — consider tuning `final_estimator__*` on the stack instead.
meta_model_fit = umf.stratified_grid(meta_model, parameters, X_train, y_train, seed=RANDOM_STATE, n_jobs=-1, n_split=5, score='f1')

# best model
meta_model = meta_model_fit.best_estimator_

# create stacking ensemble (RFCModel and XGBModel come from the previous stacking cell)
stacking_ensemble = StackingClassifier(
    estimators=[('randomforest', RFCModel), ('xgboost', XGBModel)],
    final_estimator=meta_model,
    cv=5
)

# fit the stacking ensemble
stack_GS_RfcXgb = stacking_ensemble.fit(X_train, y_train)

In [None]:
print(stack_GS_RfcXgb)

#### Train

In [None]:
# initialize variables
algorithm = 'Stacking(RandomForest & XGBoost)'
desc = f'{algorithm} - {msg}'
model = 'stack_GS_RfcXgb'
data_type = 'Training'

# display
df_classfication, threshold = umf.classification_main(algorithm, model, desc, stack_GS_RfcXgb, X_train, y_train, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

#### Test

In [None]:
# initialize variables
# same 'Validation/Test' label as the single-model test cells, so the DataType column has one spelling
data_type = 'Validation/Test'

# display
df_classfication, _ = umf.classification_main(algorithm, model, desc, stack_GS_RfcXgb, X_test, y_test, data_type, metric_df=df_classfication, train_threshold=None)
df_classfication

## Overall Model Performance

In [None]:
# display
filtered_df = df_classfication[df_classfication.DataType != 'Training']
filtered_df.sort_values(by=['Accuracy', 'ROC_AUC_Score'], ascending=False)

## Save Model to Disk

In [None]:
import joblib

# get the best model from each grid search
XGBmodel = xgb_fit.best_estimator_
LRmodel = lrc_fit.best_estimator_
RFmodel = rfc_fit.best_estimator_
ADAmodel = ada_fit.best_estimator_
KNNmodel = knn_fit.best_estimator_

# save each model to a file; joblib.dump returns the list of written file names, which is printed
print(joblib.dump(XGBmodel, '../Models/XGBmodel_Original_GridSearch.joblib'))
print(joblib.dump(LRmodel, '../Models/LRmodel_Original_GridSearch.joblib'))
print(joblib.dump(RFmodel, '../Models/RFmodel_Original_GridSearch.joblib'))
print(joblib.dump(ADAmodel, '../Models/ADAmodel_Original_GridSearch.joblib'))
print(joblib.dump(KNNmodel, '../Models/KNNmodel_Original_GridSearch.joblib'))

# load the model from the file
# loaded_model = joblib.load('../Models/KNN_Threshold.joblib')

#### Save Files

In [None]:
# write dataset - # X_train, y_train, X_test, y_test files
uf.write_to_file(X_train, 'X_train_GINI_GridCV_Orig',path='../Data/XyData/', format='pkl')
uf.write_to_file(y_train, 'y_train_GINI_GridCV_Orig',path='../Data/XyData/', format='pkl')
uf.write_to_file(X_test, 'X_test_GINI_GridCV_Orig',path='../Data/XyData/', format='pkl')
uf.write_to_file(y_test, 'y_test_GINI_GridCV_Orig',path='../Data/XyData/', format='pkl')