In [None]:
# --- create configuration ---

class Config:
    """Notebook-wide settings and shared data, exposed as class attributes.

    Everything in this body runs once, when the class statement is executed:
    the CSV files are read and the cross-validation splitter is built at that
    point, and subclasses reach the results through attribute lookup.
    """

    # Name of the column to predict.
    target = 'diagnosed_diabetes'

    # Competition files; `id` becomes the index of train and test.
    train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv', index_col='id')
    test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv', index_col='id')
    submission = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')

    # External dataset, appended to train using only the columns train has
    # (target included). ignore_index=True replaces the `id` index of train
    # with a fresh 0..n-1 range.
    orig = pd.read_csv('/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv')
    train = pd.concat([train, orig[train.columns]], axis=0, ignore_index=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    state = 42          # random seed used by the fold splitter
    n_splits = 10       # number of cross-validation folds
    early_stop = 200
    metric = 'roc_auc'
    task_type = "binary"
    task_is_regression = task_type == 'regression'
    if task_is_regression:
        n_classes = None
        # Defined in both branches so `Config.labels` never raises
        # AttributeError when the task is switched to regression.
        labels = None
    else:
        n_classes = train[target].nunique()
        labels = list(train[target].unique())

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=state)

    # Optional preprocessing switches.
    outliers = False
    log_trf = False
    missing = False

In [None]:
# --- EDA ---
## --- Version 1 ---

class EDA(Config):
    """Exploratory report over the train/test frames provided by ``Config``.

    Instantiating the class runs the full report: table summaries, a
    correlation heatmap, train-vs-test distributions of the numeric features,
    top-category bar charts of the categorical features and a plot of the
    target.
    """

    def __init__(self):
        super().__init__()

        # Split the predictors by dtype; the target is excluded from both lists.
        features = self.train.drop(self.target, axis=1)
        self.cat_features = features.select_dtypes(include=['object', 'bool']).columns.tolist()
        self.num_features = features.select_dtypes(exclude=['object', 'bool']).columns.tolist()
        self.data_info()
        self.heatmap()
        self.dist_plots()
        self.cat_feature_plots()
        if self.task_is_regression:
            self.target_plot()
        else:
            self.target_pie()

    @staticmethod
    def _top_counts(frame, col, k=10):
        """Return the ``k`` most frequent values of ``frame[col]`` as a two-column frame."""
        counts = frame[col].value_counts().nlargest(k).reset_index()
        # reset_index() names these columns differently across pandas
        # versions; pin them so the plotting code can rely on [col, 'count'].
        counts.columns = [col, 'count']
        return counts

    def data_info(self):
        """Show head, info, describe and missing-value counts for train and test.

        Returns:
            self, so the call can be chained.
        """
        # The styling is the same for every table, so build it once.
        table_style = [{'selector': 'th:not(.index_name)',
                        'props': [('background-color', '#3cb371'),
                                  ('color', '#FFFFFF'),
                                  ('font-weight', 'bold'),
                                  ('border', '1px solid #DCDCDC'),
                                  ('text-align', 'center')]
                        },
                       {'selector': 'tbody td',
                        'props': [('border', '1px solid #DCDCDC'),
                                  ('font-weight', 'normal')]
                        }]

        for data, label in zip([self.train, self.test], ['Train', 'Test']):
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} head\n')
            display(data.head().style.set_table_styles(table_style))

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} info\n'+Style.RESET_ALL)
            # DataFrame.info() prints its report and returns None, so it is
            # called directly instead of being passed to display().
            data.info()

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} describe\n')
            display(data.describe().drop(index='count', columns=self.target, errors = 'ignore').T
                    .style.set_table_styles(table_style).format('{:.3f}'))

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} missing values\n'+Style.RESET_ALL)
            display(data.isna().sum())
        return self

    def heatmap(self):
        """Plot the Pearson correlation matrix of the numeric features and the target."""
        print(Style.BRIGHT+Fore.GREEN+f'\nCorrelation Heatmap\n')
        plt.figure(figsize=(10, 10))
        corr = self.train[self.num_features+[self.target]].corr(method='pearson')
        sns.heatmap(corr, fmt = '0.2f', cmap = 'Greens', square=True, annot=True, linewidths=1, cbar=False)
        plt.show()

    def dist_plots(self):
        """Plot a KDE and a box plot per numeric feature, comparing train and test."""
        print(Style.BRIGHT+Fore.GREEN+f"\nDistribution analysis\n")
        n_rows = len(self.num_features)
        if n_rows == 0:
            # plt.subplots cannot build a grid with zero rows.
            return

        df = pd.concat([self.train[self.num_features].assign(Source = 'Train'),
                        self.test[self.num_features].assign(Source = 'Test'),],
                        axis=0, ignore_index = True)

        # squeeze=False keeps `axes` two-dimensional even for a single feature,
        # so axes[i, 0] below is always valid.
        fig, axes = plt.subplots(n_rows, 2, figsize = (18, n_rows * 6), squeeze = False,
                                 gridspec_kw = {'hspace': 0.3,
                                                'wspace': 0.2,
                                                'width_ratios': [0.70, 0.30]
                                               }
                                )
        for i, col in enumerate(self.num_features):
            ax = axes[i, 0]
            sns.kdeplot(data = df[[col, 'Source']], x = col, hue = 'Source',
                        palette = ['#3cb371', 'r'], ax = ax, linewidth = 2
                       )
            ax.set(xlabel = '', ylabel = '')
            ax.set_title(f"\n{col}")
            ax.grid()

            ax = axes[i, 1]
            sns.boxplot(data = df, y = col, x=df.Source, width = 0.5,
                        linewidth = 1, fliersize= 1,
                        ax = ax, palette=['#3cb371', 'r']
                       )
            ax.set_title(f"\n{col}")
            ax.set(xlabel = '', ylabel = '')
            ax.tick_params(axis='both', which='major')
            ax.set_xticklabels(['Train', 'Test'])

        plt.tight_layout()
        plt.show()

    def cat_feature_plots(self):
        """Plot the ten most frequent values of each categorical feature.

        The left column shows train, the right column shows test.
        """
        n_rows = len(self.cat_features)
        if n_rows == 0:
            # Nothing to draw; also avoids requesting a zero-height figure.
            return

        fig, axes = plt.subplots(n_rows, 2, figsize = (18, n_rows * 6), squeeze = False,
                                 gridspec_kw = {'hspace': 0.5,
                                                'wspace': 0.2,
                                               }
                                )
        # One panel per data source. The right-hand panel reads from
        # self.test; previously it re-plotted the train counts under a
        # "Test" title.
        panels = [(self.train, 'Train', '#3cb371'), (self.test, 'Test', 'r')]

        for i, col in enumerate(self.cat_features):
            for j, (frame, label, color) in enumerate(panels):
                ax = axes[i, j]
                sns.barplot(data=self._top_counts(frame, col), x=col, y='count', ax=ax, color=color)
                ax.set(xlabel = '', ylabel = '')
                ax.set_title(f"\n{col} {label}")

        plt.tight_layout()
        plt.show()

    def target_pie(self):
        """Plot the class shares of the target as a pie chart."""
        print(Style.BRIGHT+Fore.GREEN+f"\nTarget feature distribution\n")
        counts = self.train[self.target].value_counts()
        plt.figure(figsize=(6, 6))
        plt.pie(counts, labels=counts.index, autopct='%1.2f%%', colors=sns.color_palette('viridis', len(counts)))
        plt.show()

    def target_plot(self):
        """Plot a KDE and a box plot of the target (used for regression tasks)."""
        print(Style.BRIGHT+Fore.GREEN+f"\nTarget feature distribution\n")

        fig, axes = plt.subplots(1, 2 ,figsize = (14, 6),
                                 gridspec_kw = {'hspace': 0.3,
                                                'wspace': 0.2,
                                                'width_ratios': [0.70, 0.30]
                                               }
                                )
        ax = axes[0]
        sns.kdeplot(data = self.train[self.target],
                    color = '#3cb371', ax = ax, linewidth = 2
                   )
        ax.set(xlabel = '', ylabel = '')
        ax.set_title(f"\n{self.target}")
        ax.grid()

        ax = axes[1]
        sns.boxplot(data = self.train, y = self.target, width = 0.5,
                    linewidth = 1, fliersize= 1,
                    ax = ax, color = '#3cb371'
                   )
        ax.set_title(f"\n{self.target}")
        ax.set(xlabel = '', ylabel = '')
        ax.tick_params(axis='both', which='major')

        plt.tight_layout()
        plt.show()



## --- Version 2 ---

# NOTE: `Preprocessing` must already be defined when this class statement runs.
# The bases are listed as (Preprocessing, Config): with `Preprocessing` itself
# deriving from `Config`, the order (Config, Preprocessing) cannot be
# linearised and Python raises TypeError when the class is created.
class EDA(Preprocessing, Config):
    """Exploratory report over the train/test frames provided by the base classes.

    Instantiating the class runs the full report: table summaries, a
    correlation heatmap, train-vs-test distributions of the numeric features,
    top-category bar charts of the categorical features and a pie chart of
    the target.
    """

    def __init__(self):
        super().__init__()

        # Use the feature lists from the base classes when they exist;
        # otherwise derive them from the train dtypes, target excluded.
        if not hasattr(self, 'cat_features') or not hasattr(self, 'num_features'):
            non_numeric = ['object', 'bool', 'category']
            features = self.train.drop(columns=self._target_columns(), errors='ignore')
            self.cat_features = features.select_dtypes(include=non_numeric).columns.tolist()
            self.num_features = features.select_dtypes(exclude=non_numeric).columns.tolist()

        self.data_info()
        self.heatmap()
        self.dist_plots()
        self.cat_feature_plots()
        self.target_pie()

    def _target_columns(self):
        """Return ``self.targets`` when a base class defines it, else ``self.target``."""
        return getattr(self, 'targets', self.target)

    @staticmethod
    def _top_counts(frame, col, k=10):
        """Return the ``k`` most frequent values of ``frame[col]`` as a two-column frame."""
        counts = frame[col].value_counts().nlargest(k).reset_index()
        # reset_index() names these columns differently across pandas
        # versions; pin them so the plotting code can rely on [col, 'count'].
        counts.columns = [col, 'count']
        return counts

    def data_info(self):
        """Show head, info, describe and missing-value counts for train and test.

        Returns:
            self, so the call can be chained.
        """
        # The styling is the same for every table, so build it once.
        table_style = [{'selector': 'th:not(.index_name)',
                        'props': [('background-color', '#3cb371'),
                                  ('color', '#FFFFFF'),
                                  ('font-weight', 'bold'),
                                  ('border', '1px solid #DCDCDC'),
                                  ('text-align', 'center')]
                        },
                       {'selector': 'tbody td',
                        'props': [('border', '1px solid #DCDCDC'),
                                  ('font-weight', 'normal')]
                        }]

        for data, label in zip([self.train, self.test], ['Train', 'Test']):
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} head\n')
            display(data.head().style.set_table_styles(table_style))

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} info\n'+Style.RESET_ALL)
            # DataFrame.info() prints its report and returns None, so it is
            # called directly instead of being passed to display().
            data.info()

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} describe\n')
            display(data.describe().drop(index='count', columns=self._target_columns(), errors = 'ignore').T
                    .style.set_table_styles(table_style).format('{:.3f}'))

            print(Style.BRIGHT+Fore.GREEN+f'\n{label} missing values\n'+Style.RESET_ALL)
            display(data.isna().sum())
        return self

    def heatmap(self):
        """Plot the Pearson correlation matrix of every non-object column of train."""
        print(Style.BRIGHT+Fore.GREEN+f'\nCorrelation Heatmap\n')
        plt.figure(figsize=(7,7))
        corr = self.train.select_dtypes(exclude='object').corr(method='pearson')
        sns.heatmap(corr, fmt = '0.2f', cmap = 'Greens', annot=True, cbar=False)
        plt.show()

    def dist_plots(self):
        """Plot a KDE and a box plot per numeric feature, comparing train and test."""
        print(Style.BRIGHT+Fore.GREEN+f"\nDistribution analysis\n")
        n_rows = len(self.num_features)
        if n_rows == 0:
            # plt.subplots cannot build a grid with zero rows.
            return

        df = pd.concat([self.train[self.num_features].assign(Source = 'Train'),
                        self.test[self.num_features].assign(Source = 'Test'),],
                        axis=0, ignore_index = True)

        # squeeze=False keeps `axes` two-dimensional even for a single feature,
        # so axes[i, 0] below is always valid.
        fig, axes = plt.subplots(n_rows, 2, figsize = (18, n_rows * 6), squeeze = False,
                                 gridspec_kw = {'hspace': 0.3,
                                                'wspace': 0.2,
                                                'width_ratios': [0.70, 0.30]
                                               }
                                )
        for i, col in enumerate(self.num_features):
            ax = axes[i, 0]
            sns.kdeplot(data = df[[col, 'Source']], x = col, hue = 'Source',
                        palette = ['#3cb371', 'r'], ax = ax, linewidth = 2
                       )
            ax.set(xlabel = '', ylabel = '')
            ax.set_title(f"\n{col}")
            ax.grid()

            ax = axes[i, 1]
            sns.boxplot(data = df, y = col, x=df.Source, width = 0.5,
                        linewidth = 1, fliersize= 1,
                        ax = ax, palette=['#3cb371', 'r']
                       )
            ax.set_title(f"\n{col}")
            ax.set(xlabel = '', ylabel = '')
            ax.tick_params(axis='both', which='major')
            ax.set_xticklabels(['Train', 'Test'])

        plt.tight_layout()
        plt.show()

    def cat_feature_plots(self):
        """Plot the ten most frequent values of each categorical feature.

        The left column shows train, the right column shows test.
        """
        n_rows = len(self.cat_features)
        if n_rows == 0:
            # plt.subplots cannot build a grid with zero rows.
            return

        fig, axes = plt.subplots(n_rows, 2, figsize = (18, n_rows * 6), squeeze = False,
                                 gridspec_kw = {'hspace': 0.5,
                                                'wspace': 0.2,
                                               }
                                )
        # One panel per data source. The right-hand panel reads from
        # self.test; previously it re-plotted the train counts under a
        # "Test" title.
        panels = [(self.train, 'Train', '#3cb371'), (self.test, 'Test', 'r')]

        for i, col in enumerate(self.cat_features):
            for j, (frame, label, color) in enumerate(panels):
                ax = axes[i, j]
                sns.barplot(data=self._top_counts(frame, col), x=col, y='count', ax=ax, color=color)
                ax.set(xlabel = '', ylabel = '')
                ax.set_title(f"\n{col} {label}")

        plt.tight_layout()
        plt.show()

    def target_pie(self):
        """Plot the value shares of the target column(s) as a pie chart."""
        print(Style.BRIGHT+Fore.GREEN+f"\nTarget feature distribution\n")
        counts = self.train[self._target_columns()].value_counts()
        plt.figure(figsize=(6, 6))
        plt.pie(counts, labels=counts.index, autopct='%1.2f%%', colors=sns.color_palette('viridis', len(counts)))
        plt.show()



In [None]:
# --- Version 3 ---

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from colorama import Style, Fore, init
from IPython.display import display

# Initialise colorama with autoreset so styling is reset after each print
# instead of carrying over to later output.
init(autoreset=True)


# NOTE: `Preprocessing` must already be defined when this class statement runs.
# The bases are listed as (Preprocessing, Config): with `Preprocessing` itself
# deriving from `Config`, the order (Config, Preprocessing) cannot be
# linearised and Python raises TypeError when the class is created.
class EDA(Preprocessing, Config):
    """Exploratory report over a train/test pair, with optional overrides.

    Instantiating the class runs the full report: table summaries, a
    correlation heatmap, train-vs-test distributions of the numeric features,
    top-category bar charts of the categorical features and one plot per
    target column.
    """

    def __init__(self, train=None, test=None, target=None, targets=None, task_is_regression=None):
        """Resolve the inputs, derive the feature lists and run the report.

        Args:
            train: Training DataFrame; falls back to the attribute inherited
                from the base classes when None.
            test: Test DataFrame; falls back to the inherited attribute, and
                to an empty frame with the train columns if that is missing.
            target: Name of a single target column.
            targets: One or more target column names; preferred over `target`.
            task_is_regression: When truthy, targets are drawn with
                `target_plot`; otherwise with `target_pie`.

        Raises:
            ValueError: If no train DataFrame is available.
        """
        super().__init__()

        # Allow overriding attributes passed to ctor; otherwise rely on base classes
        if train is not None:
            self.train = train
        if test is not None:
            self.test = test
        if target is not None:
            self.target = target
        if targets is not None:
            self.targets = targets
        if task_is_regression is not None:
            self.task_is_regression = task_is_regression

        # Validate presence
        if not hasattr(self, 'train') or self.train is None:
            raise ValueError("`train` DataFrame is required (either pass it or provide in Config).")
        if not hasattr(self, 'test') or self.test is None:
            # test is optional, but set to empty frame for safe concat
            self.test = pd.DataFrame(columns=self.train.columns)

        # Normalize targets: prefer `targets` (list-like), fallback to `target` (single)
        if hasattr(self, 'targets') and self.targets:
            self.target_list = list(self.targets) if isinstance(self.targets, (list, tuple)) else [self.targets]
        elif hasattr(self, 'target') and self.target:
            self.target_list = [self.target]
        else:
            self.target_list = []

        # Determine categorical and numerical features excluding target columns
        drop_cols = self.target_list if self.target_list else []
        self.cat_features = self.train.drop(columns=drop_cols, errors='ignore') \
                                     .select_dtypes(include=['object', 'bool', 'category']).columns.tolist()
        self.num_features = self.train.drop(columns=drop_cols, errors='ignore') \
                                     .select_dtypes(include=[np.number]).columns.tolist()

        # Run EDA steps
        self.data_info()
        self.heatmap()
        self.dist_plots()
        self.cat_feature_plots()
        # Target visualizations (handle multi-target)
        if len(self.target_list) == 0:
            print(Style.BRIGHT + Fore.YELLOW + "\nNo target provided; skipping target plots.")
        else:
            for t in self.target_list:
                if getattr(self, 'task_is_regression', None):
                    self.target_plot(t)
                else:
                    self.target_pie(t)

    def data_info(self):
        """Show head, info, describe and missing-value counts for train and test.

        Returns:
            self, so the call can be chained.
        """
        for data, label in zip([self.train, self.test], ['Train', 'Test']):
            table_style = [
                {'selector': 'th:not(.index_name)',
                 'props': [('background-color', '#3cb371'),
                           ('color', '#FFFFFF'),
                           ('font-weight', 'bold'),
                           ('border', '1px solid #DCDCDC'),
                           ('text-align', 'center')]},
                {'selector': 'tbody td',
                 'props': [('border', '1px solid #DCDCDC'),
                           ('font-weight', 'normal')]}
            ]

            print(Style.BRIGHT + Fore.GREEN + f'\n{label} head\n')
            display(data.head().style.set_table_styles(table_style))

            print(Style.BRIGHT + Fore.GREEN + f'\n{label} info\n' + Style.RESET_ALL)
            # DataFrame.info() prints its report and returns None, so it is
            # called directly instead of being passed to display().
            data.info()

            print(Style.BRIGHT + Fore.GREEN + f'\n{label} describe\n')
            # drop counts row and target columns from describe view for clarity
            desc = data.describe().drop(index=['count'], errors='ignore')
            if self.target_list:
                desc = desc.drop(columns=self.target_list, errors='ignore')
            display(desc.T.style.set_table_styles(table_style).format('{:.3f}'))

            print(Style.BRIGHT + Fore.GREEN + f'\n{label} missing values\n' + Style.RESET_ALL)
            display(data.isna().sum())

        return self

    def heatmap(self):
        """Plot the Pearson correlation matrix of the numeric train columns.

        Returns:
            self, so the call can be chained.
        """
        print(Style.BRIGHT + Fore.GREEN + f'\nCorrelation Heatmap\n')

        # Select numeric columns (including numeric targets if present)
        numeric_cols = self.train.select_dtypes(include=[np.number]).columns.tolist()
        if not numeric_cols:
            print(Style.BRIGHT + Fore.YELLOW + "No numeric columns available for correlation heatmap.")
            return self

        corr = self.train[numeric_cols].corr(method='pearson')

        plt.figure(figsize=(8, 8))
        sns.heatmap(corr, fmt='.2f', cmap='Greens', annot=True, cbar=False, square=True, linewidths=0.5)
        plt.title('Correlation Heatmap')
        plt.show()
        return self

    def dist_plots(self):
        """Plot a KDE and a box plot per numeric feature, comparing train and test.

        Returns:
            self, so the call can be chained.
        """
        print(Style.BRIGHT + Fore.GREEN + f"\nDistribution analysis\n")

        if len(self.num_features) == 0:
            print(Style.BRIGHT + Fore.YELLOW + "No numerical features to plot distributions for.")
            return self

        # Prepare combined df with Source marker; ensure columns exist in both frames
        train_num = self.train[self.num_features].copy()
        train_num['Source'] = 'Train'
        test_num = self.test.reindex(columns=self.num_features).copy()
        test_num['Source'] = 'Test'
        df = pd.concat([train_num, test_num], axis=0, ignore_index=True)

        n = len(self.num_features)
        fig, axes = plt.subplots(n, 2, figsize=(18, max(6 * n, 6)),
                                 gridspec_kw={'hspace': 0.3, 'wspace': 0.2, 'width_ratios': [0.7, 0.3]})

        # Normalise axes shape to (n,2)
        if n == 1:
            axes = np.atleast_2d(axes)

        for i, col in enumerate(self.num_features):
            ax_kde = axes[i, 0]
            ax_box = axes[i, 1]

            try:
                sns.kdeplot(data=df, x=col, hue='Source', palette=['#3cb371', 'r'], ax=ax_kde, linewidth=2)
            except Exception:
                # fallback to hist if kde fails (e.g., many identical values)
                sns.histplot(data=df, x=col, hue='Source', palette=['#3cb371', 'r'], ax=ax_kde, element='step', stat='density')

            ax_kde.set(xlabel='', ylabel='')
            ax_kde.set_title(f"\n{col}")
            ax_kde.grid()

            # Boxplot for Train vs Test
            sns.boxplot(data=df, y=col, x='Source', width=0.5, linewidth=1, fliersize=1, ax=ax_box,
                        palette=['#3cb371', 'r'])
            ax_box.set_title(f"\n{col}")
            ax_box.set(xlabel='', ylabel='')
            ax_box.tick_params(axis='both', which='major')
            ax_box.set_xticklabels(['Train', 'Test'])

        plt.tight_layout()
        plt.show()
        return self

    def cat_feature_plots(self):
        """Plot the ten most frequent values of each categorical feature.

        The left column shows train, the right column shows test.

        Returns:
            self, so the call can be chained.
        """
        if len(self.cat_features) == 0:
            print(Style.BRIGHT + Fore.YELLOW + "No categorical features to plot.")
            return self

        n = len(self.cat_features)
        fig, axes = plt.subplots(n, 2, figsize=(18, max(6 * n, 6)),
                                 gridspec_kw={'hspace': 0.5, 'wspace': 0.2})
        if n == 1:
            axes = np.atleast_2d(axes)

        for i, col in enumerate(self.cat_features):
            # Prepare top-k categories for stable plotting
            train_counts = self.train[col].value_counts().nlargest(10).reset_index()
            train_counts.columns = [col, 'count']
            test_counts = self.test[col].value_counts().nlargest(10).reset_index()
            test_counts.columns = [col, 'count']

            ax_train = axes[i, 0]
            sns.barplot(data=train_counts, x=col, y='count', ax=ax_train, color='#3cb371')
            ax_train.set(xlabel='', ylabel='')
            ax_train.set_title(f"\n{col} Train")
            ax_train.tick_params(axis='x', rotation=45)

            ax_test = axes[i, 1]
            sns.barplot(data=test_counts, x=col, y='count', ax=ax_test, color='r')
            ax_test.set(xlabel='', ylabel='')
            ax_test.set_title(f"\n{col} Test")
            ax_test.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()
        return self

    def target_pie(self, target_col):
        """Plot the value shares of `target_col` in train as a pie chart.

        Args:
            target_col: Name of the target column; a warning is printed and
                nothing is drawn when it is not a train column.

        Returns:
            self, so the call can be chained.
        """
        print(Style.BRIGHT + Fore.GREEN + f"\nTarget `{target_col}` distribution (pie)\n")
        if target_col not in self.train.columns:
            print(Style.BRIGHT + Fore.YELLOW + f"Target `{target_col}` not found in train data.")
            return self

        values = self.train[target_col].value_counts()
        plt.figure(figsize=(6, 6))
        plt.pie(values, labels=values.index.astype(str), autopct='%1.2f%%',
                colors=sns.color_palette('viridis', len(values)))
        plt.title(f"{target_col} distribution")
        plt.show()
        return self

    def target_plot(self, target_col):
        """Plot a KDE and a box plot of `target_col` in train.

        Args:
            target_col: Name of the target column; a warning is printed and
                nothing is drawn when it is not a train column.

        Returns:
            self, so the call can be chained.
        """
        print(Style.BRIGHT + Fore.GREEN + f"\nTarget `{target_col}` distribution (regression)\n")
        if target_col not in self.train.columns:
            print(Style.BRIGHT + Fore.YELLOW + f"Target `{target_col}` not found in train data.")
            return self

        fig, axes = plt.subplots(1, 2, figsize=(14, 6), gridspec_kw={'width_ratios': [0.7, 0.3], 'wspace': 0.2})
        ax_kde = axes[0]
        ax_box = axes[1]

        try:
            sns.kdeplot(data=self.train, x=target_col, color='#3cb371', ax=ax_kde, linewidth=2)
        except Exception:
            # Fall back to a density histogram when the KDE cannot be drawn.
            sns.histplot(data=self.train, x=target_col, color='#3cb371', ax=ax_kde, stat='density', element='step')

        ax_kde.set(xlabel='', ylabel='')
        ax_kde.set_title(f"\n{target_col}")
        ax_kde.grid()

        sns.boxplot(data=self.train, y=target_col, width=0.5, linewidth=1, fliersize=1, ax=ax_box, color='#3cb371')
        ax_box.set_title(f"\n{target_col}")
        ax_box.set(xlabel='', ylabel='')
        ax_box.tick_params(axis='both', which='major')

        plt.tight_layout()
        plt.show()
        return self

In [None]:
# --- Preprocessing ---

## --- Version 1 ---

class Preprocessing(Config):
    """Build the model inputs (X, y, test) from the frames held by ``Config``.

    The entry point is `fit_transform`, which splits off the target, adds
    engineered features to train and test together, and applies the optional
    steps enabled by the `missing`, `outliers` and `log_trf` switches.
    """

    def __init__(self, n_splits=5, random_state=42, smoothing=20):
        """Store the settings and create empty containers.

        Args:
            n_splits: Stored on the instance as `n_splits` (this shadows the
                class-level value inherited from `Config`).
            random_state: Stored on the instance as `random_state`.
            smoothing: Stored on the instance as `smoothing`.
        """
        super().__init__()
        self.global_stats = {}
        self.encodings = {}
        self.freq_encodings = {}
        self.count_encodings = {}
        self.n_splits = n_splits
        self.random_state = random_state
        self.smoothing = smoothing

    def fit_transform(self):
        """Run the preprocessing pipeline.

        Returns:
            Tuple `(X, y, test, cat_features, num_features)`.
        """
        self.prepare_data()
        if self.missing:
            self.missing_values()

        # Engineer features on train and test together so both receive the
        # same columns, then split them back by position.
        n_train = len(self.X)
        combine = pd.concat([self.X, self.test])
        combine = self.feature_engineering(combine)
        self.X = combine.iloc[:n_train].copy()
        self.test = combine.iloc[n_train:].copy()

        # Refresh the feature lists so they include the engineered columns.
        self.num_features = self.test.select_dtypes(exclude=['object', 'bool', 'category']).columns.tolist()
        self.cat_features = self.test.select_dtypes(include=['object', 'bool','category']).columns.tolist()

        if self.outliers:
            self.remove_outliers()
        if self.log_trf:
            self.log_transformation()

        return self.X, self.y, self.test, self.cat_features, self.num_features

    def prepare_data(self):
        """Split train into X and y and build the initial feature lists."""
        self.train_raw = self.train.copy()
        self.y = self.train[self.target]
        self.X = self.train.drop(self.target, axis=1)

        self.num_features = self.X.select_dtypes(exclude=['object', 'bool']).columns.tolist()
        self.cat_features = self.X.select_dtypes(include=['object', 'bool']).columns.tolist()
        # These columns are treated as categorical regardless of their dtype.
        to_cat = ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
        # Skip names that are already listed so no column appears twice.
        self.cat_features = self.cat_features + [col for col in to_cat if col not in self.cat_features]
        self.num_features = [col for col in self.num_features if col not in to_cat]

    def feature_engineering(self, data):
        """Return a copy of `data` with target statistics and frequency encodings added.

        For every feature `c`, the mean and count of the target in `self.orig`,
        grouped by the value of `c`, are attached as `{c}_org_mean` and
        `{c}_org_count`. Each categorical feature also gets `{c}_fe`, the
        relative frequency of its value within `data`. Categorical features
        are finally cast to the `category` dtype.

        Args:
            data: DataFrame containing every column named in `num_features`
                and `cat_features`.

        Returns:
            A new DataFrame. Because the statistics are attached with
            `merge`, the result has a fresh integer index and the index of
            `data` is not preserved.
        """
        df = data.copy()

        # Fallbacks for feature values that never occur in `orig`.
        global_stats = {'mean': self.orig[self.target].mean(), 'count': 0}
        for c in self.num_features + self.cat_features:
            for a in ['mean', 'count']:
                col = f'{c}_org_{a}'
                tmp = (self.orig.groupby(c)[self.target]
                       .agg(a)
                       .rename(col)
                       .reset_index())
                df = df.merge(tmp, on=c, how='left')
                df[col] = df[col].fillna(global_stats[a])

        for c in self.cat_features:
            freqs = df[c].value_counts(normalize=True)
            df[f"{c}_fe"] = df[c].map(freqs)

        df[self.cat_features] = df[self.cat_features].astype('category')
        return df

    def log_transformation(self):
        """Replace y with log1p(y)."""
        self.y = np.log1p(self.y)

    def remove_outliers(self):
        """Drop the rows whose target lies outside 1.5 * IQR of the quartiles.

        X and y are filtered by position and both get a fresh 0..n-1 index,
        so they stay aligned afterwards. Previously only X was re-indexed.
        """
        Q1 = self.y.quantile(0.25)
        Q3 = self.y.quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        # A plain boolean array selects rows by position, independent of
        # whatever index X and y currently carry.
        keep = ((self.y >= lower_limit) & (self.y <= upper_limit)).to_numpy()
        self.X = self.X[keep].reset_index(drop=True)
        self.y = self.y[keep].reset_index(drop=True)

    def missing_values(self):
        """Fill missing categorical values in X and test with the string 'NaN'."""
        self.X[self.cat_features] = self.X[self.cat_features].fillna('NaN')
        self.test[self.cat_features] = self.test[self.cat_features].fillna('NaN')



In [None]:
# --- Model Training ---

class Trainer(Config):
    
    def __init__(self, X, y, test, models, num_features, cat_features, training=True):
        self.X = X
        self.test = test
        self.y = y
        self.models = models
        self.training = training
        self.scores = pd.DataFrame(columns=['Score'], dtype=float)
        self.OOF_preds = pd.DataFrame(dtype=float)
        self.TEST_preds = pd.DataFrame(dtype=float)
        self.num_features = num_features
        self.cat_features = cat_features

    def ScoreMetric(self, y_true, y_pred):
        """Score predictions with the metric named by ``self.metric``.

        Supported names: ``roc_auc``, ``accuracy``, ``f1``, ``precision``,
        ``recall``, ``mae``, ``r2``, ``rmse``, ``rmsle`` and ``mse``. For
        ``roc_auc``, ``f1``, ``precision`` and ``recall`` the multiclass
        variant (one-vs-rest AUC / weighted average) is used when
        ``self.n_classes`` is greater than 2.

        Args:
            y_true: ground-truth targets.
            y_pred: predictions (scores/probabilities or labels, whatever the
                selected metric expects).

        Returns:
            The value returned by the selected scikit-learn metric function.

        Raises:
            ValueError: if ``self.metric`` is not one of the supported names.
        """
        metric = self.metric

        if metric == 'roc_auc':
            if self.n_classes > 2:
                return roc_auc_score(y_true, y_pred, multi_class="ovr")
            return roc_auc_score(y_true, y_pred)

        if metric == 'accuracy':
            return accuracy_score(y_true, y_pred)

        if metric in ('f1', 'precision', 'recall'):
            scorer = {'f1': f1_score, 'precision': precision_score, 'recall': recall_score}[metric]
            if self.n_classes > 2:
                return scorer(y_true, y_pred, average='weighted')
            return scorer(y_true, y_pred)

        if metric == 'mae':
            return mean_absolute_error(y_true, y_pred)

        if metric == 'r2':
            return r2_score(y_true, y_pred)

        if metric in ('rmse', 'rmsle'):
            # NOTE(review): 'rmsle' is scored as a plain RMSE, which equals the
            # RMSLE only if y_true / y_pred are already on a log1p scale --
            # confirm the caller guarantees this.
            return root_mean_squared_error(y_true, y_pred)

        if metric == 'mse':
            # Do not pass ``squared``: the keyword is deprecated/removed in the
            # scikit-learn releases that provide ``root_mean_squared_error``.
            return mean_squared_error(y_true, y_pred)

        # Fail loudly instead of silently returning None for a typo in the config.
        raise ValueError(f"Unsupported metric: {metric!r}")

    def train(self, model, X, y, test, model_name):
        """Cross-validate one model and return its out-of-fold and test predictions.

        For every split produced by ``self.folds`` the model is fitted on the
        training rows, scored on the validation rows with ``ScoreMetric`` and
        used to predict ``test``; the test predictions are averaged over
        ``self.n_splits``. The fitting strategy is selected by substrings of
        ``model_name`` ("LGBM", "NN"/"TabM", "XGB", "CAT", "HGB"/"YDF",
        "Ensemble"); any other name goes through ``FeatureEncoder`` followed by
        a plain ``fit``. Unless the model is the ensemble, the categorical
        features are target-encoded inside each fold.

        Args:
            model: object exposing ``get_params()``; for most branches also
                ``fit`` and ``predict`` / ``predict_proba``.
            X: DataFrame with the training features.
            y: training target (pandas Series or NumPy array).
            test: DataFrame containing at least the columns of ``X``.
            model_name: name used for branch selection, logging and as the
                row label in ``self.scores``.

        Returns:
            Tuple ``(oof_pred, test_pred)`` of 1-D float arrays with one entry
            per row of ``X`` and ``test`` respectively.

        Raises:
            ValueError: if ``self.task_type`` is not supported, or if the
                hard-coded XGBoost row weights do not match ``len(X)``.

        Side effects:
            Prints progress and writes the per-fold scores and their mean into
            ``self.scores``.

        NOTE(review): both prediction buffers are 1-D, so the "multiclass"
        task type (2-D probabilities) presumably cannot be stored here --
        confirm before using it.
        """
        def _rows(target, positions):
            # Fold ids returned by ``split`` are positions, never labels.
            return target.iloc[positions] if hasattr(target, "iloc") else target[positions]

        def _predict(fitted, data):
            if self.task_type == "regression":
                return fitted.predict(data)
            if self.task_type not in ("binary", "multiclass"):
                raise ValueError(f"Unsupported task type: {self.task_type!r}")
            if hasattr(fitted, "predict_proba"):
                proba = fitted.predict_proba(data)
                return proba[:, 1] if self.task_type == "binary" else proba
            # Boosters returned by lightgbm.train / xgb.train have no
            # predict_proba; their predict() is assumed to already return
            # probabilities for the objective in ``params`` -- TODO confirm.
            return fitted.predict(data)

        oof_pred = np.zeros(X.shape[0], dtype=float)
        test_pred = np.zeros(test.shape[0], dtype=float)

        print('='*20)
        print(model_name)
        params = model.get_params()
        features = X.columns.to_list()
        # The meta-model works on predictions only; an empty list has nothing to encode.
        encode_categoricals = model_name != 'Ensemble' and len(self.cat_features) > 0
        w_full = None  # built lazily, only the XGB branch uses row weights
        fold_columns = []

        for n_fold, (train_id, valid_id) in enumerate(self.folds.split(X, y)):
            X_train = X[features].iloc[train_id].copy()
            y_train = _rows(y, train_id)
            X_val = X[features].iloc[valid_id].copy()
            y_val = _rows(y, valid_id)
            X_test = test[features].copy()

            if encode_categoricals:
                te = TargetEncoder(random_state=42, shuffle=True, cv=5, smooth=15)
                X_train[self.cat_features] = te.fit_transform(X_train[self.cat_features], y_train).astype('float32')
                X_val[self.cat_features] = te.transform(X_val[self.cat_features]).astype('float32')
                X_test[self.cat_features] = te.transform(X_test[self.cat_features]).astype('float32')

            print(f'Fold {n_fold+1}')

            # ``fitted`` is the object used for prediction in this fold. ``model``
            # itself is never rebound, so every fold starts from the same estimator.
            if "LGBM" in model_name:
                train_set = lightgbm.Dataset(X_train, label=y_train)
                valid_set = lightgbm.Dataset(X_val, label=y_val)
                fitted = lightgbm.train(
                    params=params,
                    train_set=train_set,
                    valid_sets=[valid_set],
                    num_boost_round=100_000,
                )

            elif any(tag in model_name for tag in ["NN", "TabM"]):
                model.num_features = X_train.select_dtypes(exclude=['category']).columns.tolist()
                model.cat_features = X_train.select_dtypes(include=['category']).columns.tolist()
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
                fitted = model

            elif "XGB" in model_name:
                if w_full is None:
                    # Hard-coded per-row weights in three consecutive blocks of
                    # rows -- presumably two slices of the competition data
                    # followed by the appended original dataset; TODO confirm.
                    w_full = np.array([1]*678260+[16]*21740+[8]*100000)
                    if len(w_full) != X.shape[0]:
                        raise ValueError(
                            f"XGB row weights cover {len(w_full)} rows but X has {X.shape[0]} rows"
                        )
                dtrain = DMatrix(X_train, label=y_train, enable_categorical=True, weight=w_full[train_id])
                X_val = DMatrix(X_val, label=y_val, enable_categorical=True, weight=w_full[valid_id])
                X_test = DMatrix(X_test, enable_categorical=True)
                fitted = xgb.train(
                    params=params,
                    dtrain=dtrain,
                    evals=[(X_val, "valid")],
                    num_boost_round=100_000,
                    early_stopping_rounds=self.early_stop,
                    verbose_eval=False
                )

            elif "CAT" in model_name:
                # NOTE(review): the categorical columns were target-encoded to
                # float32 above but are still declared as cat_features here --
                # confirm that this is what CatBoost should receive.
                train_pool = Pool(X_train, label=y_train, cat_features=self.cat_features)
                X_val = Pool(X_val, label=y_val, cat_features=self.cat_features)
                # Use the fold-encoded test frame so train/valid/test share one encoding.
                X_test = Pool(X_test, cat_features=self.cat_features)
                model.fit(train_pool, eval_set=X_val, verbose=False)
                fitted = model

            elif any(tag in model_name for tag in ["HGB", "YDF"]):
                model.fit(X_train, y_train, X_val=X_val, y_val=y_val)
                fitted = model

            elif "Ensemble" in model_name:
                # A fresh pipeline per fold; wrapping the previous fold's
                # pipeline again would stack one more scaler every fold.
                fitted = Pipeline([
                    ("scaler", StandardScaler(with_mean=True, with_std=True)),
                    ("ridge", model)
                ])
                fitted.fit(X_train, y_train)

            else:
                encoder = FeatureEncoder(num_features=self.num_features, cat_features=self.cat_features)
                encoder.fit(X)
                X_train, X_val, X_test = encoder.transform_fold(X_train, X_val, X_test)

                model.fit(X_train, y_train)
                fitted = model

            y_pred_val = _predict(fitted, X_val)
            test_pred += _predict(fitted, X_test) / self.n_splits

            oof_pred[valid_id] = y_pred_val
            score = self.ScoreMetric(y_val, y_pred_val)
            print(score)
            fold_column = f'Fold {n_fold+1}'
            self.scores.loc[f'{model_name}', fold_column] = score
            fold_columns.append(fold_column)

        # Average the fold columns by name instead of relying on column position.
        self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}', fold_columns].mean()

        return oof_pred, test_pred

    def run(self):
        """Train (or load) every model, optionally stack them, and return test predictions.

        In training mode each model is cross-validated with ``train`` and its
        out-of-fold / test predictions are written to ``<name>_oof.csv`` and
        ``<name>_test.csv`` in the working directory. Otherwise the stored
        predictions are read from ``/kaggle/input/diabet-models/`` and the
        fold scores are recomputed from them.

        With more than one model a linear meta-model (``LinearRegression`` for
        regression, ``LogisticRegression`` otherwise) is cross-validated on
        the out-of-fold predictions under the name ``'Ensemble'``.

        Returns:
            The ``'Ensemble'`` column of ``self.TEST_preds`` when several
            models were given, otherwise the column of the single model.

        Raises:
            ValueError: if ``self.models`` is empty.
        """
        if not self.models:
            raise ValueError("No models to run")

        for model_name, model in tqdm(self.models.items()):

            if self.training:
                X = self.X.copy()
                test = self.test.copy()

                oof_pred, test_pred = self.train(model, X, self.y, test, model_name)
                pd.DataFrame(oof_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_oof.csv', index=False)
                pd.DataFrame(test_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_test.csv', index=False)

            else:
                oof_pred = pd.read_csv(f'/kaggle/input/diabet-models/{model_name}_oof.csv')
                test_pred = pd.read_csv(f'/kaggle/input/diabet-models/{model_name}_test.csv')

                fold_columns = []
                for n_fold, (train_id, valid_id) in enumerate(self.folds.split(oof_pred, self.y)):
                    # Fold ids are positions, so select rows positionally.
                    y_pred_val, y_val = oof_pred.iloc[valid_id], self.y.iloc[valid_id]
                    fold_column = f'Fold {n_fold+1}'
                    self.scores.loc[f'{model_name}', fold_column] = self.ScoreMetric(y_val, y_pred_val)
                    fold_columns.append(fold_column)
                self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}', fold_columns].mean()

            self.OOF_preds[f'{model_name}'] = oof_pred
            self.TEST_preds[f'{model_name}'] = test_pred

        if len(self.models)>1:
            if self.task_is_regression:
                meta_model = LinearRegression()
            else:
                meta_model = LogisticRegression()

            # Use the trainer's own target; a bare ``y`` would silently depend
            # on whatever global of that name exists in the notebook.
            self.OOF_preds["Ensemble"], self.TEST_preds["Ensemble"] = self.train(meta_model, self.OOF_preds, self.y, self.TEST_preds, 'Ensemble')
            self.scores = self.scores.sort_values('Score')
            self.score_bar()
            self.plot_result(self.OOF_preds["Ensemble"])
            return self.TEST_preds["Ensemble"]
        else:
            print(Style.BRIGHT+Fore.GREEN+f'{model_name} score {self.scores.loc[f"{model_name}", "Score"]:.7f}\n')
            self.plot_result(self.OOF_preds[f'{model_name}'])
            return self.TEST_preds[f'{model_name}']
            
    def score_bar(self):
        """Show a horizontal bar chart with the mean score of every model.

        The bar labelled 'Ensemble' is drawn in red, all other bars in green;
        each bar is annotated with its score using six decimals.
        """
        bar_colors = []
        for name in self.scores.Score.index:
            bar_colors.append('r' if name == 'Ensemble' else '#3cb371')

        plt.figure(figsize=(18, 7))
        bars = plt.barh(self.scores.index, self.scores.Score, color=bar_colors, height=0.8)
        plt.bar_label(bars, fmt='%.6f')
        plt.ylabel('Models')
        plt.xlabel('Score')
        plt.show()
        
    def plot_result(self, oof):
        """Plot diagnostics of the out-of-fold predictions ``oof`` against ``self.y``.

        Regression: an actual-vs-predicted scatter plot and a residual plot,
        both coloured by absolute error.
        Classification: ROC curves for every column of ``self.OOF_preds`` and
        a confusion matrix of ``oof`` thresholded at 0.5.

        Args:
            oof: out-of-fold predictions, one per entry of ``self.y``.
        """
        # Always compare against the trainer's own target rather than a
        # notebook-level global that happens to be called ``y``.
        y_true = self.y

        if self.task_is_regression:
            cmap = LinearSegmentedColormap.from_list("red2green", ["#3cb371", "r"], N=10)
            fig, axes = plt.subplots(1, 2, figsize=(14, 6))

            errors = np.abs(y_true - oof)
            axes[0].scatter(y_true, oof, c=errors, cmap=cmap, alpha=0.5, s=5)
            # Diagonal = perfect prediction.
            axes[0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', lw=2)
            axes[0].set_xlabel('Actual')
            axes[0].set_ylabel('Predicted')
            axes[0].set_title('Actual vs. Predicted')

            residuals = y_true - oof
            axes[1].scatter(oof, residuals, c=errors, cmap=cmap, alpha=0.5, s=5)
            axes[1].axhline(y=0, color='black', linestyle='--', lw=2)
            axes[1].set_xlabel('Predicted Values')
            axes[1].set_ylabel('Residuals')
            axes[1].set_title('Residual Plot')

            plt.tight_layout()
            plt.show()
        else:
            fig, axes = plt.subplots(1, 2, figsize=(14, 7))

            for col in self.OOF_preds:
                RocCurveDisplay.from_predictions(y_true, self.OOF_preds[col], name=f"{col}", ax=axes[0])
            # Diagonal = random classifier.
            axes[0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
            axes[0].set_xlabel('False Positive Rate')
            axes[0].set_ylabel('True Positive Rate')
            axes[0].set_title('ROC')
            axes[0].legend(loc="lower right")

            ConfusionMatrixDisplay.from_predictions(y_true, (oof>=0.5).astype(int), display_labels=self.labels, colorbar=False, ax=axes[1], cmap = 'Greens')
            axes[1].set_title('Confusion Matrix')

            plt.tight_layout()
            plt.show()