In [None]:
import os
import re
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy import linalg
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
from sklearn.covariance import MinCovDet
from pandas.api.types import is_numeric_dtype, is_bool_dtype
from matplotlib.colors import TwoSlopeNorm, LogNorm
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

In [None]:
def showMissing(df):
    """ Summarise the columns of df that contain missing values.

        Returns a DataFrame indexed by column name, limited to columns
        with at least one null and ordered by descending null count:
          - 'TotalNA': number of nulls in the column
          - 'PropNA':  nulls as a fraction of len(df)
    """
    naCounts = df.isnull().sum().sort_values(ascending=False)
    summary = naCounts[naCounts > 0].to_frame(name='TotalNA')
    summary['PropNA'] = summary['TotalNA'] / len(df)
    return summary


def computeCorrelation(df, p=0.05):
    """ Compute pairwise correlation, p-value and pair counts.

        Only numeric and boolean columns of df are used; other columns
        (strings, categories, ...) are ignored so that DataFrame.corr does
        not fail trying to convert them to float.

        Parameters
        ----------
        df : pandas.DataFrame
        p : float, significance threshold applied to the p-values.

        Returns
        -------
        Long-format DataFrame with one row per ordered pair of distinct
        features and columns 'feature1', 'feature2', 'R' (Kendall
        correlation), 'p' (p-value from kendalltaur_pval), 'n' (valid pair
        count from countPair) and 'significant' (p-value < p).
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    # One square matrix per statistic, each stacked to long format
    methods = {'R': 'kendall', 'p': kendalltaur_pval, 'n': countPair}
    stacked = [numeric.corr(method=method).stack()
               for method in methods.values()]
    correlations = pd.concat(stacked, axis=1, keys=list(methods))
    # Name the index levels explicitly rather than relying on the default
    # 'level_0'/'level_1' labels, which change if df.columns is named
    correlations.index.names = ['feature1', 'feature2']
    correlations = correlations.reset_index()
    correlations['significant'] = correlations['p'] < p
    # Drop the diagonal (a feature paired with itself)
    correlations = correlations[correlations['feature1'] != correlations['feature2']]
    return correlations


def kendalltaur_pval(x,y):
    """ Return the p-value of scipy's kendalltau for x and y, or NaN
        when kendalltau rejects the inputs with a ValueError. """
    try:
        result = kendalltau(x, y)
    except ValueError:
        return np.nan
    return result[1]


def countPair(x, y):
    """ Return count of valid pairs (both not nan).

        x and y are equal-length numeric arrays; a position counts when
        neither x nor y is NaN there. Returns a plain int.
    """
    # Element-wise mask instead of intersecting index arrays
    bothValid = ~np.isnan(x) & ~np.isnan(y)
    return int(np.count_nonzero(bothValid))


def plotTargetCorrelation(correlations, feature, out=None):
    """ Plot the correlations of every other feature against `feature`.

        Takes the long-format table from computeCorrelation(), keeps the
        rows whose 'feature1' equals `feature`, orders them by ascending
        p-value and draws two side-by-side heatmaps: R (left) and p on a
        log colour scale (right). Saves to `out` when it is given and
        returns (fig, (ax1, ax2)).
    """
    isTarget = correlations['feature1'] == feature
    targetCorr = (
        correlations.loc[isTarget]
        .set_index('feature2')
        .sort_values(by=['p'], ascending=True))
    fig, (ax1, ax2) = plt.subplots(1, 2)
    # Exclude any row pairing the feature with itself
    notSelf = targetCorr.index != targetCorr['feature1']
    targetCorr = targetCorr.loc[notSelf]

    # Left panel: correlation coefficient, diverging scale centred on 0
    rNorm = TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
    sns.heatmap(targetCorr['R'].to_frame(), yticklabels=1, cmap='bwr',
                norm=rNorm, ax=ax1)
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    ax1.tick_params(left=True)

    # Right panel: p-value on a logarithmic colour scale
    sns.heatmap(targetCorr['p'].to_frame(), yticklabels=1,
                cmap='viridis_r', norm=LogNorm(vmax=1), ax=ax2)
    ax2.tick_params(left=True)
    ax2.set_ylabel('')

    fig.tight_layout()
    if out is not None:
        fig.savefig(out)
    return fig, (ax1, ax2)


def plotPairwiseCorrelation(correlations, out=None):
    """ Draw the feature-by-feature heatmap of R from the long-format
        table returned by computeCorrelation().

        Cells without a value show the light grey axes background.
        Saves to `out` when it is given and returns (fig, ax).
    """
    wideCorr = correlations.pivot(
        index='feature2', columns='feature1', values='R')
    fig, ax = plt.subplots()
    divergingNorm = TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
    sns.heatmap(wideCorr, ax=ax, cmap='bwr', norm=divergingNorm,
                square=True, yticklabels=1)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_facecolor('lightgrey')
    ax.tick_params(left=True)
    fig.tight_layout()
    if out is not None:
        fig.savefig(out)
    return fig, ax


In [None]:
# Notebook-wide plotting defaults
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(12, 8))
# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

In [None]:
# Input CSV paths (relative to the working directory) and key column names
train = 'train.csv'
test = 'test.csv'
index = 'PassengerId'   # column used as the DataFrame index
target = 'Survived'     # column popped off as the prediction target

In [None]:
# Column dtypes applied when reading the CSV files
dtypes = dict(
    Survived=bool,
    Pclass=int,
    Name=str,
    Sex='category',
    Age=float,
    SibSp=int,
    Parch=int,
    Ticket='category',
    Fare=float,
    Cabin='category',
    Embarked='category',
)
data = pd.read_csv(train, index_col=index, dtype=dtypes)

In [None]:
# Long-format table of pairwise Kendall correlation, p-value and pair count
allCorrelations = computeCorrelation(data)

In [None]:
# Heatmap of R for every feature pair
fig, ax = plotPairwiseCorrelation(allCorrelations)

In [None]:
# Columns with missing values; the bare last expression gives the rich
# DataFrame display instead of print()'s plain-text dump
missingVals = showMissing(data)
missingVals

In [None]:
# Correlation (R) and p-value of every other feature against Age
plotTargetCorrelation(allCorrelations, 'Age')

In [None]:
# Kruskal-Wallis H-test of Age across the levels of each non-float/int column
reference = 'Age'
validFeatures = data.select_dtypes(exclude=['float', 'int']).columns
for feature in validFeatures:
    if feature == reference:
        continue
    samples = (group[reference].dropna().values
               for _, group in data.groupby(feature, observed=True))
    # A level with no recorded Age yields an empty sample, and a single empty
    # sample makes kruskal return NaN for the whole feature, so drop them
    grouping = [sample for sample in samples if len(sample) > 0]
    if len(grouping) < 2:
        # The test needs at least two samples to compare
        continue
    H, p = stats.kruskal(*grouping)
    if not np.isnan(H):
        print(feature, H, p)

### Exploring Parch vs Survived
  - Parch feature indicates number of parents/children.

In [None]:
# Density histogram of Parch, split by survival
sns.histplot(x='Parch', hue='Survived', stat='density', data=data)

In [None]:
# Mean survival rate by age band (5 equal-width bins) and family size
temp = data.assign(
    AgeGroup=pd.cut(data['Age'], 5),
    FamSize=data['Parch'] + data['SibSp'])
temp = (
    temp.groupby(['AgeGroup', 'FamSize'])['Survived']
    .mean()
    .reset_index()
    .pivot(index='AgeGroup', columns='FamSize', values='Survived'))
sns.heatmap(temp, cmap='Reds_r')

In [None]:
# Title is the text between the first ',' and the following '.' of Name.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
data['Title'] = data['Name'].apply(lambda x: re.split(r',|\.', x)[1].strip())
data['Girl'] = (data['Title'] == 'Miss') & (data['Parch'] > 0)
def estimateAgeGroup(X):
    """ Classify one passenger row as 'girl', 'boy', 'man' or 'woman'
        from its Title, Parch and Sex values. """
    # Row values are scalars, so use boolean `and` rather than bitwise `&`
    if X['Title'] == 'Miss' and X['Parch'] > 0:
        return 'girl'
    elif X['Title'] == 'Master':
        return 'boy'
    elif X['Sex'] == 'male':
        return 'man'
    else:
        return 'woman'
data['estAgeGroup'] = data.apply(estimateAgeGroup, axis=1)
# Mean recorded age per estimated group and passenger class
data.groupby(['estAgeGroup', 'Pclass'])['Age'].mean()

In [None]:
# Alternative definition kept for reference:
#data['FamSize'] = data['SibSp'].apply(lambda x: x if x > 0 else 2)
# Log-scaled family size (SibSp + Parch + the passenger), then the mean
# survival rate at each distinct value
data['FamSize'] = np.log(data['SibSp'] + data['Parch'] + 1)
feature = 'FamSize'
temp = data.groupby([feature])['Survived'].mean().reset_index()
sns.lineplot(x=feature, y='Survived', data=temp)

In [None]:
# Alternative definition kept for reference:
#data['FamSize'] = data['SibSp'].apply(lambda x: x if x > 0 else 2)
# NOTE(review): this overwrites the log-scaled FamSize assigned in the
# previous cell, and the plot below uses Age rather than FamSize
data['FamSize'] = (data['SibSp'] + data['Parch']) + (data['Age'] / 70)
sns.histplot(x='Age', hue='Survived', data=data)

In [None]:
# Display whichever `temp` was assigned last (depends on cell execution order)
temp

In [None]:
# 2-D histogram of Parch against SibSp for passengers who did not survive
didNotSurvive = data['Survived'].eq(False)
sns.histplot(data=data.loc[didNotSurvive], x='Parch', y='SibSp')

In [None]:
# x2: family size (Parch + SibSp + 1) plus Age scaled by the maximum Age
data['x2'] = data['Parch'] + data['SibSp'] + 1 + (data['Age'] / data['Age'].max())
# Number of rows sharing each ticket, and the fare divided by that count
data['TicketFrequency'] = data.groupby('Ticket')['Ticket'].transform('count')
data['FareAdj'] = data['Fare'] / data['TicketFrequency']
sns.histplot(x='FareAdj', hue='Pclass', stat='density', data=data)

In [None]:
# NOTE(review): despite the name, this selects third-class females who did
# NOT survive (Pclass == 3, Survived == 0, Sex == 'female') - confirm intent
male1Survived = data.loc[(data['Pclass'] == 3) & (data['Survived'] == 0) & (data['Sex'] == 'female')]
male1Survived

References on writing custom scikit-learn transformers:
  - https://medium.com/analytics-vidhya/scikit-learn-pipelines-with-custom-transformer-a-step-by-step-guide-9b9b886fd2cc
  - https://stackoverflow.com/questions/48320396/create-a-custom-sklearn-transformermixin-that-transforms-categorical-variables-c

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PowerTransformer, KBinsDiscretizer, MinMaxScaler
from sklearn.linear_model import LogisticRegression, Lasso, RidgeCV, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn import set_config
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.kernel_ridge import KernelRidge
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression

In [None]:
# Show estimators as diagrams and re-apply the plotting defaults
set_config(display='diagram')
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(12, 8))

In [None]:
class GroupImputer(BaseEstimator, TransformerMixin):
    """ Extension of SimpleImputer to optionally impute 
        values by group and return a pandas dataframe.

        Parameters
        ----------
        variable : label of the column to impute.
        by : list of column labels to group on (default: no grouping).
            Missing values are first filled from the statistic of the full
            grouping, then from progressively shorter prefixes of `by`,
            and finally from the statistic of the whole column.
        strategy : aggregation passed to pandas `agg` (name or callable).
            'most_frequent' uses the smallest mode of the non-null values.
    """
    
    def __init__(self, variable, by=None, strategy='median'): 
        self.variable = variable
        # A fresh list per instance instead of a shared mutable default
        self.by = [] if by is None else by
        self.strategy = strategy
        self.maps = []

    @staticmethod
    def _mostFrequent(values):
        """ Smallest mode of values, or NaN when there is no mode
            (e.g. every value is missing). """
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def _aggregator(self):
        """ Resolve self.strategy into something pandas `agg` accepts. """
        if isinstance(self.strategy, str) and self.strategy == 'most_frequent':
            return self._mostFrequent
        return self.strategy

    def fit(self, X, y=None):
        """ Learn the per-group and overall fill values from X. """
        aggregate = self._aggregator()
        # Store impute for ungrouped data
        self.simpleImpute = X[self.variable].agg(aggregate)
        # Rebuild the maps on every fit so a refit never keeps stale ones
        self.maps = []
        by = list(self.by)
        # Store maps for all grouping levels, most specific first
        for i in range(len(by), 0, -1):
            subBy = by[:i]
            mapper = X.groupby(subBy)[self.variable].agg(aggregate)
            if i == 1:
                # Single-column groups are keyed by scalars; wrap them in
                # 1-tuples to match the row tuples built in transform()
                mapper = {(k,): v for k, v in mapper.to_dict().items()}
            self.maps.append((subBy, mapper))
        return self

    def transform(self, X, y=None):
        """ Return a copy of X with missing `variable` values filled. """
        X = X.copy()
        for (by, mapper) in self.maps:
            fillVals = X[by].apply(tuple, axis=1).map(mapper)
            X[self.variable] = X[self.variable].fillna(fillVals)
            if not X[self.variable].isnull().values.any():
                break
        else:
            # Replace remaining NaN (with ungrouped)
            X[self.variable] = X[self.variable].fillna(self.simpleImpute)
        return X

In [None]:
class GroupImputer2(BaseEstimator, TransformerMixin):
    """ Extension of SimpleImputer to optionally impute 
        values by group and return a pandas Series.

        Parameters
        ----------
        variable : label of the column to impute.
        by : list of column labels to group on (default: no grouping).
            Missing values are first filled from the statistic of the full
            grouping, then from progressively shorter prefixes of `by`,
            and finally from the statistic of the whole column.
        strategy : aggregation passed to pandas `agg` (name or callable).
            'most_frequent' uses the smallest mode of the non-null values.
    """
    
    def __init__(self, variable, by=None, strategy='median'): 
        self.variable = variable
        # A fresh list per instance instead of a shared mutable default
        self.by = [] if by is None else by
        self.strategy = strategy
        self.maps = []

    @staticmethod
    def _mostFrequent(values):
        """ Smallest mode of values, or NaN when there is no mode
            (e.g. every value is missing). """
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def _aggregator(self):
        """ Resolve self.strategy into something pandas `agg` accepts. """
        if isinstance(self.strategy, str) and self.strategy == 'most_frequent':
            return self._mostFrequent
        return self.strategy

    def fit(self, X, y=None):
        """ Learn the per-group and overall fill values from X. """
        aggregate = self._aggregator()
        # Store impute for ungrouped data
        self.simpleImpute = X[self.variable].agg(aggregate)
        # Rebuild the maps on every fit so a refit never keeps stale ones
        self.maps = []
        by = list(self.by)
        # Store maps for all grouping levels, most specific first
        for i in range(len(by), 0, -1):
            subBy = by[:i]
            mapper = X.groupby(subBy)[self.variable].agg(aggregate)
            if i == 1:
                # Single-column groups are keyed by scalars; wrap them in
                # 1-tuples to match the row tuples built in transform()
                mapper = {(k,): v for k, v in mapper.to_dict().items()}
            self.maps.append((subBy, mapper))
        return self

    def transform(self, X, y=None):
        """ Return the `variable` column of X with missing values filled;
            X itself is not modified. """
        imputed = X[self.variable]
        for (by, mapper) in self.maps:
            fillVals = X[by].apply(tuple, axis=1).map(mapper)
            imputed = imputed.fillna(fillVals)
            if not imputed.isnull().values.any():
                break
        else:
            # Replace remaining NaN (with ungrouped)
            imputed = imputed.fillna(self.simpleImpute)
        return imputed

In [None]:
class FeatureFilter(BaseEstimator, TransformerMixin):
    """ Use for filtering columns by boolean mask.

        `columns` is applied along the second axis of X, so it may be a
        boolean mask or a sequence of integer positions.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """ Nothing to learn; returns self. """
        return self

    def transform(self, X, y=None):
        """ Return X restricted to the selected columns. """
        if hasattr(X, 'iloc'):
            # DataFrames do not support X[:, cols]; select by position
            return X.iloc[:, np.asarray(self.columns)]
        return X[:, self.columns]

In [None]:
class NoTransformer(BaseEstimator, TransformerMixin):
    """ Pass-through transformer: learns nothing and returns its input
        unchanged. """

    def fit(self, X, y=None):
        """ No-op; returns self. """
        return self

    def transform(self, X, y=None):
        """ Return X as received. """
        return X

In [None]:
def plotFeatureImportance(X, y, prePreprocessor, estimator, vline=None):
    """ Fit prePreprocessor followed by estimator on (X, y) and draw a
        horizontal bar chart of the estimator's feature_importances_.

        prePreprocessor must be a pipeline with a 'columnTransform' step
        and, optionally, a 'selector' step; both are used to recover the
        feature names. A vertical line is drawn at vline when it is given.
    """
    steps = [('prePreprocessor', prePreprocessor),
             ('selector',        estimator)]
    clf = Pipeline(steps=steps).fit(X, y)
    fittedPre = clf.named_steps['prePreprocessor']
    fittedEstimator = clf.named_steps['selector']

    columnTransformer = fittedPre.named_steps['columnTransform']
    try:
        selector = fittedPre.named_steps['selector']
    except KeyError:
        # No feature-selection step inside the pre-processing pipeline
        selector = None
    featureNames = getFeatureNames(columnTransformer, selector)

    importances = pd.DataFrame({
        'feature': featureNames,
        'importance': fittedEstimator.feature_importances_})
    features = importances.sort_values(by=['importance'], ascending=False)

    print(f'Total unfiltered features: {len(featureNames)}')
    fig, ax = plt.subplots()
    sns.barplot(y='feature', x='importance', data=features, ax=ax)
    if vline is not None:
        ax.axvline(vline)
    ax.set_ylabel('')
    ax.set_xlabel('Feature importance')
    fig.tight_layout()

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """ Custom imputation and feature engineering 
        of Titanic dataset.

        fit() learns, from the training frame:
          - a grouped median imputer for 'Age' (by 'ageGroup', 'Pclass'),
          - a linear model of 'TicketFrequency' from 'FamSize',
          - a grouped median imputer for 'FareAdj' (by 'Pclass').
        transform() returns a copy of X with the added columns 'Title',
        'ageGroup', 'FamSize', 'TicketFrequency', 'FareAdj' and 'normSibp',
        with 'Age' imputed and zero fares replaced by NaN.
    """
    
    def __init__(self):
        self._imputes = {}
        self._models = {}
        

    def fit(self, X, y=None):
        """ Fit the imputers and the ticket-frequency model on X. """
        X = X.copy()
        # Engineer features required for imputing fits
        X['Title'] = X['Name'].apply(self.getTitle)
        X['ageGroup'] = X.apply(self.estimateAgeGroup, axis=1)
        self.maxAge = X['Age'].max()
        self._imputes['Age'] = GroupImputer2(
            'Age', by=['ageGroup', 'Pclass'], strategy='median').fit(X)
        # Remove misleading 0 fares (crew) and adjust fare before imputing fit
        # Compute true ticket frequency and compute mean per fam size
        X['FamSize'] = X['Parch'] + X['SibSp'] + 1
        X['TicketFrequency'] = X.groupby('Ticket')['Ticket'].transform('count')
        self._models['TicketFrequency'] = LinearRegression().fit(
            X['FamSize'].to_frame(), X['TicketFrequency'])
        # replace() returns a new Series, so the result must be assigned back
        X['Fare'] = X['Fare'].replace(0, np.nan)
        X['FareAdj'] = X['Fare'] / (X['TicketFrequency'] * 10)
        self._imputes['FareAdj'] = GroupImputer2(
            'FareAdj', by=['Pclass'], strategy='median').fit(X)

        return self

    
    def transform(self, X, y=None):
        """ Return a copy of X with engineered and imputed features. """
        X = X.copy()
        X['Title'] = X['Name'].apply(self.getTitle)
        X['ageGroup'] = X.apply(self.estimateAgeGroup, axis=1)
        X['Age'] = self._imputes['Age'].transform(X)
        # Treat 0 fares as missing so that FareAdj is imputed for them;
        # replace() returns a new Series, so assign the result back
        X['Fare'] = X['Fare'].replace(0, np.nan)
        X['FamSize'] = X['Parch'] + X['SibSp'] + 1
        # Ticket frequency is predicted from family size rather than counted,
        # using the model learned in fit()
        X['TicketFrequency'] = self._models['TicketFrequency'].predict(
            X['FamSize'].to_frame())
        X['FareAdj'] = X['Fare'] / (X['TicketFrequency'] * 10)
        X['FareAdj'] = self._imputes['FareAdj'].transform(X)
        X['normSibp'] = X['SibSp'].apply(lambda x: x if x > 0 else 2)
        return X
        
        
    def estimateAgeGroup(self, X):
        """ Classify one passenger row as 'girl', 'boy', 'man' or 'woman'
            from its Title, Parch and Sex values. """
        # Row values are scalars, so use boolean `and` rather than bitwise `&`
        if X['Title'] == 'Miss' and X['Parch'] > 0:
            return 'girl'
        elif X['Title'] == 'Master':
            return 'boy'
        elif X['Sex'] == 'male':
            return 'man'
        else:
            return 'woman'
        
        
    def getTitle(self, x):
        """ Extract title from name: the text between the first ',' and
            the next '.', stripped of surrounding whitespace. """
        # Raw string: '\.' in a normal literal is an invalid escape sequence
        return re.split(r',|\.', x)[1].strip()

        
    def _titles(self):
        """ Mapping from raw titles to grouped titles.
            NOTE(review): 'Miss' maps to 'Mrs' - confirm this is intended. """
        return {
            "Ms":         "Mrs",
            "Mr" :        "Mr",
            "Mrs" :       "Mrs",
            "Miss" :      "Mrs",
            "Master" :    "Master"}
      
        
    def _makeCabin(self, X):
        """ Convert cabin number to cabin section (its first character);
            missing or empty cabins become 'Unknown'. """
        # Work on object dtype so that missing values reach the function
        # and 'Unknown' can be filled in even when Cabin is categorical
        cabins = X['Cabin'].astype(object).apply(
            lambda x: x[0] if isinstance(x, str) and x else np.nan)
        return cabins.fillna('Unknown')

In [None]:
def _transformerFeatureNames(transformer, columns):
    """ Output feature names of one fitted transformer applied to columns. """
    if isinstance(transformer, KBinsDiscretizer):
        if transformer.encode == 'ordinal':
            # One output column per input column
            return columns
        # Non-ordinal encodings: one output column per bin of every input
        return [f'{col}-{i}'
                for col, n in zip(columns, transformer.n_bins_)
                for i in range(n)]
    # Prefer the current scikit-learn API, fall back to the older one
    for getter in ('get_feature_names_out', 'get_feature_names'):
        try:
            return getattr(transformer, getter)()
        except AttributeError:
            continue
    # Transformers that expose no names are assumed to keep their columns
    return columns


def getFeatureNames(columnTransformer, selector=None):
    """ Extract feature names from column transformer. 
        If transformers are pipelines then encoding step
        should be last step of that pipeline
        Ref: https://github.com/scikit-learn/scikit-learn/issues/12525 

        columnTransformer must be fitted. The 'remainder' entry and
        transformers set to 'drop' or given no columns contribute no names.
        When selector is given, the names are filtered with its
        get_support() mask. Returns a numpy array of names.
    """
    colNames = np.array([])
    for name, fitted, columns in columnTransformer.transformers_:
        if name == 'remainder':
            # Skip by name: the remainder entry is not guaranteed to be last
            continue
        if isinstance(columns, (list, tuple)) and len(columns) == 0:
            # Nothing was selected, so the transformer was never fitted
            continue
        if isinstance(fitted, str):
            if fitted == 'drop':
                continue
            # 'passthrough' keeps the selected columns unchanged
            names = columns
        else:
            if isinstance(fitted, Pipeline):
                transformer = fitted.steps[-1][1]
            else:
                transformer = fitted
            names = _transformerFeatureNames(transformer, columns)
        colNames = np.append(colNames, names)
    if selector is not None:
        colNames = colNames[selector.get_support()]
    return colNames

In [None]:
# Load the training data and separate the target column
X = pd.read_csv(train, index_col=index, dtype=dtypes)
y = X.pop(target)

# 80/20 train/validation split; copy each part so it is an independent object
split = train_test_split(X, y, random_state=0, train_size=0.8, test_size=0.2)
X_train, X_valid, y_train, y_valid = (part.copy() for part in split)

# Data pre-processing step

In [None]:
# Per-column-group pipelines. The step names ('imputer', 'onehot', 'ordinal',
# 'discrete') are part of the hyper-parameter keys used during the search.

# Fill missing values with the most frequent value
CountTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])
# Fill missing values, then one-hot encode (unknown categories are ignored)
CatTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot' , OneHotEncoder(handle_unknown='ignore')),
])
# Encode categories as ordinal integers
BinTransformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder())
])
# Discretise into ordinal bins with k-means bin edges
FareDiscretizer = Pipeline(steps=[
    ('discrete', KBinsDiscretizer(encode='ordinal', strategy='kmeans'))
])
AgeDiscretizer = Pipeline(steps=[
    ('discrete', KBinsDiscretizer(encode='ordinal', strategy='kmeans'))
])

In [None]:
# (name, transformer, columns) triples; columns not listed are dropped.
# The names appear in the hyper-parameter keys used during the search.
transformers = ([
    ('Count',  CountTransformer,  []),  # currently applied to no columns
    ('Bin',    BinTransformer,    ['Sex']),
    ('Fare',   FareDiscretizer,   ['FareAdj']),
    ('Age',    AgeDiscretizer,    ['Age']),
    ('None',   NoTransformer(),   ['FamSize', 'normSibp']),
])
preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

In [None]:
# Define a preModel pipeline distinct from modelling step
# Feature engineering followed by the column transformer; the
# 'columnTransform' step name is looked up by later cells
dataEngineering = Pipeline(steps=[
    ('engineer',        FeatureEngineer()),
    ('columnTransform', preprocessor),
])

## Assessing feature importance

In [None]:
# Random forest used to rank the engineered features
selectEstimator = RandomForestClassifier(random_state=1, n_estimators=50, max_features='sqrt')
# `preSelector` was never defined; the pre-model pipeline is `dataEngineering`,
# which provides the 'columnTransform' step plotFeatureImportance looks up
plotFeatureImportance(X_train, y_train, dataEngineering, selectEstimator, 0.015)

### Perform feature selection
  - Combine the preProcess pipeline with feature selector.
  - Run feature selection and identify selected features.
  - Selected features are passed to the hyperparameter tuning pipeline (stage 3).

In [None]:
# Configure the cross-validation procedure
# Configure the cross-validation procedure: 5 shuffled folds, fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=1)
# Number of parallel jobs used by the hyper-parameter search
nJobs = 4

In [None]:
# Pre-processing followed by recursive feature elimination, cross-validated
# on accuracy with `selectEstimator`; the step names are looked up below
featureSelector = Pipeline(steps=[
    ('preProcess',    dataEngineering),
    ('selector',      RFECV(selectEstimator, cv=cv, scoring='accuracy')),
])

In [None]:
# Fit data to pipeline
# Fit data to pipeline
featureSelector.fit(X_train, y_train)
# Extract columnTransformer and selector to extract feature names
columnTransformer = featureSelector.named_steps['preProcess'].named_steps['columnTransform']
selector = featureSelector.named_steps['selector']
featureNames = getFeatureNames(columnTransformer, selector)
# Boolean mask of the features kept by the selector (used by FeatureFilter)
selectedFeatures = selector.get_support()
# Create dataframe of transformed data
transformedDF = pd.DataFrame(
    featureSelector.transform(X_valid),
    columns=featureNames)
# Keep the preview as the last expression so the notebook displays it
transformedDF.head()

## Estimator hypertuning

In [None]:
# Final model: pre-processing, keep only the selected features, then a
# random forest. The step names prefix the search-space keys.
fullModel = Pipeline(steps=[
    ('preProcess',    dataEngineering),
    ('featureFilter', FeatureFilter(selectedFeatures)),
    ('model',         RandomForestClassifier(random_state=1)),
])

In [None]:
# Search space as a list of dictionaries (a single one here): number of
# bins for the Fare and Age discretisers plus the random forest settings.
# Keys follow <pipeline step>__<transformer>__<step>__<parameter>.
params =([
    {'preProcess__columnTransform__Fare__discrete__n_bins': Integer(2, 6),
     'preProcess__columnTransform__Age__discrete__n_bins': Integer(2, 6),
     'model__n_estimators':      Integer(10, 1000),
     'model__max_depth':         Integer(3, 20),
     'model__max_features':      Categorical(['sqrt']),
     'model__criterion':         Categorical(['gini'])},
])

In [None]:
# Bayesian hyper-parameter search (100 iterations, accuracy, 5-fold CV).
# random_state fixes the sampled candidates so a re-run reproduces the
# same search; refit=True refits the best pipeline on the training split.
gridSearch = BayesSearchCV(
    fullModel, params, scoring='accuracy',
    cv=cv, refit=True, n_jobs=nJobs, n_iter=100,
    random_state=1)
gridSearch.fit(X_train, y_train)

In [None]:
# Score of the refitted best estimator on the held-out validation split
score = gridSearch.score(X_valid, y_valid)
print(f'Best score: {score}')

# Hyper-parameters of the best candidate found by the search
gridSearch.best_params_

### Generate predictions
 - Refit model with the full training dataset, then predict on the test set

In [None]:
# Refit on the full training data (currently disabled, so predictions come
# from the model fitted on X_train only):
#gridSearch.fit(X, y)
X_test = pd.read_csv(test, index_col=index, dtype=dtypes)
# Cast the predictions to integers for the submission file
predictions = gridSearch.predict(X_test).astype(int)
submission = pd.DataFrame({'PassengerId':X_test.index,'Survived': predictions})
submission.to_csv('submission.csv', index=False)

# Interpretation - assessing feature importance

In [None]:
import eli5

In [None]:
# The pre-processing step of fullModel is named 'preProcess'
# (a 'preModel' step does not exist and raised a KeyError)
preModelTransformer = gridSearch.best_estimator_.named_steps['preProcess']
columnTransformer = preModelTransformer.named_steps['columnTransform']
model = gridSearch.best_estimator_.named_steps['model']
# Names of all column-transformer outputs, before the feature filter
featureNames = getFeatureNames(columnTransformer)

### Try out the preModelTransformer

In [None]:
# Raw input features before any engineering
X.head()

In [None]:
# Validation data after the pre-processing step only (no feature filter),
# labelled with the unfiltered feature names
transformedDf = pd.DataFrame(preModelTransformer.transform(X_valid), columns=featureNames)
transformedDf.head()

In [None]:
# The model was fitted on the columns kept by FeatureFilter, so pass only
# the names of the selected features
eli5.explain_weights(
    model, feature_names=list(np.asarray(featureNames)[selectedFeatures]))

In [None]:
# Family size including the passenger, and Fare per family size and class
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
sns.barplot(y='Fare', x='FamilySize', hue='Pclass', data=data)

In [None]:
# NOTE(review): overwrites the earlier FareAdj (Fare / TicketFrequency)
# with Fare divided by FamilySize
data['FareAdj'] = data['Fare'] / data['FamilySize']

In [None]:
# FareAdj distribution for third-class passengers only
sns.histplot(data=data[data['Pclass']==3], x='FareAdj', hue='Pclass')

In [None]:
# KDE of x2 split by survival; the figure-level displot accepts
# data/hue/kind, which the deprecated distplot does not
sns.displot(data=data, x='x2', hue='Survived', kind='kde')

## Hard encoded rules

In [None]:
class RuleAugmentedEstimator(BaseEstimator, ClassifierMixin):
    """ Augments sklearn estimators with rule-based logic.
        This class is a wrapper class for sklearn estimators with the additional
    possibility of adding rule-based logic to the underlying estimator.
    The provided rules are hard-coded and take precedence over the underlying
    estimator's predictions.

    Rules:
      - male passengers outside first class are predicted False
      - female passengers in first class are predicted True
    Every other row is predicted by the base model, which is trained only
    on rows that no rule covers.
    """

    def __init__(self, base_model: BaseEstimator, **baseParams):  
        self.base_model = base_model
        self.base_model.set_params(**baseParams)


    @staticmethod
    def _rule_masks(X):
        """Returns boolean arrays (predicted False, predicted True) marking
        the rows of X that are decided by the hard-coded rules."""
        ruleFalse = ((X['Sex'] == 'male') & (X['Pclass'] != 1)).to_numpy()
        ruleTrue = ((X['Sex'] == 'female') & (X['Pclass'] == 1)).to_numpy()
        return ruleFalse, ruleTrue


    def _get_base_model_data(self, X, y):
        """Filters the training data for data points not affected by the rules."""
        ruleFalse, ruleTrue = self._rule_masks(X)
        # Keep only the rows no rule decides; positional masks avoid any
        # dependence on X and y sharing index labels
        mask = ~(ruleFalse | ruleTrue)
        train_x = X.loc[mask].reset_index(drop=True)
        train_y = y.loc[mask].reset_index(drop=True)
        return train_x, train_y


    def fit(self, X, y, **kwargs):
        """Fits the base model on the rows not covered by the rules.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        train_x, train_y = self._get_base_model_data(X, y)
        self.base_model.fit(train_x, train_y, **kwargs)
        return self


    def predict(self, X: pd.DataFrame) -> list:
        """Gets predictions for the provided feature data.
        
        The predictions are evaluated using the provided rules wherever possible
        otherwise the underlying estimator is used.
        
        Args:
            X: The feature data to evaluate predictions for.
        
        Returns:
            list: Evaluated predictions, one per row of X.
        """
        ruleFalse, ruleTrue = self._rule_masks(X)
        # Object dtype so booleans and model outputs can be mixed freely
        predictions = np.empty(len(X), dtype=object)
        predictions[ruleFalse] = False
        predictions[ruleTrue] = True

        remaining = ~(ruleFalse | ruleTrue)
        if remaining.any():
            # Only rows without a rule-based answer go to the base model
            predictions[remaining] = self.base_model.predict(X.loc[remaining])
        return list(predictions)


    def get_params(self, deep: bool = True):
        """Returns the parameters of the underlying base model."""
        return self.base_model.get_params(deep=deep)


    def set_params(self, **params):
        """Sets parameters on the underlying base model, mirroring
        get_params, and returns self."""
        self.base_model.set_params(**params)
        return self
    
