# Test Train Split

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
import xgboost as xgb

# Load the raw dataset, order the rows by the indexing feature and renumber
# them 0..n-1 so that row position follows that ordering.
df = (
    pd.read_csv("data.csv")
    .sort_values(by='indexingFeature')
    .reset_index(drop=True)
)

# Drop the identification features (the list below is a placeholder).
colsToDrop = ['...']
df = df.drop(columns=colsToDrop)

In [None]:
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency

# Relate "value is missing" to the binary target, one row of metrics per column.
isNa = df.isna()

# Output column order: naProp, then supp/conf/lift for the four cells
# (missing=True/False x target=0/1), then the two p-values.
metricNames = ['naProp']
for naTag in ('T', 'F'):
    for targetClass in (0, 1):
        metricNames += [f'supp{naTag}{targetClass}', f'conf{naTag}{targetClass}', f'lift{naTag}{targetClass}']
metricNames += ['fisherP', 'chi2P']

data = []
for column in isNa.columns:
    # 2x2 contingency table; reindex guarantees all four cells exist.
    crosstab = pd.crosstab(isNa[column], df.targetFeature).reindex(index=[False, True], columns=[0, 1], fill_value=0)

    naProp = isNa[column].mean()
    row = [naProp]
    for missing in (True, False):
        # +1 in the denominators avoids division by zero for empty rows/columns.
        rowTotal = crosstab.loc[missing, :].sum() + 1
        for targetClass in (0, 1):
            colTotal = crosstab.loc[:, targetClass].sum() + 1
            cellCount = crosstab.loc[missing, targetClass]
            row.append(cellCount)                          # "support": raw cell count
            row.append(cellCount / rowTotal)               # "confidence": ~P(class | missing flag)
            # "lift" as computed here is count/(rowTotal*colTotal); it is not
            # multiplied by the number of rows, so it is proportional to, not
            # equal to, P(class | flag)/P(class).
            row.append(cellCount / (rowTotal * colTotal))

    fisherStat, fisherP = fisher_exact(crosstab, alternative='two-sided')
    # 0.5 is added to every cell before the chi-squared test.
    chi2Stat, chi2P, chi2Dof, chi2Expected = chi2_contingency(crosstab + 0.5)

    row += [fisherP, chi2P]
    data.append(row)

liftAnalysis = pd.DataFrame(data, columns=metricNames, index=isNa.columns)

# Log p-values of both tests, one point per column.
plt.figure(figsize=(30, 5))
np.log(liftAnalysis['fisherP']).plot(label='fisher exact test')
np.log(liftAnalysis['chi2P']).plot(label='chi squared test')
plt.legend()
plt.show()

In [None]:
# Add one boolean "<column>_isNA" indicator for every existing column.
isNa = df.isna().add_suffix('_isNA')
df = df.join(isNa, how='left')

# Split on the indexing feature: [2021-01-01, 2024-01-01) is train,
# 2024-01-01 and later is test. Rows before 2021-01-01 land in neither set.
from2021 = df.indexingFeature >= '2021-01-01'
before2024 = df.indexingFeature < '2024-01-01'
from2024 = df.indexingFeature >= '2024-01-01'

dfTrain = df[before2024 & from2021]
trainIndex = dfTrain.index
dfTest = df[from2024]

# The index is written out as the first (unnamed) CSV column.
dfTrain.to_csv('raw_train.csv')
dfTest.to_csv('raw_test.csv')

# Data Cleaning

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class RemoveEmptyFeatures(BaseEstimator, TransformerMixin):
    """Keep only the columns whose share of missing values is at most ``threshold``.

    The missing-value share of each column is measured in ``fit``; ``transform``
    selects the surviving columns from whatever frame it is given.
    """

    def __init__(self, threshold=0.9, verbose=True):
        self.threshold = threshold
        self.verbose = verbose
        # Series: column name -> fraction of NaN, sorted ascending (set by fit).
        self.emptyFeatures = None

    def fit(self, X, y=None):
        """Record the fraction of missing values per column of X."""
        self.print("=== RemoveEmptyFeatures ===")
        naShare = X.isna().mean()
        self.emptyFeatures = naShare.sort_values()
        return self

    def transform(self, X):
        """Return a copy of X restricted to the columns at or below the threshold.

        The returned columns follow the order of ``emptyFeatures`` (ascending
        missing share), not the original column order of X.
        """
        keep = self.emptyFeatures <= self.threshold
        keptNames = self.emptyFeatures.loc[keep].index
        return X[keptNames].copy()

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if not self.verbose:
            return
        print(msg)

In [None]:
class RemoveColumnPerFrequency(BaseEstimator, TransformerMixin):
    """Drop columns in which a single value dominates.

    ``fit`` stores the most frequent value of every column (NaN counts as a
    value). ``transform`` measures how often that stored value occurs in the
    frame being transformed and drops the column when the share is at least
    ``threshold`` (the default of 1 removes constant columns only).
    """

    def __init__(self, threshold=1, verbose=True):
        self.threshold = threshold
        # Series: column name -> most frequent value seen in fit.
        self.featureModes = None
        self.verbose=verbose

    def fit(self, X, y=None):
        """Record the first mode of every column of X."""
        self.print("=== RemoveColumnPerFrequency ===")
        self.featureModes = X.mode(dropna=False).iloc[0]
        return self

    def transform(self, X):
        """Return a copy of X without the columns dominated by their fitted mode."""
        XLen = len(X)
        toDrop = []
        for column in self.featureModes.index:
            value = self.featureModes[column]
            if pd.isna(value):
                modeFreq = X[column].isna().sum()
            else:
                # Count by comparison instead of value_counts()[value]: the
                # lookup raised KeyError when the fitted mode does not occur
                # in the frame being transformed.
                modeFreq = (X[column] == value).sum()
            modeProp = modeFreq/XLen
            if modeProp >= self.threshold:
                self.print(f"dropping: {column} [mode proportion {modeProp}]")
                toDrop.append(column)
        return X.drop(columns=toDrop)

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

In [None]:
class RemoveDuplicatedColumns(BaseEstimator, TransformerMixin):
    """Drop every column that is an exact copy of an earlier column.

    Nothing is learned in ``fit``; duplicates are detected on the frame passed
    to ``transform`` using ``Series.equals``.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose

    def fit(self, X, y=None):
        """Stateless; only prints the step header."""
        self.print("=== RemoveDuplicatedColumns ===")
        return self

    def transform(self, X):
        """Return a copy of X keeping the first column of each group of equal columns."""
        names = list(X.columns)
        dropped = set()
        for position, keeper in enumerate(names):
            for candidate in names[position + 1:]:
                # A column already removed is never reported a second time.
                if candidate in dropped:
                    continue
                if X[keeper].equals(X[candidate]):
                    self.print(f"dropping: {candidate} [{keeper} equals {candidate}]")
                    dropped.add(candidate)
        return X.drop(columns=list(dropped))

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if not self.verbose:
            return
        print(msg)

In [None]:
from pandas.api.types import is_numeric_dtype

class RemoveCorrelatedColumns(BaseEstimator, TransformerMixin):
    """Drop one column out of every strongly correlated numeric pair.

    ``fit`` stores the pairwise correlation matrix of the numeric columns of X
    (which must include ``targetFeature`` for the tie-break to work).
    ``transform`` walks the column pairs in matrix order; when the absolute
    correlation of a pair reaches ``threshold``, the column with the smaller
    absolute correlation to the target is dropped. The target itself is never
    dropped.
    """

    def __init__(self, threshold=0.75, targetFeature='targetFeature', verbose=True):
        self.threshold = threshold
        self.targetFeature = targetFeature
        # DataFrame of pairwise correlations between numeric columns (set by fit).
        self.pairWiseCorr = None
        self.verbose=verbose

    def fit(self, X, y=None):
        """Compute the correlation matrix of the numeric columns of X."""
        self.print("=== RemoveCorrelatedColumns ===")
        self.pairWiseCorr = X.select_dtypes(include=['number']).corr()
        return self

    def _targetCorr(self, column):
        """Absolute correlation of column with the target; NaN counts as 0.

        A NaN here made every ``>=`` comparison False, so a column with a known
        target correlation could be dropped in favour of one whose target
        correlation is undefined.
        """
        value = abs(self.pairWiseCorr.loc[column, self.targetFeature])
        return 0.0 if pd.isna(value) else value

    def transform(self, X):
        """Return a copy of X with the weaker column of each correlated pair removed."""
        newX = X.copy()
        limit = abs(self.threshold)
        columns = list(self.pairWiseCorr.columns)
        for i, colI in enumerate(columns):
            if colI not in newX.columns or colI==self.targetFeature:
                continue
            for colJ in columns[i+1:]:
                if colJ not in newX.columns or colJ==self.targetFeature:
                    continue
                corr = abs(self.pairWiseCorr.loc[colI, colJ])
                # Written as "not >=" so an undefined (NaN) pair correlation is skipped.
                if not corr >= limit:
                    continue
                colICorr = self._targetCorr(colI)
                colJCorr = self._targetCorr(colJ)
                if colICorr >= colJCorr:
                    self.print(f"dropping: {colJ} [Corr(i,j)={corr} Corr(i,t)={colICorr}]")
                    newX.drop(colJ, axis=1, inplace=True)
                else:
                    self.print(f"dropping: {colI} [Corr(i,j)={corr} Corr(j,t)={colJCorr}]")
                    newX.drop(colI, axis=1, inplace=True)
                    # colI is gone; move on to the next outer column.
                    break
        return newX

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

In [None]:
from sklearn.pipeline import Pipeline

# Reload the raw training split; the saved index comes back from the unnamed first column.
dfTrain = pd.read_csv('raw_train.csv', index_col='Unnamed: 0')

# Sanity checks: rows must still be in index order and in date order.
# NOTE(review): this cell uses `extraction_date` / `idc_trgt`, whereas other
# cells use `indexingFeature` / `targetFeature` — confirm which names the data has.
assert dfTrain.index.is_monotonic_increasing
assert dfTrain.extraction_date.is_monotonic_increasing

# Column-removal steps, applied in this order.
cleaningSteps = [
    ('RemoveEmptyFeatures', RemoveEmptyFeatures(threshold=0.9, verbose=True)),
    ('RemoveColumnPerFrequency', RemoveColumnPerFrequency(threshold=1, verbose=True)),
    ('RemoveDuplicatedColumns', RemoveDuplicatedColumns(verbose=True)),
    ('RemoveCorrelatedColumns', RemoveCorrelatedColumns(threshold=0.75, targetFeature='idc_trgt', verbose=True)),
]
pipeline = Pipeline(cleaningSteps)

dfTrainClean = pipeline.fit_transform(dfTrain, dfTrain.idc_trgt)
dfTrainClean.to_csv("clean_train.csv")

In [None]:
# The test split is not passed through the cleaning pipeline; it simply keeps
# the columns that survived cleaning on the training split.
dfTest = pd.read_csv('raw_test.csv', index_col='Unnamed: 0')
keptColumns = dfTrainClean.columns
dfTestClean = dfTest.loc[:, keptColumns]
dfTestClean.to_csv("clean_test.csv")

# Data Encoding

In [None]:
import re

class EncodeDateTimeColumns(BaseEstimator, TransformerMixin):
    """Replace date-like text columns by month / day / day-of-week integers.

    ``fit`` selects the object-dtype columns whose name matches any regex in
    ``patterns`` (the indexing feature is always left alone). ``transform``
    parses each selected column with ``pd.to_datetime(errors='coerce')``, adds
    ``<col>_month``, ``<col>_day`` and ``<col>_dayOfWeek`` as nullable ``Int64``
    columns and drops the original column.

    After ``transform``, ``columns`` lists the names of the columns created by
    the most recent call.
    """

    # The default is a tuple so that no mutable object is shared between instances.
    def __init__(self, patterns=(r".*_dt.*", r".*_dttm.*"), indexingFeature='indexingFeature', verbose=True):
        self.patterns = patterns
        self.indexingFeature = indexingFeature
        self.verbose=verbose
        self.dateTimeColumns = None
        self.columns = []

    def fit(self, X, y=None):
        """Select the object columns of X whose name matches one of the patterns."""
        self.print("=== EncodeDateTimeColumns ===")
        xObjects = X.select_dtypes(include=['object'])
        self.dateTimeColumns = [column for column in xObjects.columns if any(re.search(pattern, column) for pattern in self.patterns)]
        if self.indexingFeature in self.dateTimeColumns:
            self.dateTimeColumns.remove(self.indexingFeature)
        return self

    def transform(self, X):
        """Return a copy of X with every selected column expanded into date parts."""
        newX = X.copy()
        # Built from scratch on every call; appending to self.columns made the
        # list grow with duplicates each time transform was invoked.
        created = []
        for column in self.dateTimeColumns:
            self.print(f"converting: {column}")
            # Unparseable values become NaT, which turns into <NA> below.
            parsed = pd.to_datetime(newX[column], errors='coerce')
            newX[f"{column}_month"] = parsed.dt.month.astype("Int64")
            newX[f"{column}_day"] = parsed.dt.day.astype("Int64")
            newX[f"{column}_dayOfWeek"] = parsed.dt.dayofweek.astype("Int64")
            # _dayOfWeek is reported too; it was created but left out of the
            # list, so code building its categorical-column list from
            # `columns` treated it as numeric.
            created += [f"{column}_day", f"{column}_month", f"{column}_dayOfWeek"]
            newX.drop(column, axis=1, inplace=True)
        self.columns = created
        return newX

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

In [None]:
class EncodeLowCardinality(BaseEstimator, TransformerMixin):
    """One-hot encode the text columns that have fewer than ``threshold`` distinct values.

    Text values are stripped of surrounding whitespace before counting and
    encoding. When ``parentDf`` (a reference frame, e.g. the training data) is
    given, both the choice of columns and the set of dummy columns are taken
    from it, so the output has exactly the dummy columns the reference frame
    produces, in the same order.

    After ``transform``, ``columns`` lists the dummy column names.
    """

    def __init__(self, threshold=50, indexingFeature='indexingFeature', parentDf=None, verbose=True):
        self.threshold = threshold
        self.indexingFeature = indexingFeature
        self.parentDf = parentDf
        if self.parentDf is not None:
            self.parentDf = self.parentDf.map(lambda x: x.strip() if isinstance(x, str) else x)
        self.verbose=verbose
        self.lowCardColumns = None
        self.columns = []

    def fit(self, X, y=None):
        """Select the object columns of X with fewer than ``threshold`` distinct values."""
        self.print("=== EncodeLowCardinality ===")
        xObjects = X.select_dtypes(include=['object']).astype(str).map(lambda x: x.strip())

        # Cardinality is measured on the reference frame when there is one.
        # It gets the same astype(str)/strip treatment as X so that missing
        # values count as a category in both cases; otherwise a column sitting
        # right at the threshold could be classified differently with and
        # without parentDf.
        reference = xObjects
        if self.parentDf is not None:
            reference = self.parentDf[list(xObjects.columns)].astype(str).map(lambda x: x.strip())
        self.lowCardColumns = [column for column in xObjects.columns if reference[column].nunique() < self.threshold]

        if self.indexingFeature in self.lowCardColumns:
            self.lowCardColumns.remove(self.indexingFeature)
        return self

    def transform(self, X):
        """Return a copy of X with the selected columns replaced by 0/1 dummy columns."""
        newX = X.copy().map(lambda x: x.strip() if isinstance(x, str) else x)
        for column in self.lowCardColumns:
            self.print(f"converting: {column} [{newX[column].nunique()} unique values]")
        oneHotColumns = pd.get_dummies(newX[self.lowCardColumns]).astype(int)

        if self.parentDf is not None:
            # Align to the reference dummies: categories unknown to the
            # reference are dropped, categories absent from X become all-zero
            # columns. reindex keeps the reference column order, whereas
            # building the selection from Python sets gave an arbitrary order.
            parentDummyNames = pd.get_dummies(self.parentDf[self.lowCardColumns]).columns
            oneHotColumns = oneHotColumns.reindex(columns=parentDummyNames, fill_value=0)
        self.columns = list(oneHotColumns.columns)

        newX = newX.join(oneHotColumns, how='inner')
        newX.drop(self.lowCardColumns, axis=1, inplace=True)
        return newX

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

In [None]:
class EncodeHighCardinality(BaseEstimator, TransformerMixin):
    """Target-encode the text columns that have at least ``threshold`` distinct values.

    Each category is replaced by the running mean of the target over all
    *earlier* values of the indexing feature for that category (0 for the first
    occurrence). When ``parentDf`` is given, its rows are added to the history
    used for the running mean and it decides which columns count as high
    cardinality.

    X must contain both ``targetFeature`` and ``indexingFeature``.
    After ``transform``, ``columns`` lists the selected column names.
    """

    def __init__(self, threshold=50,  targetFeature='targetFeature', indexingFeature='indexingFeature', parentDf=None, verbose=True):
        self.threshold = threshold
        self.targetFeature = targetFeature
        self.indexingFeature = indexingFeature
        self.parentDf = parentDf
        if self.parentDf is not None:
            self.parentDf = self.parentDf.map(lambda x: x.strip() if isinstance(x, str) else x)
        self.verbose=verbose
        self.highCardColumns = None
        self.columns = []

    def fit(self, X, y=None):
        """Select the object columns of X with at least ``threshold`` distinct values."""
        self.print("=== EncodeHighCardinality ===")
        xObjects = X.select_dtypes(include=['object']).astype(str).map(lambda x: x.strip())

        # With a reference frame, cardinality is measured there (with the same
        # astype(str)/strip treatment as X). The parent-based result used to be
        # stored in `lowCardColumns` by mistake, so it never took effect.
        reference = xObjects
        if self.parentDf is not None:
            reference = self.parentDf[list(xObjects.columns)].astype(str).map(lambda x: x.strip())
        self.highCardColumns = [column for column in xObjects.columns if reference[column].nunique() >= self.threshold]

        if self.indexingFeature in self.highCardColumns:
            self.highCardColumns.remove(self.indexingFeature)
        return self

    def transform(self, X):
        """Return a copy of X with the selected columns replaced by their float encoding."""
        newX = X.copy().map(lambda x: x.strip() if isinstance(x, str) else x)

        # History used for the running means: X alone, or X plus the reference frame.
        xCombined = newX
        if self.parentDf is not None:
            xCombined = pd.concat([newX, self.parentDf])

        for column in self.highCardColumns:
            if column == self.targetFeature:
                continue
            self.print(f"converting: {column} [{X[column].nunique()} unique values]")
            keys = [column, self.indexingFeature]
            target_map = self.target_encoding(xCombined, keys)
            mapping = newX[keys].merge(target_map.reset_index(), on=keys, how='left')
            # Only rows whose category is missing may end up without an encoding.
            assert mapping['mean'].isna().sum() == newX[column].isna().sum()
            # merge() returns a fresh 0..n-1 index with rows in newX's order, so
            # the values are assigned positionally. Assigning the Series itself
            # aligned on index labels and misplaced the values whenever newX
            # was not indexed 0..n-1.
            newX[column] = mapping['mean'].to_numpy(dtype=float)
        self.columns = self.highCardColumns
        return newX

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

    def target_encoding(self, df, columns):
        """Running target mean per category, lagged by one indexing value.

        ``columns`` is ``[category_column, indexing_column]``. Returns a
        one-column frame ``mean`` indexed by those two columns: for each
        category, the cumulative target mean up to the *previous* indexing
        value, with 0 where there is no previous value.
        """
        category = columns[0]
        grouping = df.groupby(columns)[self.targetFeature]
        # Cumulative target sum and non-null count within each category.
        runningSum = grouping.sum().reset_index().groupby(category)[self.targetFeature].cumsum()
        runningCount = grouping.count().reset_index().groupby(category)[self.targetFeature].cumsum()
        encoding = grouping.count().reset_index()
        encoding['mean'] = runningSum/runningCount
        encoding = encoding[columns + ['mean']].set_index(columns)
        # shift(1) keeps a row from seeing its own (or later) target values.
        return pd.DataFrame(encoding.groupby(category)['mean'].shift(1).fillna(0), columns=['mean'])

In [None]:
from sklearn.utils import resample

class DownSample(BaseEstimator, TransformerMixin):
    """Reduce the negative class to at most ``negToPosRatio`` rows per positive row.

    Rows with target 1 are all kept; rows with target 0 are sampled without
    replacement (seeded, so the result is reproducible). The result is sorted
    by index.
    """

    def __init__(self, negToPosRatio=2,  targetFeature='targetFeature', verbose=True):
        self.negToPosRatio = negToPosRatio
        self.targetFeature = targetFeature
        self.verbose=verbose

    def fit(self, X, y=None):
        """Stateless; only prints the step header."""
        self.print("=== DownSample ===")
        return self

    def transform(self, X):
        """Return the positives of X plus a sample of its negatives, sorted by index."""
        newX = X.copy()
        neg = newX[newX[self.targetFeature]==0]
        pos = newX[newX[self.targetFeature]==1]
        # resample() bootstraps (replace=True) by default, which duplicated
        # rows and index labels instead of down-sampling. Sampling without
        # replacement cannot return more rows than exist, hence the cap.
        nNeg = min(len(neg), len(pos)*self.negToPosRatio)
        neg = resample(neg, replace=False, n_samples=nNeg, random_state=0)
        newX = pd.concat([pos, neg]).sort_index()
        return newX

    def print(self, msg):
        """Print msg only when verbose is enabled."""
        if self.verbose:
            print(msg)

In [None]:
# Reload the cleaned training split and check that it is still ordered.
dfTrainClean = pd.read_csv("clean_train.csv", index_col='Unnamed: 0')
assert dfTrainClean.index.is_monotonic_increasing
assert dfTrainClean.indexingFeature.is_monotonic_increasing

# Encoding steps for the training data, applied in this order; down-sampling
# of the negative class comes last.
encodingSteps = [
    ('EncodeDateTimeColumns', EncodeDateTimeColumns(patterns=[r".*_dt.*", r".*_dttm.*"], indexingFeature='indexingFeature', verbose=True)),
    ('EncodeLowCardinality', EncodeLowCardinality(threshold=50, indexingFeature='indexingFeature', verbose=True)),
    ('EncodeHighCardinality', EncodeHighCardinality(threshold=50,  targetFeature='targetFeature', indexingFeature='indexingFeature', verbose=True)),
    ('DownSample', DownSample(negToPosRatio=2,  targetFeature='targetFeature', verbose=True)),
]
pipeline = Pipeline(encodingSteps)

# NOTE(review): y is taken from dfTrain.idc_trgt while the steps are configured
# with targetFeature='targetFeature' — confirm the actual target column name.
dfTrainEnc = pipeline.fit_transform(dfTrainClean, dfTrain.idc_trgt)
dfTrainEnc.to_csv("encoded_train.csv")

In [None]:
# Reload both cleaned splits; the training split serves as the reference
# frame (parentDf) for the encoders below.
dfTestClean = pd.read_csv('clean_test.csv', index_col='Unnamed: 0')
dfTrainClean = pd.read_csv("clean_train.csv", index_col='Unnamed: 0')
assert dfTestClean.index.is_monotonic_increasing
assert dfTestClean.indexingFeature.is_monotonic_increasing

# Same encoders as for training, minus down-sampling, fitted on the test split
# with the training split as reference.
testEncodingSteps = [
    ('EncodeDateTimeColumns', EncodeDateTimeColumns(patterns=[r".*_dt.*", r".*_dttm.*"], indexingFeature='indexingFeature', verbose=True)),
    ('EncodeLowCardinality', EncodeLowCardinality(threshold=50, indexingFeature='indexingFeature', parentDf=dfTrainClean, verbose=True)),
    ('EncodeHighCardinality', EncodeHighCardinality(threshold=50,  targetFeature='targetFeature', indexingFeature='indexingFeature', parentDf=dfTrainClean, verbose=True)),
]
testEncPipeline = Pipeline(testEncodingSteps)

dfTestEnc = testEncPipeline.fit_transform(dfTestClean, dfTestClean.idc_trgt)
dfTestEnc.to_csv("encoded_test.csv")

# Data Imputation

In [None]:
def time_series_split(df, target_variable='indexingFeature', n_splits=100, gap=100):
    """Build expanding-window train/test boundaries that never cut through a group.

    Rows sharing a value of ``target_variable`` always stay together: segment
    and gap boundaries are only placed on the last row of such a group. All
    returned numbers are row positions (0-based), not index labels.

    Parameters:
        df: frame whose index and ``target_variable`` column are both
            monotonically increasing (asserted).
        target_variable: column that defines the indivisible groups.
        n_splits: number of segments used to derive the minimum segment size.
        gap: minimum distance, in rows, between the end of a segment and the
            group boundary that closes the gap after it.

    Returns:
        A list of ``(train_end, (test_start, test_end))`` tuples, all bounds
        inclusive: train on rows ``0..train_end`` (the end of the previous
        segment) and test on the next segment.
    """
    assert df.index.is_monotonic_increasing
    assert df[target_variable].is_monotonic_increasing

    rowCount = len(df)

    # Last row position of every distinct target_variable value, ordered by value.
    lastPositions = pd.Series(range(rowCount), index=df[target_variable].to_numpy()).groupby(level=0).max()

    # Minimum number of rows a segment must span before it can be closed.
    segmentSize = (rowCount - gap * (n_splits - 1)) / n_splits

    segments = []
    anchor = 0          # position of the most recent boundary
    segmentStart = 0
    inGap = False       # True while skipping the rows between two segments
    for position in lastPositions:
        required = gap if inGap else segmentSize
        if position - anchor < required:
            continue
        anchor = position
        if inGap:
            # The gap is long enough: the next segment starts right after this group.
            segmentStart = position + 1
        else:
            segments.append((segmentStart, position))
        inGap = not inGap

    # Pair each segment's end (train boundary) with the following segment (test range).
    return [(finished[1], upcoming) for finished, upcoming in zip(segments, segments[1:])]

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score

def miss_forest(df, startIndex=0, numCols=[], partitions=10, partition_variable='indexingFeature', rounds=5):
    """Impute missing values column by column with random forests, forward in time.

    Every column of ``df`` that has both missing and observed values is
    predicted from all other columns: a ``RandomForestRegressor`` for columns
    listed in ``numCols``, a ``RandomForestClassifier`` for the rest. For each
    time partition the model is fitted on observed rows before the partition
    and used to fill the missing rows inside it.

    Parameters:
        df: input frame; must contain ``partition_variable``.
        startIndex: start of the region that is partitioned and imputed.
        numCols: names of the columns treated as numeric.
        partitions: not used in the body (the split count is fixed at 10 below).
        partition_variable: column handed to ``time_series_split``; it is
            dropped from the returned frame.
        rounds: the loop runs while ``round <= rounds`` and the change measure
            ``l`` is not positive.

    Returns:
        ``(X_new, evals)`` where ``X_new`` is the imputed copy of ``df`` without
        ``partition_variable`` and ``evals[round][category][column]`` holds two
        lists, ``'impute'`` and ``'base'``: mean absolute error for numeric
        columns, macro F1 for the others, one entry per partition, for the model
        and for a constant mean/mode baseline respectively.
    """
    endIndex = max(df.index)  # not used further down
    # NOTE(review): df[startIndex:] is a plain slice while startIndex is passed
    # as an index label by the caller, and the boundaries returned by
    # time_series_split are row positions of the sliced frame that are later
    # compared against index labels — confirm this lines up when the index is
    # not 0..n-1.
    partitionRanges = time_series_split(df[startIndex:], target_variable=partition_variable, n_splits=10, gap=0)
    X_new = df.copy()
    X_new.drop(partition_variable, axis=1, inplace=True)
    catCols = [col for col in X_new.columns if col not in numCols]
    # Initial guess: column mean for numeric columns, first mode for the others.
    X_new.loc[:, numCols] = X_new[numCols].astype('float').fillna(X_new[numCols].mean())
    X_new.loc[:, catCols] = X_new[catCols].fillna(X_new[catCols].mode().T.to_dict()[0])

    round = 0
    evals = {}
    l = 0
    # Columns ordered from fewest to most missing values.
    k = df.loc[startIndex:].isna().sum().sort_values().index

    while l <= 0 and round <= rounds:
        evals[round] = {}
        evals[round]['cat'] = {}
        evals[round]['num'] = {}
        # Numerators/denominators of the change measure; denominators start at
        # 1 so the final division is always defined.
        lCatNum = 0
        lCatDen = 1
        lNumNum = 0
        lNumDen = 1
        # Predictions of this round are based on the previous round's values.
        X_old = X_new.copy()

        for s in k:
            # Nothing to impute, or nothing to learn from.
            if df[s].isna().sum() <= 0 or df[s].notna().sum() <= 0:
                    continue
            category = 'num' if s in numCols else 'cat'
            print(f"{s} ({category}): obs{df[s].notna().sum()} mis{df[s].isna().sum()}")
            obsCols = X_old.columns[X_old.columns != s]
            obsIndex = df[df[s].notna()].index
            misIndex = df[df[s].isna()].index
            evals[round][category][s] = {}
            evals[round][category][s]['base'] = []
            evals[round][category][s]['impute'] = []

            for (trainLeftIndex, (testLeftIndex, testRightIndex)) in partitionRanges:
                print(".", end="")
                # Training rows: observed values strictly before the train boundary.
                i_obs = obsIndex[obsIndex <trainLeftIndex]
                # Rows to fill / rows to score, both inside the test range.
                i_mis = misIndex[(misIndex <= testRightIndex) & (misIndex >= testLeftIndex)]
                i_eval = obsIndex[(obsIndex <= testRightIndex) & (obsIndex >= testLeftIndex)]
                assert df.loc[i_obs, s].isna().sum() == 0
                assert df.loc[i_eval, s].isna().sum() == 0
                assert df.loc[i_mis, s].isna().sum() == len(df.loc[i_mis, s])

                if len(i_obs) <= 0 or len(i_mis) <= 0:
                    continue

                if s in numCols:
                    estimator = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=5, oob_score=False, n_jobs=-1, warm_start=False)
                else:
                    estimator = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=5, oob_score=False, n_jobs=-1, warm_start=False)

                estimator.fit(X_old.loc[i_obs, obsCols], X_old.loc[i_obs, s])
                X_new.loc[i_mis, s] = estimator.predict(X_old.loc[i_mis, obsCols])

                # NOTE(review): i_eval is not checked for emptiness before
                # predicting — confirm every partition has observed rows.
                eval_pred = estimator.predict(X_old.loc[i_eval, obsCols])

                if s in numCols:
                    # Squared change relative to the previous value, summed over the filled rows.
                    lNumNum += sum((X_new.loc[i_mis, s]-X_old.loc[i_mis, s])**2/(X_old.loc[i_mis, s])**2)
                    lNumDen += sum(X_old.loc[i_mis, s]**2)
                    evals[round][category][s]['impute'].append(mean_absolute_error(X_old.loc[i_eval, s], eval_pred))
                    evals[round][category][s]['base'].append(mean_absolute_error(X_old.loc[i_eval, s], [X_old.loc[i_obs, s].mean()]*len(X_old.loc[i_eval, s])))
                else:
                    # Counts the filled values that stayed the SAME as in the previous round.
                    lCatNum += sum(X_new.loc[i_mis, s]==X_old.loc[i_mis, s])
                    lCatDen += len(i_mis)
                    evals[round][category][s]['impute'].append(f1_score(X_old.loc[i_eval, s], eval_pred, average='macro'))
                    evals[round][category][s]['base'].append(f1_score(X_old.loc[i_eval, s], [X_old.loc[i_obs, s].mode()[0]]*len(X_old.loc[i_eval, s]), average='macro'))
            print(":")
        # The loop stops as soon as this product is positive.
        # NOTE(review): confirm this is the intended stopping rule.
        l = (lCatNum/lCatDen)*(lNumNum/lNumDen)
        round += 1
    return X_new, evals

In [None]:
# Impute the encoded training data with the target column removed, so the
# target cannot leak into the imputed features.
dfTrainEnc = pd.read_csv('encoded_train.csv', index_col='Unnamed: 0')
dfTrainEncodedLeakFree = dfTrainEnc.drop(columns=['idc_trgt'])

# Columns handled as categorical: date parts, one-hot dummies, the indexing
# feature and every boolean column. Everything else is numeric.
dateCols = pipeline.named_steps['EncodeDateTimeColumns'].columns
oneHotCols = pipeline.named_steps['EncodeLowCardinality'].columns
boolCols = dfTrainEnc.columns[dfTrainEnc.dtypes==np.dtype('bool')].tolist()
catCols = dateCols + oneHotCols + ['indexingFeature'] + boolCols
numCols = [column for column in dfTrainEncodedLeakFree if column not in catCols]

dfTrainImputed, evals = miss_forest(dfTrainEncodedLeakFree, startIndex=0, numCols=numCols, partitions=10, rounds=5)

# Discard columns that still contain missing values, then re-attach the target.
stillMissing = dfTrainImputed.columns[dfTrainImputed.isna().sum() > 0]
dfTrainImputed = dfTrainImputed.drop(columns=stillMissing)
dfTrainImputed = dfTrainImputed.join(dfTrainEnc.idc_trgt, how='inner')

In [None]:
# First-round imputation quality for numeric columns: mean absolute error of
# the forest versus the constant-mean baseline, averaged over partitions.
# Columns without any score (mean of an empty list is NaN) are plotted as 0.
plt.figure(figsize=(30, 5))
numScores = list(evals[0]['num'].values())
x = np.nan_to_num([np.mean(scores['impute']) for scores in numScores], nan=0)
y = np.nan_to_num([np.mean(scores['base']) for scores in numScores], nan=0)
# Log scale; 0.001 is added so that zero errors stay finite.
logImpute = np.log(x+0.001)
logBase = np.log(y+0.001)
plt.plot(logImpute, alpha=0.5, label='impute error')
plt.plot(logBase, alpha=0.5, label='base error')
plt.legend()
plt.show()

In [None]:
# First-round imputation quality for categorical columns. The stored values are
# macro F1 scores (higher is better) even though the legend says "error".
# Columns without any score (mean of an empty list is NaN) are plotted as 0.
plt.figure(figsize=(30, 5))
catScores = list(evals[0]['cat'].values())
x = np.nan_to_num([np.mean(scores['impute']) for scores in catScores], nan=0)
y = np.nan_to_num([np.mean(scores['base']) for scores in catScores], nan=0)
plt.plot(x, alpha=0.5, label='impute error')
plt.plot(y, alpha=0.5, label='base error')
plt.legend()
plt.show()

In [None]:
# Impute the test split on top of the training history: both encoded splits
# are stacked and imputation starts at the first test index.
dfTestEnc = pd.read_csv('encoded_test.csv', index_col='Unnamed: 0')
dfTrainEnc = pd.read_csv('encoded_train.csv', index_col='Unnamed: 0')
testIndex = dfTestEnc.index
startIndex = min(testIndex)

# The two splits must not share index labels, and stacking them must keep the order.
assert len(set(dfTrainEnc.index) & set(dfTestEnc.index)) == 0
combinedDf = pd.concat([dfTrainEnc, dfTestEnc]).drop(columns=['idc_trgt'])
assert combinedDf.index.is_monotonic_increasing

# Same categorical/numeric column assignment as for the training imputation.
dateCols = pipeline.named_steps['EncodeDateTimeColumns'].columns
oneHotCols = pipeline.named_steps['EncodeLowCardinality'].columns
boolCols = dfTrainEnc.columns[dfTrainEnc.dtypes==np.dtype('bool')].tolist()
catCols = dateCols + oneHotCols + ['indexingFeature'] + boolCols
numCols = [column for column in dfTrainEncodedLeakFree if column not in catCols]

dfTestImputed, testEvals = miss_forest(combinedDf, startIndex=startIndex, numCols=numCols, partitions=10, rounds=5)

# Keep only the test rows, re-attach the target and align columns with the training frame.
dfTestImputed = dfTestImputed.loc[testIndex]
dfTestImputed = dfTestImputed.join(dfTestEnc.idc_trgt, how='inner')
dfTestImputed = dfTestImputed[dfTrainImputed.columns]

# Model Development

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score, confusion_matrix
import xgboost as xgb
import shap

# NOTE(review): 'imputed_train.csv' is not written by any cell shown here, it is
# read without index_col, and this cell uses 'targetFeature' where earlier
# cells use 'idc_trgt' — confirm file, index and column names.
dfTrainImputed = pd.read_csv('imputed_train.csv')
dfTrainImputed = dfTrainImputed.join(dfTrainClean.indexingFeature, how='inner')

# The indexing feature is only needed to compute the fold boundaries.
tscv = time_series_split(dfTrainImputed, target_variable='indexingFeature', n_splits=100, gap=100)
dfTrainImputed = dfTrainImputed.drop(columns='indexingFeature')
dfTrainProcessedX = dfTrainImputed.drop('targetFeature', axis=1)
dfTrainProcessedY = dfTrainImputed['targetFeature']

model = xgb.XGBClassifier(max_depth=5, eval_metric='logloss', n_jobs=-1, nthread=4, random_state=0)

# Per-fold results: confusion-matrix cells, SHAP values and curve points.
tp, fp, fn, tn = [], [], [], []
shapValues = []
precisions, recalls = [], []
fprs, tprs = [], []

for i, (trainLeftIndex, (testLeftIndex, testRightIndex)) in enumerate(tscv):
    # Expanding window: train on everything up to the boundary, test on the next segment.
    trainRows = slice(None, trainLeftIndex+1)
    testRows = slice(testLeftIndex, testRightIndex+1)
    trainX = dfTrainProcessedX.iloc[trainRows]
    trainY = dfTrainProcessedY.iloc[trainRows]
    testX = dfTrainProcessedX.iloc[testRows]
    testY = dfTrainProcessedY.iloc[testRows]

    # Train and test must not share an indexing value, and train must come first.
    trainDates = trainX.join(dfTrainClean.indexingFeature, how='inner').indexingFeature
    testDates = testX.join(dfTrainClean.indexingFeature, how='inner').indexingFeature
    assert len(set(trainDates) & set(testDates)) == 0
    assert max(trainDates) < min(testDates)

    print(".", end="")
    # Folds with a single class on either side cannot be fitted or scored.
    if len(trainY.unique()) < 2 or len(testY.unique()) < 2:
        continue

    model.fit(trainX, trainY)
    y_pred = model.predict(testX)
    cm = confusion_matrix(testY, y_pred)
    tn.append(cm[0][0])
    fp.append(cm[0][1])
    fn.append(cm[1][0])
    tp.append(cm[1][1])

    # Probability of the last class, shared by both curves.
    positiveProba = model.predict_proba(testX).T[-1]
    precision, recall, thresholds = precision_recall_curve(testY, positiveProba)
    precisions.append(precision)
    recalls.append(recall)
    fpr, tpr, thresholds = roc_curve(testY, positiveProba)
    fprs.append(fpr)
    tprs.append(tpr)

    explainer = shap.Explainer(model)
    shap_values = shap.TreeExplainer(model).shap_values(testX)
    shapValues.append(shap_values)

In [None]:
from matplotlib.lines import Line2D

# One faint ROC curve (blue) and one faint precision-recall curve (red) per fold.
for foldFpr, foldTpr, foldRecall, foldPrecision in zip(fprs, tprs, recalls, precisions):
    plt.plot(foldFpr, foldTpr, color='blue', alpha=0.1)
    plt.plot(foldRecall, foldPrecision, color='red', alpha=0.1)
# The x axis carries recall (PR curve) or false positive rate (ROC curve);
# the label previously read "FPT".
plt.xlabel("Recall/FPR")
plt.ylabel("Precision/TPR")

# The curves carry no labels, so the legend is built from two proxy lines.
handles, labels = plt.gca().get_legend_handles_labels()
line1 = Line2D([0], [0], label='fpr/tpr', color='blue')
line2 = Line2D([0], [0], label='recall/precision', color='red')
handles.extend([line1, line2])

plt.legend(handles=handles)
plt.show()

In [None]:
# Mean curve with a one-standard-deviation band across folds. Curves are
# averaged by point position: the k-th point of every fold is combined, and
# shorter curves contribute NaN beyond their length.
# NOTE(review): folds generally have different thresholds per position —
# confirm that positional averaging is what is wanted.
fprFrame = pd.DataFrame(fprs)
tprFrame = pd.DataFrame(tprs)
tfMean = pd.merge(fprFrame.mean().to_frame('fpr'), tprFrame.mean().to_frame('tpr'), left_index=True, right_index=True).sort_values('fpr')
tfStd = pd.merge(fprFrame.std().to_frame('fpr'), tprFrame.std().to_frame('tpr'), left_index=True, right_index=True).reindex(tfMean.index)

tprLower = tfMean.tpr-tfStd.tpr
tprUpper = tfMean.tpr+tfStd.tpr
plt.plot(tfMean.fpr, tfMean.tpr, color='blue', alpha=1, label='TF curve')
plt.fill_between(tfMean.fpr, tprLower, tprUpper, color='blue', alpha=0.2)

recallFrame = pd.DataFrame(recalls)
precisionFrame = pd.DataFrame(precisions)
prMean = pd.merge(recallFrame.mean().to_frame('recall'), precisionFrame.mean().to_frame('precision'), left_index=True, right_index=True).sort_values('recall')
prStd = pd.merge(recallFrame.std().to_frame('recall'), precisionFrame.std().to_frame('precision'), left_index=True, right_index=True).reindex(prMean.index)

precisionLower = prMean.precision-prStd.precision
precisionUpper = prMean.precision+prStd.precision
plt.plot(prMean.recall, prMean.precision, color='red', alpha=1, label='PR curve')
plt.fill_between(prMean.recall, precisionLower, precisionUpper, color='red', alpha=0.2)

plt.title('Evaluation Curves')
plt.xlabel('Recall/FPR')
plt.ylabel('Precision/TPR')
plt.legend()
plt.show()

In [None]:
# Per-fold classification metrics derived from the confusion-matrix cells.
index = list(range(len(tp)))
foldCells = list(zip(tp, fp, fn, tn))
accuracy = [(tpK + tnK)/(tpK + tnK + fpK + fnK) for tpK, fpK, fnK, tnK in foldCells]
precision = [tpK/(tpK + fpK) for tpK, fpK, fnK, tnK in foldCells]
recall = [tpK/(tpK + fnK) for tpK, fpK, fnK, tnK in foldCells]
# Harmonic mean of the two lists above, fold by fold.
f1Score = [(2*p*r)/(p+r) for p, r in zip(precision, recall)]
specificity = [tnK/(tnK + fpK) for tpK, fpK, fnK, tnK in foldCells]

# One line per metric over the fold number.
plt.figure(figsize=(30, 15))
for metricValues, metricLabel in (
    (accuracy, 'accuracy'),
    (precision, 'precision'),
    (recall, 'recall'),
    (f1Score, 'f1Score'),
    (specificity, 'specificity'),
):
    plt.plot(index, metricValues, label=metricLabel)
plt.legend()
plt.show()

In [None]:
# Mean and standard deviation of every metric over the folds from position i on.
i = 0
for metricName, metricValues in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1Score),
    ("Specificity", specificity),
):
    print(f"{metricName}: mean{np.mean(metricValues[i:])} sd{np.std(metricValues[i:])}")