In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Analysis

This section contains the code to satisfy https://github.com/Quantyra/kaggle-ml-pipeline-icr-challenge/issues/5
It processes and cleans the features, prepares them for use in the model.

See https://www.kaggle.com/competitions/icr-identify-age-related-conditions/data for information about the dataset

In [1]:
def extract_classes(dataset):
    assert dataset['Id'].nunique() == dataset.shape[0]
    dataset.index = dataset['Id']
    dataset.drop(['Id'], axis='columns', inplace=True)
    if 'Class' in dataset.columns:
        y = dataset['Class']
        dataset.drop(['Class'], axis='columns', inplace=True)
        return y

In [2]:
# Some checks to ensure sanity of outputs after engineering
def data_integrity_checks(dataset, classes, initial_row_count):
    assert classes.name == 'Class'
    assert classes.shape[0] == dataset.shape[0]
    assert not dataset.isna().any().any() #No more nulls should be remaining as we converted them to zero
    assert dataset.shape[0] == initial_row_count

In [3]:
#Encode the categorical values, but put an error check in place for the test set since it doesn't have the second class in the sample data.
def encode_categories(dataset, column):
    if dataset[column].nunique() > 1:
        dataset[column] = pd.get_dummies(dataset[column], drop_first=True)
    else:
        dataset[column] = 0

In [4]:
#Mark rows that have null values with a binary flag, and reset null values to zeroes.
def mark_missing_observations(dataset):
    for column in dataset.columns:
        new_columns = dataset[column].apply(lambda x: 1 if pd.isna(x) else 0)
        new_columns.name = f"{column}_missing"
        dataset = pd.concat([dataset, new_columns], axis='columns')
        dataset[column] = dataset[column].fillna(0)
        
    return dataset

In [5]:
import pandas as pd

## Training set engineering

We clean the training set by extracting the classes column, encoding the EJ column as a binary categorical value, and marking those rows/columns that have missing data while replacing null values with zero. Then we do some data integrity checks to ensure we have no null values and sane outputs.

In [6]:
train_df = pd.read_csv('data/train.csv')
greeks_df = pd.read_csv('data/greeks.csv')
initial_row_count = train_df.shape[0]
y_train = extract_classes(train_df)
encode_categories(train_df, 'EJ')
train_df = mark_missing_observations(train_df)
train_df = train_df.astype(float)
data_integrity_checks(train_df, y_train, initial_row_count)
print('Summary Tables')
print('-----------')
print(train_df.info())
print('-----------')
print(train_df.describe())

Summary Tables
-----------
<class 'pandas.core.frame.DataFrame'>
Index: 617 entries, 000ff2bfdfe9 to ffcca4ded3bb
Columns: 112 entries, AB to GL_missing
dtypes: float64(112)
memory usage: 544.7+ KB
None
-----------
               AB            AF           AH          AM          AR  \
count  617.000000    617.000000   617.000000  617.000000  617.000000   
mean     0.477149   3502.013221   118.624513   38.968552   10.128242   
std      0.468388   2300.322717   127.838950   69.728226   10.518877   
min      0.081187    192.593280    85.200147    3.177522    8.138688   
25%      0.252107   2197.345480    85.200147   12.270314    8.138688   
50%      0.354659   3120.318960    85.200147   20.533110    8.138688   
75%      0.559763   4361.637390   113.739540   39.139886    8.138688   
max      6.161666  28688.187660  1910.123198  630.518230  178.943634   

               AX          AY          AZ           BC           BD   ...  \
count  617.000000  617.000000  617.000000   617.000000    6

In [7]:
initial_row_count = test_df.shape[0]
y_test = extract_classes(test_df)
encode_categories(test_df, 'EJ')
test_df = mark_missing_observations(test_df)
test_df = test_df.astype(float)
print("Summary Tables")
print("______________")
print(test_df.info())
print("______________")
print(test_df.describe())

NameError: name 'test_df' is not defined

## Correlation Matrix

The following heatmap shows cases of colinearity

In [8]:
corr = train_df.corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EJ,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL,AB_missing,AF_missing,AH_missing,AM_missing,AR_missing,AX_missing,AY_missing,AZ_missing,BC_missing,BD _missing,BN_missing,BP_missing,BQ_missing,BR_missing,BZ_missing,CB_missing,CC_missing,CD _missing,CF_missing,CH_missing,CL_missing,CR_missing,CS_missing,CU_missing,CW _missing,DA_missing,DE_missing,DF_missing,DH_missing,DI_missing,DL_missing,DN_missing,DU_missing,DV_missing,DY_missing,EB_missing,EE_missing,EG_missing,EH_missing,EJ_missing,EL_missing,EP_missing,EU_missing,FC_missing,FD _missing,FE_missing,FI_missing,FL_missing,FR_missing,FS_missing,GB_missing,GE_missing,GF_missing,GH_missing,GI_missing,GL_missing
AB,1.0,0.35,0.25,0.53,0.16,0.47,0.01,0.16,0.33,0.28,0.25,0.41,0.2,0.15,0.38,0.08,0.16,0.44,0.3,0.43,0.04,-0.01,0.16,-0.07,-0.11,-0.18,-0.05,0.3,-0.2,0.34,-0.14,0.2,0.06,0.04,0.02,0.28,0.06,0.12,0.07,0.06,-0.04,0.03,-0.03,0.24,0.07,0.04,0.0,0.17,0.02,0.03,0.32,0.0,-0.02,0.15,0.0,-0.06,,,,,,,,,,,,,0.03,,,-0.04,0.25,,,,,,,,,,,,,,,,0.02,,,,,,,,0.05,,,0.49,,,,0.02,,0.35,,,,,,0.02
AF,0.35,1.0,0.04,0.18,0.04,0.25,0.04,0.22,0.06,0.05,0.13,0.24,0.3,0.02,0.07,0.01,0.06,0.18,0.07,0.13,-0.03,-0.04,0.05,0.13,-0.12,-0.13,-0.11,0.08,-0.03,0.33,0.06,0.26,0.11,-0.02,-0.01,0.14,-0.05,0.1,0.12,0.12,0.1,-0.05,-0.0,0.05,0.09,0.08,0.02,0.15,0.02,-0.01,0.08,0.08,-0.16,0.19,0.14,-0.13,,,,,,,,,,,,,-0.03,,,-0.02,0.14,,,,,,,,,,,,,,,,0.04,,,,,,,,-0.01,,,0.27,,,,0.04,,0.24,,,,,,0.04
AH,0.25,0.04,1.0,0.13,0.75,0.04,0.06,0.1,0.37,0.21,0.06,0.22,0.12,0.01,0.44,0.08,0.16,0.13,0.16,0.68,0.69,-0.02,0.63,0.07,0.06,0.26,0.14,0.11,0.11,0.15,-0.05,-0.03,-0.03,0.75,0.0,0.71,0.18,0.03,-0.02,0.02,-0.06,0.68,-0.01,0.22,-0.01,-0.01,0.04,-0.02,-0.01,0.02,0.16,0.01,-0.03,-0.04,-0.07,0.03,,,,,,,,,,,,,0.0,,,-0.0,0.01,,,,,,,,,,,,,,,,-0.01,,,,,,,,-0.0,,,0.03,,,,-0.01,,0.02,,,,,,-0.01
AM,0.53,0.18,0.13,1.0,0.18,0.23,0.08,0.13,0.33,0.28,0.16,0.18,0.22,0.01,0.33,0.01,0.16,0.25,0.37,0.25,-0.03,-0.03,0.03,0.0,-0.03,-0.26,-0.06,0.32,-0.2,0.35,-0.09,0.06,0.04,-0.04,-0.04,0.08,-0.02,0.04,0.01,0.07,0.05,-0.04,-0.02,0.03,0.0,0.08,-0.04,0.19,-0.01,0.04,0.22,-0.01,-0.02,0.09,0.1,-0.08,,,,,,,,,,,,,0.04,,,-0.02,-0.0,,,,,,,,,,,,,,,,-0.02,,,,,,,,0.0,,,0.03,,,,-0.02,,0.02,,,,,,-0.02
AR,0.16,0.04,0.75,0.18,1.0,0.01,0.1,0.1,0.07,-0.0,0.05,0.18,0.18,0.01,0.14,0.01,-0.02,0.02,-0.0,0.66,0.75,-0.15,0.72,0.11,0.08,0.28,0.11,0.3,0.17,0.32,-0.01,-0.03,-0.01,0.82,-0.05,0.74,0.18,0.03,-0.01,0.02,-0.02,0.75,0.01,-0.01,0.0,0.03,0.02,0.08,-0.01,0.07,0.0,0.03,0.03,-0.12,0.07,0.03,,,,,,,,,,,,,-0.02,,,0.02,0.01,,,,,,,,,,,,,,,,-0.01,,,,,,,,-0.04,,,-0.01,,,,-0.01,,-0.01,,,,,,-0.01
AX,0.47,0.25,0.04,0.23,0.01,1.0,0.02,0.27,0.08,0.12,0.14,0.43,0.11,0.04,0.12,0.02,0.16,0.25,0.2,0.18,0.05,-0.06,0.09,0.08,-0.09,-0.03,0.04,0.02,-0.04,0.18,-0.02,0.18,-0.03,0.02,0.03,0.11,0.03,0.16,-0.03,-0.0,0.0,-0.03,-0.01,0.17,-0.03,0.01,0.11,0.02,-0.01,0.04,0.23,0.1,0.08,0.12,0.05,-0.0,,,,,,,,,,,,,0.18,,,-0.05,0.07,,,,,,,,,,,,,,,,0.02,,,,,,,,0.18,,,0.2,,,,0.02,,0.18,,,,,,0.02
AY,0.01,0.04,0.06,0.08,0.1,0.02,1.0,0.04,0.13,0.51,0.04,0.03,0.12,0.0,0.12,-0.01,0.01,0.03,0.02,0.15,0.06,-0.06,0.03,0.1,0.07,-0.05,-0.02,0.0,-0.0,0.14,0.01,-0.02,-0.01,0.06,0.0,0.08,-0.03,-0.01,-0.01,-0.06,0.05,0.08,0.0,-0.02,-0.01,-0.01,0.36,-0.03,-0.0,-0.01,0.03,0.0,-0.01,0.2,0.02,0.07,,,,,,,,,,,,,-0.02,,,0.01,0.0,,,,,,,,,,,,,,,,-0.0,,,,,,,,-0.02,,,-0.0,,,,-0.0,,-0.0,,,,,,-0.0
AZ,0.16,0.22,0.1,0.13,0.1,0.27,0.04,1.0,0.04,0.14,0.07,0.14,0.12,0.02,0.04,0.01,0.11,-0.0,0.1,0.17,0.12,-0.03,0.16,0.04,0.09,0.1,0.04,0.13,0.08,0.15,0.02,0.18,-0.03,0.09,0.05,0.12,0.06,0.12,0.03,-0.11,0.18,0.0,-0.03,0.13,0.03,-0.05,0.1,-0.05,-0.01,0.02,0.22,0.02,0.22,0.11,0.07,0.11,,,,,,,,,,,,,-0.11,,,-0.09,-0.08,,,,,,,,,,,,,,,,-0.01,,,,,,,,-0.16,,,-0.02,,,,-0.01,,-0.0,,,,,,-0.01
BC,0.33,0.06,0.37,0.33,0.07,0.08,0.13,0.04,1.0,0.75,0.08,0.15,0.11,0.02,0.91,0.0,0.49,0.09,0.55,0.1,0.05,-0.03,0.03,0.08,-0.02,-0.07,0.03,0.03,-0.12,0.08,0.0,0.02,-0.01,0.03,-0.02,0.18,-0.06,0.01,-0.01,-0.04,0.03,0.07,-0.0,0.03,-0.0,0.06,0.03,-0.02,0.03,0.02,0.42,0.05,-0.04,0.13,-0.02,0.04,,,,,,,,,,,,,-0.03,,,-0.0,-0.0,,,,,,,,,,,,,,,,-0.0,,,,,,,,-0.03,,,-0.0,,,,-0.0,,-0.0,,,,,,-0.0
BD,0.28,0.05,0.21,0.28,-0.0,0.12,0.51,0.14,0.75,1.0,0.15,0.19,0.11,-0.01,0.68,-0.03,0.39,0.05,0.44,0.1,-0.01,-0.03,0.05,0.1,0.04,-0.05,0.08,0.01,-0.04,0.07,0.01,0.04,-0.0,-0.04,-0.01,0.1,-0.0,0.04,-0.0,-0.03,0.04,-0.03,-0.03,0.04,-0.01,-0.0,0.26,-0.01,-0.01,0.02,0.44,-0.0,0.01,0.21,-0.01,0.02,,,,,,,,,,,,,-0.03,,,0.01,0.02,,,,,,,,,,,,,,,,-0.02,,,,,,,,-0.03,,,0.03,,,,-0.02,,0.02,,,,,,-0.02


In [9]:
y_train

Id
000ff2bfdfe9    1
007255e47698    0
013f2bd269f5    0
043ac50845d5    0
044fb8a146ec    1
               ..
fd3dafe738fd    0
fd895603f071    0
fd8ef6377f76    0
fe1942975e40    0
ffcca4ded3bb    0
Name: Class, Length: 617, dtype: int64

In [10]:
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing


X_train, X_test, y_trainer, y_test = train_test_split(train_df, y_train, random_state=1)
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
model = LogisticRegression(penalty='l2', max_iter=1000, random_state=1)
model.fit(X_scaled, y_trainer)

masker = shap.maskers.Independent(data=X_test)

explainer = shap.Explainer(
    model, masker=masker, feature_names=X_train.columns, algorithm="linear"
)

sv = explainer(X_test)

## Test dataset

Upon notebook submission, the test dataset will be replaced by the full competition test dataset. This section of the notebook prepares the test data for use in the model.

In [11]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:


# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_trainer)

In [13]:
print(model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


In [14]:

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [15]:


# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.48%


In [16]:
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train_df, y_train, random_state=1)
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
model = LogisticRegression(penalty='l2', max_iter=1000, random_state=1)
model.fit(X_scaled, y_train)

masker = shap.maskers.Independent(data=X_test)

explainer = shap.Explainer(
    model, masker=masker, feature_names=X_train.columns, algorithm="linear"
)

sv = explainer(X_test)

# Generate prediction of the odds of the classes
odds_predictions = model.predict_proba(X_test)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
odds_predictions = model.predict_proba(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 95.48%


X has feature names, but LogisticRegression was fitted without feature names


In [17]:
odds_predictions

array([[9.9985379e-01, 1.4622841e-04],
       [3.6983323e-01, 6.3016677e-01],
       [9.9985713e-01, 1.4284383e-04],
       [9.9886245e-01, 1.1375265e-03],
       [9.9922425e-01, 7.7572878e-04],
       [9.9968702e-01, 3.1295637e-04],
       [9.9839336e-01, 1.6066603e-03],
       [9.9895364e-01, 1.0463863e-03],
       [9.4233704e-01, 5.7662953e-02],
       [9.9944210e-01, 5.5792578e-04],
       [3.4949481e-02, 9.6505052e-01],
       [7.6899767e-02, 9.2310023e-01],
       [9.9891490e-01, 1.0850752e-03],
       [9.9904203e-01, 9.5799007e-04],
       [9.9787992e-01, 2.1200578e-03],
       [9.9993759e-01, 6.2404048e-05],
       [9.0657890e-02, 9.0934211e-01],
       [4.1674167e-01, 5.8325833e-01],
       [9.8523921e-01, 1.4760791e-02],
       [9.8460943e-01, 1.5390584e-02],
       [7.7960491e-03, 9.9220395e-01],
       [9.9338704e-01, 6.6129644e-03],
       [1.9600630e-02, 9.8039937e-01],
       [9.9882501e-01, 1.1749599e-03],
       [9.9935150e-01, 6.4850057e-04],
       [9.9360645e-01, 6.