DLMI Data challenge
=================



Define imports

In [2]:
import datetime

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier

## Parameters

In [2]:
LR = 0.1

## Data preparation

In [3]:
OH_enc = OneHotEncoder(handle_unknown='ignore')

def DOB_parser(dob):
    try:
        dtob = datetime.datetime.strptime(dob, '%m/%d/%Y')
    except ValueError:
        dtob = datetime.datetime.strptime(dob, '%d-%m-%Y')
    return (datetime.date(2020, 5, 1) - dtob.date()).days

def df_dob_parser(df):
    return df.applymap(DOB_parser)

def df_to_upper(df):
    return df.applymap(str.upper)

dob_transformer = FunctionTransformer(func=df_dob_parser, validate=False)
upper_transformer = FunctionTransformer(func=df_to_upper, validate=False)

gender_prep = Pipeline(steps=[('uppercase', upper_transformer),
                              ('one hot encoder', OH_enc)])

prep = ColumnTransformer(transformers=[('age', dob_transformer, ['DOB']),
                                       ('gender', gender_prep, ['GENDER']),
                                       ('float', 'passthrough', ['LYMPH_COUNT'])])

## Model

In [4]:
model = XGBClassifier(n_estimators=10000, learning_rate=LR, random_state=0)

## Import data

In [3]:
dataset = pd.read_csv("data/clinical_annotation.csv", index_col=0)
print(dataset.head())
# print(dataset.isna().any())

      LABEL GENDER        DOB  LYMPH_COUNT
ID                                        
P26       1      M  11/3/1933         11.2
P183      1      M  5/15/1942         12.8
P89       1      M  6/19/1935          9.6
P123      1      M  1/27/1931        122.6
P61       1      F   3/5/1931         11.6


In [6]:
dataset.GENDER.unique()

array(['M', 'F', 'f'], dtype=object)

In [7]:
# Separate test data
dataset_test = dataset[dataset.LABEL == -1]
dataset_tv = dataset[dataset.LABEL != -1]

# Define variables and target
ID_test = dataset_test.ID
y_test = dataset_test.LABEL
X_test = prep.fit_transform(dataset_test.drop(columns=['LABEL', 'ID']))
y_tv = dataset_tv.LABEL
X_tv = prep.fit_transform(dataset_tv.drop(columns=['LABEL', 'ID']))

#Split train and val sets
X_train, X_valid, y_train, y_valid = train_test_split(X_tv, y_tv, train_size=0.8, test_size=0.2, random_state=0)

## Train

In [8]:
model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_valid, y_valid)])
y_valid_pred = model.predict(X_valid)
acc = balanced_accuracy_score(y_valid, y_valid_pred)
print("acc =", acc)

[0]	validation_0-logloss:0.62382
[1]	validation_0-logloss:0.56477
[2]	validation_0-logloss:0.51532
[3]	validation_0-logloss:0.47156
[4]	validation_0-logloss:0.43580
[5]	validation_0-logloss:0.40518
[6]	validation_0-logloss:0.37614
[7]	validation_0-logloss:0.35235
[8]	validation_0-logloss:0.32928
[9]	validation_0-logloss:0.31116
[10]	validation_0-logloss:0.29369
[11]	validation_0-logloss:0.27728
[12]	validation_0-logloss:0.26285




[13]	validation_0-logloss:0.25102
[14]	validation_0-logloss:0.24134
[15]	validation_0-logloss:0.23231
[16]	validation_0-logloss:0.22162
[17]	validation_0-logloss:0.21281
[18]	validation_0-logloss:0.20641
[19]	validation_0-logloss:0.20129
[20]	validation_0-logloss:0.19612
[21]	validation_0-logloss:0.19207
[22]	validation_0-logloss:0.18618
[23]	validation_0-logloss:0.18083
[24]	validation_0-logloss:0.17812
[25]	validation_0-logloss:0.17317
[26]	validation_0-logloss:0.17093
[27]	validation_0-logloss:0.16678
[28]	validation_0-logloss:0.16387
[29]	validation_0-logloss:0.16197
[30]	validation_0-logloss:0.15957
[31]	validation_0-logloss:0.15824
[32]	validation_0-logloss:0.15414
[33]	validation_0-logloss:0.15107
[34]	validation_0-logloss:0.14871
[35]	validation_0-logloss:0.14698
[36]	validation_0-logloss:0.14336
[37]	validation_0-logloss:0.14095
[38]	validation_0-logloss:0.13860
[39]	validation_0-logloss:0.13728
[40]	validation_0-logloss:0.13547
[41]	validation_0-logloss:0.13256
[42]	validatio

## Test

In [9]:
model_full = XGBClassifier(n_estimators=20, learning_rate=LR, random_state=0)
model_full.fit(X_tv, y_tv)
y_test_preds = model_full.predict(X_test)
print(y_test_preds)
output = pd.DataFrame({'Id': ID_test,
                       'Predicted': y_test_preds})
output.to_csv('submission.csv', index=False)
print(output)

[0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0
 1 0 1 1 1]
       Id  Predicted
6     P71          0
9     P16          1
12   P114          1
15   P170          1
18    P98          0
23    P69          0
29    P92          1
30   P132          1
31    P81          1
41    P73          1
46   P143          1
50   P175          1
62    P56          0
67   P139          1
72   P152          0
73   P203          0
75    P75          1
84     P9          1
93    P24          0
94     P4          0
95    P32          1
98   P120          1
101  P138          1
110  P172          1
120   P57          0
122  P195          1
126   P68          0
128  P133          1
130   P14          1
134  P119          1
138    P7          0
145   P49          1
164   P93          1
166  P178          0
177   P58          0
178  P108          1
183  P197          0
191  P196          1
193   P86          0
194   P18          1
196  P188          1
198  P148          1
