In [9]:
# Plotting and data-handling stack used throughout the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
import glob

# Display configuration: seaborn dark-grid theme, wide/long DataFrame output,
# wide text columns, and fixed 4-decimal float formatting.
sns.set(style="darkgrid")
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
pd.set_option('max_colwidth', 800)
pd.options.display.float_format = '{:.4f}'.format

import random
import re
import pickle
import warnings
# NOTE(review): this silences *every* warning for the rest of the session
# (including pandas/sklearn deprecation and SettingWithCopy warnings), not
# just known-noisy ones.
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

## Pipeline Description

* This is only the submission for the test dataset pipeline. There are separate files for EDA and training. I will provide them if requested.
* The models treat this multi-label problem as multiple separate binary classification problems.
* The submission comes from an ensemble of models built with LightGBM and CatBoost. I've tried many other binary classification algorithms, and only LightGBM and CatBoost have performed well so far. I also tried to model the label associations by building a second-layer model that stacks all the binary classifiers' outputs, but ensembling the model predictions by weighted average outperforms stacking.

In [2]:
# Read the raw train and test tables, then display their (rows, columns) shapes.
train_raw, test_raw = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
train_raw.shape, test_raw.shape

((2989, 365), (586, 337))

In [3]:
#fix outliers and errors
# 'Age_of_onsets' holds text because of a stray '2, 3' entry. The replacement
# must stay a *string*: an integer written here would be turned into NaN by the
# `.str.strip()` on the next line (the .str accessor yields NaN for non-string
# elements) and would then be filled with 9 instead of 2.
train_raw.loc[train_raw['Age_of_onsets'].str.strip()=='2, 3','Age_of_onsets'] = '2'
# Missing ages are filled with 9 before the cast to a compact integer dtype.
train_raw['Age_of_onsets'] = train_raw['Age_of_onsets'].str.strip().fillna(9).astype(np.int16)

# Some multi-value 'Treatment_of_rhinitis' entries use '.' instead of ',' as the
# separator; normalise them so only the separator changes.
train_raw.loc[train_raw['Treatment_of_rhinitis']=='1.3','Treatment_of_rhinitis'] = '1,3'
train_raw.loc[train_raw['Treatment_of_rhinitis']=='2.3','Treatment_of_rhinitis'] = '2,3'
# Same repair for the test set: '2.3' becomes '2,3', exactly as for train
# (mapping it to '1,3' would silently change the first treatment code).
test_raw.loc[test_raw['Treatment_of_rhinitis']=='2.3','Treatment_of_rhinitis'] = '2,3'

# Force a uniform string dtype (missing values become the text 'nan') so the
# column can be processed with the .str accessor later on.
train_raw['Treatment_of_rhinitis'] = train_raw['Treatment_of_rhinitis'].astype(str)
test_raw['Treatment_of_rhinitis'] = test_raw['Treatment_of_rhinitis'].astype(str)

In [4]:
#we have many target columns, here are the definitions
# Targets for which no model is loaded; the submission fills them with 0.
invalid_target_cols = [
    'Type_of_Food_Allergy_Other',
    'Type_of_Food_Allergy_Cereals_&_Seeds',
]

# High-level targets (overall / severity / per-category flags).
general_target_cols = [
    'Allergy_Present',
    'Severe_Allergy',
    'Respiratory_Allergy',
    'Food_Allergy',
    'Venom_Allergy',
]

# Sub-type targets, grouped by their parent category.
resp_target_cols = [
    'Type_of_Respiratory_Allergy_ARIA',
    'Type_of_Respiratory_Allergy_CONJ',
    'Type_of_Respiratory_Allergy_GINA',
    'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
    'Type_of_Respiratory_Allergy_IGE_Pollen_Herb',
    'Type_of_Respiratory_Allergy_IGE_Pollen_Tree',
    'Type_of_Respiratory_Allergy_IGE_Dander_Animals',
    'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach',
    'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
]
food_target_cols = [
    'Type_of_Food_Allergy_Aromatics',
    'Type_of_Food_Allergy_Egg',
    'Type_of_Food_Allergy_Fish',
    'Type_of_Food_Allergy_Fruits_and_Vegetables',
    'Type_of_Food_Allergy_Mammalian_Milk',
    'Type_of_Food_Allergy_Oral_Syndrom',
    'Type_of_Food_Allergy_Other_Legumes',
    'Type_of_Food_Allergy_Peanut',
    'Type_of_Food_Allergy_Shellfish',
    'Type_of_Food_Allergy_TPO',
    'Type_of_Food_Allergy_Tree_Nuts',
]
venom_target_cols = [
    'Type_of_Venom_Allergy_ATCD_Venom',
    'Type_of_Venom_Allergy_IGE_Venom',
]

# Modelled targets in their canonical order: general flags first, then the
# respiratory, food and venom sub-types.
valid_target_cols = general_target_cols + resp_target_cols + food_target_cols + venom_target_cols
# Full set of submission target columns (unmodelled ones first).
target_cols = invalid_target_cols + valid_target_cols
len(valid_target_cols), len(invalid_target_cols)

(27, 2)

In [5]:
def process_cat_cols(df):
    """Binarize the multi-valued categorical columns and drop identifier columns.

    Each of the four multi-valued columns holds delimiter-separated codes
    (',' or '.', with optional spaces). Every such column is replaced by sparse
    indicator columns named ``<column>_<code>``. The two French geography
    columns are cast to ``category`` dtype, and the identifier/chip columns are
    dropped when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four multi-valued columns as strings and the two
        French geography columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns; the input frame is not modified.
    """
    dropped_cols=['trustii_id','Patient_ID','Chip_Code','Chip_Type','Chip_Image_Name','Food_Type_0']
    multi_value_cols=['Treatment_of_atopic_dematitis','Treatment_of_athsma','Treatment_of_rhinitis','General_cofactors']

    mlb = MultiLabelBinarizer(sparse_output=True)
    for col in multi_value_cols:
        # regex=False makes the replacements literal: '.' is a regex wildcard,
        # and the default of `regex` differs between pandas versions.
        values = (df[col].str.replace('.', ',', regex=False)
                         .str.replace(' ', '', regex=False)
                         .str.split(','))
        # The binarizer is refit per column, so `classes_` always describes the
        # column currently being encoded.
        indicators = pd.DataFrame.sparse.from_spmatrix(
                        mlb.fit_transform(values),
                        index=values.index,
                        columns=[f'{col}_{x}' for x in mlb.classes_])
        df = df.join(indicators).drop(col,axis=1)

    df['French_Residence_Department'] = df['French_Residence_Department'].astype('category')
    df['French_Region'] = df['French_Region'].astype('category')
    # errors='ignore': not every identifier column exists in every input frame.
    return df.drop(dropped_cols,axis=1,errors='ignore')

In [6]:
# Flag the origin of each row so train and test can be encoded together
# (identical indicator columns) and split apart again afterwards.
train_raw['test'] = 0
test_raw['test'] = 1

data = process_cat_cols(pd.concat([train_raw,test_raw]).reset_index(drop=True))
assert len(train_raw)+len(test_raw) == len(data)

# Column groups: generated indicator columns, demographic/clinical columns, and
# everything else (the lab test readings, called "signals" here).
binary_cols = [x for x in data.columns if x.find('Treatment_of')==0 or x.find('General_cofactors')==0]
non_signal_cols=['Age', 'Gender', 'Blood_Month_sample', 'French_Residence_Department', 'French_Region', 'Rural_or_urban_area', 'Sensitization', 'Age_of_onsets', 'Skin_Symptoms']
signals = [x for x in data.columns if x not in binary_cols+non_signal_cols+target_cols+['test']]

#total 318 test signals. there are only two types of test results. 
#one has 300 readings(ALEX), the other has 112 readings (ISAC).
data['test_type'] = (data[signals].notna().sum(axis=1)==300).astype(np.int8)
# Number of signal readings that are exactly zero for each row.
data['test_zero_num'] = (data[signals]==0).sum(axis=1).values
# 1 when all three treatment columns carry the '0' code.
data['no_treatment'] = (data[['Treatment_of_atopic_dematitis_0','Treatment_of_athsma_0','Treatment_of_rhinitis_0']].sum(axis=1)==3).astype(np.int8)

# Explicit copies: `train` is modified below, and writing through a filtered
# selection of `data` is the classic SettingWithCopy pitfall.
train = data[data['test']==0].copy()
test = data[data['test']==1].copy()

#when Allergy_Present is zero, Respiratory_Allergy and Food_Allergy should be zero(some samples are 9)
train.loc[train['Allergy_Present']==0,'Respiratory_Allergy'] = 0
train.loc[train['Allergy_Present']==0,'Food_Allergy'] = 0

## Tune for F1 metric

* The models will have probability predictions for each target.

* We need to predict yes/no labels. Therefore, thresholds optimized for F1 score are needed. The idea of optimizing threshold is based on quantile. 

* First, we assume the targets in the test and train sets have the same distribution. For instance, if 5% of the people in the train set have a milk allergy, the same rate is assumed to hold in the test set.

* Models are tuned for AUC, which is basically the order of predictions. Suppose the target rate is 10%, then for a perfect classifier, if we pick the top 10% quantile as the threshold, we will have a perfect F1 of 1.

* For an imperfect classifier, we search for a multiplier that scales this quantile-based threshold and gives the best F1 score on the out-of-fold predictions. The multiplier can be smaller than 1, which lowers the threshold and increases recall by labelling more positives. It can also be greater than 1, which raises the threshold and increases precision by labelling fewer positives. The search finds the multiplier that best balances recall and precision to achieve the highest F1.

In [7]:
#find the best multiplier to get the highest f1 score
#deviation: the test target distribution may not be same as the train
def find_best_multiplier(y_true, y_pred, deviation=1):
    """Grid-search the threshold multiplier that maximises F1.

    The base threshold is the quantile of ``y_pred`` at level
    ``1 - mean(y_true) * deviation``. Each candidate multiplier in
    ``np.arange(0.6, 1.4, 0.01)`` scales that threshold, and predictions
    strictly above the scaled threshold are labelled positive.

    Parameters
    ----------
    y_true : numpy.ndarray
        Ground-truth labels. When it has more than one dimension the F1 score
        is macro-averaged.
    y_pred : numpy.ndarray
        Predicted scores matching ``y_true``.
    deviation : float, default 1
        Factor applied to the observed positive rate before it is turned into
        a quantile level.

    Returns
    -------
    tuple
        ``(best_multiplier, best_score)``. Both are 0 when no candidate
        achieves an F1 score above 0.
    """
    # The quantile does not depend on the candidate multiplier, so compute it
    # once. The level is clipped to [0, 1]: with deviation > 1 and a frequent
    # target, 1 - rate * deviation can turn negative, which np.quantile rejects.
    quantile_level = np.clip(1-np.mean(y_true)*deviation, 0.0, 1.0)
    base_threshold = np.quantile(y_pred, quantile_level)
    f1_kwargs = {'average': 'macro'} if y_true.ndim>1 else {}

    best_score = 0
    best_multiplier = 0
    for multiplier in np.arange(0.6,1.4,0.01):
        score = f1_score(y_true, y_pred > base_threshold*multiplier, **f1_kwargs)
        # Strict comparison: on ties the smallest multiplier is kept.
        if score>best_score:
            best_score=score
            best_multiplier=multiplier
    return best_multiplier, best_score

def test_best_multiplier(y_true, y_pred, deviation_mean=1, deviation_std= 0.04, test_num=50, seed=None):
    """Average the best multiplier and F1 score over randomly perturbed target rates.

    Draws ``test_num`` deviation factors from a normal distribution, runs
    ``find_best_multiplier`` once per draw, and averages the results.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray
        Passed through unchanged to ``find_best_multiplier``.
    deviation_mean, deviation_std : float
        Mean and standard deviation of the normal distribution the deviation
        factors are drawn from.
    test_num : int, default 50
        Number of random deviations to evaluate.
    seed : int or None, default None
        When given, the draws come from a dedicated ``RandomState(seed)`` so
        the result is reproducible. When None, numpy's global random state is
        used.

    Returns
    -------
    tuple
        ``(mean_multiplier, mean_score)`` over all draws.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    deviations = rng.normal(deviation_mean,deviation_std,test_num)
    results = [find_best_multiplier(y_true,y_pred,deviation) for deviation in deviations]
    multipliers = [multiplier for multiplier, _ in results]
    scores = [score for _, score in results]
    return np.mean(multipliers), np.mean(scores)

In [10]:
# Ensemble members (one directory of per-target pickles each) and the weight
# of each member in the blended prediction.
ensembled_models = ['lgb_auc_762','lgb_auc_760','cat_auc_758','lgb_auc_optuna_754']
ensembled_weights = [0.6,0.2,0.1,0.1,]

# When True, the multiplier is averaged over randomly perturbed target rates.
RANDOM_NOISE = False
rows = {}

for target_col in valid_target_cols:
    # Collect every member's out-of-fold predictions for this target.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    oofs = []
    for ensemble in ensembled_models:
        file = f"{ensemble}/{target_col.replace('Type_of_','')}.pkl"
        with open(file, 'rb') as handle:
            saved_models = pickle.load(handle)
        assert saved_models['target_col'] == target_col
        oofs.append(saved_models['oof'])

    # Weighted blend of the members' out-of-fold predictions.
    ensemble_oof = np.average(np.array(oofs),axis=0, weights=ensembled_weights)
    # Only rows whose label is a clean 0/1 are used as ground truth.
    target_values = train.loc[train[target_col].isin([0,1]), target_col].values

    # Search the multiplier, optionally under a perturbed target rate.
    _multiplier, _score = (
        test_best_multiplier(target_values,ensemble_oof, deviation_mean=1.03)
        if RANDOM_NOISE
        else find_best_multiplier(target_values,ensemble_oof,1)
    )
    rows[target_col] = {'multiplier': _multiplier, 'score': _score}

# One row per target; report the mean multiplier per target group.
best_multipliers = pd.DataFrame.from_dict(rows,orient='index')
print(f"Food:{best_multipliers.loc[food_target_cols,'multiplier'].mean():.2f}")
print(f"Resp:{best_multipliers.loc[resp_target_cols,'multiplier'].mean():.2f}")
print(f"Venom:{best_multipliers.loc[venom_target_cols,'multiplier'].mean():.2f}")
print(f"General:{best_multipliers.loc[general_target_cols,'multiplier'].mean():.2f}")
print(f"Score mean:{best_multipliers.loc[:,'score'].mean():.2f}")

Food:0.85
Resp:0.86
Venom:0.91
General:0.89
Score mean:0.62


## Generate Final Submission and Post-processing

* First, we average the ensembled models' probability predictions based on weights.

* Then, we convert the probabilities to the final labels using the multipliers found above.
* Last, we post-process the labels based on the rules found in EDA. Some rules are straightforward, e.g. if Severe_Allergy is true, then Allergy_Present is true. Others hold with high confidence, e.g. if Type_of_Respiratory_Allergy_CONJ is true, then Type_of_Respiratory_Allergy_ARIA is very likely true.


In [11]:
# Start from the test ids; targets without a model are submitted as 0.
submission = test_raw[['trustii_id']].assign(**{col: 0 for col in invalid_target_cols})

# Threshold multiplier per target group; targets outside these groups use
# the fallback value.
group_multipliers = [
    (food_target_cols, 0.85),
    (resp_target_cols, 0.86),
    (venom_target_cols, 0.91),
]
fallback_multiplier = 0.89
# Category-level predictions are stored under a '_pred' suffix so that the
# post-processing step can combine them with the sub-type predictions.
pred_suffix_cols = ['Food_Allergy','Respiratory_Allergy','Venom_Allergy','Allergy_Present']

for target_col in valid_target_cols:
    # Gather every ensemble member's test predictions for this target.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    ensemble_pred = []
    for ensemble in ensembled_models:
        file = f"{ensemble}/{target_col.replace('Type_of_','')}.pkl"
        with open(file, 'rb') as handle:
            saved_models = pickle.load(handle)
        assert saved_models['target_col'] == target_col
        ensemble_pred.append(saved_models['test_pred'])

    # Base threshold: the quantile of the blended prediction that matches the
    # positive rate observed among clean 0/1 train labels.
    target_values = train.loc[train[target_col].isin([0,1]), target_col].values
    final_pred = np.average(np.array(ensemble_pred),axis=0, weights=ensembled_weights)
    threshold = np.quantile(final_pred,1-np.mean(target_values))

    multiplier = next(
        (value for cols, value in group_multipliers if target_col in cols),
        fallback_multiplier,
    )

    output_col = f"{target_col}_pred" if target_col in pred_suffix_cols else target_col
    submission[output_col] = (final_pred>=threshold*multiplier).astype(np.int8)

In [12]:
# Enables the lower-confidence overrides at the end of this cell.
OVERRIDE = True

# A predicted sub-type implies its parent category.
submission.loc[submission[food_target_cols].sum(axis=1)>0,'Food_Allergy']=1
submission.loc[submission[resp_target_cols].sum(axis=1)>0,'Respiratory_Allergy']=1
submission.loc[submission[venom_target_cols].sum(axis=1)>0,'Venom_Allergy']=1
#the masked assignments above create new columns with NaN elsewhere; default those to 0
submission = submission.fillna(0)

# The direct category-level predictions can switch a category on, never off.
submission.loc[submission['Food_Allergy_pred']==1,'Food_Allergy']=1
submission.loc[submission['Respiratory_Allergy_pred']==1,'Respiratory_Allergy']=1
submission.loc[submission['Venom_Allergy_pred']==1,'Venom_Allergy']=1
# Any positive category implies an allergy is present.
submission.loc[submission[['Respiratory_Allergy','Food_Allergy','Venom_Allergy']].sum(axis=1)>0,'Allergy_Present']=1
#'Allergy_Present' was just created the same way; default the remaining rows to 0
submission = submission.fillna(0)

# Keep the rule-derived value so the effect of the general model can be reported.
submission['Allergy_Present_before'] = submission['Allergy_Present']
# Rows the general model flags although no category is set: mark both food and
# respiratory. The mask is computed once; 'Allergy_Present' itself is only
# updated afterwards.
general_only = (submission['Allergy_Present_pred']==1) & (submission['Allergy_Present']==0)
submission.loc[general_only,'Food_Allergy']=1
submission.loc[general_only,'Respiratory_Allergy']=1
submission.loc[submission['Allergy_Present_pred']==1,'Allergy_Present']=1
print(f"add general prediction: {submission[(submission['Allergy_Present_before']==0) & (submission['Allergy_Present']==1)].index.tolist()}")

#fix severe allergy
submission.loc[submission['Allergy_Present']==0,'Severe_Allergy']=0
#confidence==1 from association analysis
submission.loc[submission['Type_of_Respiratory_Allergy_ARIA']==1,'Severe_Allergy']=1

if OVERRIDE:
    #if venom allergy, set all sub types to be true
    # The mask must be evaluated once, before either sub-type is written:
    # re-evaluating it after the first assignment would no longer match any row
    # (the sub-type sum is then > 0) and the second sub-type would never be set.
    venom_without_subtype = (submission[venom_target_cols].sum(axis=1)==0) & (submission['Venom_Allergy']==1)
    submission.loc[venom_without_subtype,'Type_of_Venom_Allergy_ATCD_Venom']=1
    submission.loc[venom_without_subtype,'Type_of_Venom_Allergy_IGE_Venom']=1
    submission.loc[submission['Type_of_Respiratory_Allergy_CONJ']==1,'Severe_Allergy']=1 #confidence:0.95

# Number of rows whose category is positive although no sub-type is set.
print(submission[(submission[food_target_cols].sum(axis=1)==0) & (submission['Food_Allergy']==1)].shape[0])
print(submission[(submission[resp_target_cols].sum(axis=1)==0) & (submission['Respiratory_Allergy']==1)].shape[0])
print(submission[(submission[venom_target_cols].sum(axis=1)==0) & (submission['Venom_Allergy']==1)].shape[0])

add general prediction: []
105
0
0


In [13]:
# Consistency checks on the label hierarchy before the file is written.
# 1) A positive sub-type requires its parent category to be positive.
for subtype_cols, parent_col in [
    (food_target_cols, 'Food_Allergy'),
    (resp_target_cols, 'Respiratory_Allergy'),
    (venom_target_cols, 'Venom_Allergy'),
]:
    has_subtype = submission[subtype_cols].sum(axis=1)>0
    assert (submission.loc[has_subtype, parent_col]==1).all()

category_cols = ['Respiratory_Allergy','Food_Allergy','Venom_Allergy']
# 2) A positive category requires Allergy_Present.
has_category = submission[category_cols].sum(axis=1)>0
assert (submission.loc[has_category, 'Allergy_Present']==1).all()
# 3) A severe allergy requires Allergy_Present.
assert (submission.loc[submission['Severe_Allergy']==1, 'Allergy_Present']==1).all()
# 4) Every Allergy_Present row has at least one positive category.
assert (submission.loc[submission['Allergy_Present']==1, category_cols]==1).any(axis=1).all()

# Write only the id and the official target columns (helper columns are left out).
submission[['trustii_id']+target_cols].to_csv('submission.csv',index=False)
submission.tail()

Unnamed: 0,trustii_id,Type_of_Food_Allergy_Other,Type_of_Food_Allergy_Cereals_&_Seeds,Allergy_Present_pred,Severe_Allergy,Respiratory_Allergy_pred,Food_Allergy_pred,Venom_Allergy_pred,Type_of_Respiratory_Allergy_ARIA,Type_of_Respiratory_Allergy_CONJ,Type_of_Respiratory_Allergy_GINA,Type_of_Respiratory_Allergy_IGE_Pollen_Gram,Type_of_Respiratory_Allergy_IGE_Pollen_Herb,Type_of_Respiratory_Allergy_IGE_Pollen_Tree,Type_of_Respiratory_Allergy_IGE_Dander_Animals,Type_of_Respiratory_Allergy_IGE_Mite_Cockroach,Type_of_Respiratory_Allergy_IGE_Molds_Yeast,Type_of_Food_Allergy_Aromatics,Type_of_Food_Allergy_Egg,Type_of_Food_Allergy_Fish,Type_of_Food_Allergy_Fruits_and_Vegetables,Type_of_Food_Allergy_Mammalian_Milk,Type_of_Food_Allergy_Oral_Syndrom,Type_of_Food_Allergy_Other_Legumes,Type_of_Food_Allergy_Peanut,Type_of_Food_Allergy_Shellfish,Type_of_Food_Allergy_TPO,Type_of_Food_Allergy_Tree_Nuts,Type_of_Venom_Allergy_ATCD_Venom,Type_of_Venom_Allergy_IGE_Venom,Food_Allergy,Respiratory_Allergy,Venom_Allergy,Allergy_Present,Allergy_Present_before
581,1276,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,1.0,1.0
582,1277,0,0,1,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,0.0,1.0,1.0
583,1280,0,0,1,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,0.0,1.0,1.0
584,1281,0,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,1.0,1.0
585,1282,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.0,1.0,0.0,1.0,1.0


In [14]:
# Number of test rows labelled positive for each high-level category.
positive_counts = {
    col: int((submission[col]==1).sum())
    for col in ['Allergy_Present','Food_Allergy','Respiratory_Allergy','Venom_Allergy']
}
print(f"All:{positive_counts['Allergy_Present']}, Food:{positive_counts['Food_Allergy']}")
print(f"Resp:{positive_counts['Respiratory_Allergy']}, Venom:{positive_counts['Venom_Allergy']}")

All:512, Food:292
Resp:482, Venom:12
