In [1]:
# data stuff:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer  # class to help make dummy variables
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PowerTransformer
from feature_engine.imputation import EndTailImputer
from helper_functions import *

# plotting stuff:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image
import matplotlib.pyplot as plt

# silence!
import warnings
warnings.filterwarnings("ignore")


---

## Import data

In [2]:
# Read the raw covid/flu dataset from the local data directory, then display it.
covid_flu = pd.read_csv('./data/covid_flu.csv')
covid_flu

Unnamed: 0,Diagnosis,InitialPCRDiagnosis,Age,Sex,neutrophil,serumLevelsOfWhiteBloodCell,lymphocytes,CReactiveProteinLevels,DurationOfIllness,CTscanResults,RiskFactors,GroundGlassOpacity,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Temperature,Fatigue
0,H1N1,,67.00,F,,,,,,,,,,Yes,Yes,,No,,38.111111,No
1,H1N1,,29.00,M,,,,,,,,,,,,,,,,
2,H1N1,,22.00,F,,,,,,,,,,,,,,,,
3,H1N1,,20.00,F,,,,,,,immuno,,,Yes,Yes,,No,,36.555556,Yes
4,H1N1,,21.00,M,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,COVID19,,1.40,,,9.2,,9.4,,,,,,,,,,,,
1478,COVID19,,0.83,,,14.8,,0.9,,,,,,,,,,,,
1479,COVID19,,3.00,,,15.0,,0.7,,,,,,,,,,,,
1480,COVID19,,4.00,,,6.6,,0.2,,,,,,,,,,,,


---

## Feature Improvement:
* impute missing data for numerical variables

### Write a function to take care of the processing (improvement transformations):

In [3]:
def tweak_covid_flu(df):
    """Clean the raw covid/flu frame and sort its columns into feature groups.

    Transformations applied (the input frame is not modified):
      * 'Yes'/'No' strings become True/False across the whole frame;
      * ``is_female`` is derived from ``Sex`` ('F' -> True, 'M' or missing -> False)
        and ``Sex`` is dropped;
      * the ' mg/dL' unit is stripped from ``CReactiveProteinLevels``, which is cast to float;
      * ``flu_symptoms`` is True when at least two of the flu indicator columns are True;
      * ``Age`` is shifted up by 0.1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Sex', 'Age', a string-valued 'CReactiveProteinLevels'
        and the six flu indicator columns listed below.

    Returns
    -------
    df_transformed : pandas.DataFrame
        The cleaned frame.
    numerical_feats : list of str
        Columns whose dtype is neither object nor bool.
    binary_feats : list of str
        Columns (other than the 'Diagnosis' target) with exactly two distinct
        non-null values. Note this is a value-count test, so a numeric column
        that happens to hold only two distinct values is listed here as well.
    """
    # domain-level info:
    flu_indicators = ['Diarrhea', 'Fever', 'Coughing', 'SoreThroat', 'NauseaVomitting', 'Fatigue']

    # preliminary cleaning:
    df_transformed = (
        df
        .replace({'Yes': True, 'No': False})
        .assign(
            is_female=lambda df_: df_['Sex'].replace({'F': True, 'M': False, np.nan: False}),
            CReactiveProteinLevels=lambda df_: (df_['CReactiveProteinLevels']
                                                .str.replace(' mg/dL', '', regex=False)
                                                .astype(float)),
            # missing indicators are skipped by sum(), i.e. they count as "not reported"
            flu_symptoms=lambda df_: df_[flu_indicators].sum(axis=1) >= 2,
            # presumably keeps Age strictly positive for the Box-Cox step used later
            # in the notebook -- TODO confirm
            Age=lambda df_: df_['Age'] + 0.1,
        )
        .drop('Sex', axis=1)
    )

    # split the dataset into data types:
    numerical_feats = [col for col in df_transformed.columns
                       if df_transformed[col].dtype not in ['O', bool]]

    # Leave the target out while filtering instead of calling list.remove() afterwards:
    # remove() raises ValueError when 'Diagnosis' does not have exactly two classes.
    binary_feats = [col for col in df_transformed.columns
                    if col != 'Diagnosis' and df_transformed[col].nunique() == 2]

    return df_transformed, numerical_feats, binary_feats
    

In [4]:
# call and assign values:
# tweak_covid_flu returns the cleaned frame plus the lists of numerical and binary column names.
cleaned_covid_flu, numerical_feats, binary_feats = tweak_covid_flu(covid_flu)
cleaned_covid_flu

Unnamed: 0,Diagnosis,InitialPCRDiagnosis,Age,neutrophil,serumLevelsOfWhiteBloodCell,lymphocytes,CReactiveProteinLevels,DurationOfIllness,CTscanResults,RiskFactors,...,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Temperature,Fatigue,is_female,flu_symptoms
0,H1N1,,67.10,,,,,,,,...,,True,True,,False,,38.111111,False,True,True
1,H1N1,,29.10,,,,,,,,...,,,,,,,,,False,False
2,H1N1,,22.10,,,,,,,,...,,,,,,,,,True,False
3,H1N1,,20.10,,,,,,,immuno,...,,True,True,,False,,36.555556,True,True,True
4,H1N1,,21.10,,,,,,,,...,,,,,,,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,COVID19,,1.50,,9.2,,9.4,,,,...,,,,,,,,,False,False
1478,COVID19,,0.93,,14.8,,0.9,,,,...,,,,,,,,,False,False
1479,COVID19,,3.10,,15.0,,0.7,,,,...,,,,,,,,,False,False
1480,COVID19,,4.10,,6.6,,0.2,,,,...,,,,,,,,,False,False


In [5]:
# Columns with exactly two distinct values (the 'Diagnosis' target is excluded).
binary_feats

['InitialPCRDiagnosis',
 'CTscanResults',
 'GroundGlassOpacity',
 'Diarrhea',
 'Fever',
 'Coughing',
 'ShortnessOfBreath',
 'SoreThroat',
 'NauseaVomitting',
 'Fatigue',
 'is_female',
 'flu_symptoms']

In [6]:
# Columns whose dtype is neither object nor bool.
numerical_feats

['Age',
 'neutrophil',
 'serumLevelsOfWhiteBloodCell',
 'lymphocytes',
 'CReactiveProteinLevels',
 'DurationOfIllness',
 'Temperature']

&nbsp;

### Create a custom class to handle the messy `RiskFactors` column with nested values:
We will unnest the values and turn them into dummy columns.

In [7]:
class DummifyRiskFactor(BaseEstimator, TransformerMixin):
    """Expand a column of comma-separated risk factors into one dummy column per factor.

    Each entry is split on commas, and every piece is stripped and lower-cased.
    Entries that are not strings (e.g. NaN for "nothing recorded") yield no factors,
    i.e. an all-zero row. The fitted ``MultiLabelBinarizer`` is kept in
    ``label_binarizer``; its ``classes_`` give the output column names.
    """

    def __init__(self):
        # Created by fit(); stays None until then.
        self.label_binarizer = None

    def parse_risk_factors(self, comma_sep_factors):
        """Split one raw cell into a list of normalised risk-factor names.

        'Asthma, heart disease' -> ['asthma', 'heart disease']; non-strings -> [].
        """
        # Explicit type check instead of a bare `except:` so that unrelated errors
        # (and KeyboardInterrupt) are no longer silently swallowed.
        if not isinstance(comma_sep_factors, str):
            return []
        return [s.strip().lower() for s in comma_sep_factors.split(',')]

    def fit(self, X, y=None):
        """Learn the set of risk factors present in ``X``.

        ``X`` must support ``.apply`` (the notebook passes a pandas Series);
        ``y`` is ignored. Returns ``self``.
        """
        self.label_binarizer = MultiLabelBinarizer()
        self.label_binarizer.fit(X.apply(self.parse_risk_factors))  # create dummy variable for each risk factor
        return self

    def transform(self, X, y=None):
        """Return the 0/1 indicator matrix for ``X`` using the factors learned in fit()."""
        return self.label_binarizer.transform(X.apply(self.parse_risk_factors))

In [8]:
# Smoke-test the transformer on the full RiskFactors column:
drf = DummifyRiskFactor()
risks = drf.fit_transform(cleaned_covid_flu['RiskFactors'])

# label the indicator matrix with the risk factors learned during fit, then display it:
risks_df = pd.DataFrame(
    data=risks,
    columns=drf.label_binarizer.classes_,
)
risks_df

Unnamed: 0,asthma,athero,atopic dermatitis and repetitive respiratory infections,begin tumor (removed),chronic,chronic endrocrine disorder,chronic liver dieseas,chronic liver disorder,chronic neurological disorders,chronic obstructive pulmonary disease,...,lung disease,myxoma of abdominal cavity,obesity,pneumomediastinum and subcutaneous emphysema,pneumonia,pre-eclampsia,prom,renal disease,respiratory disease,rheumatoid arthritis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---

## Build Feature Engineering Pipeline:
Our pipeline will:
1. clean and dummify the `RiskFactors` variable
2. select the best features from the above using `tree_selector`
3. select the binary (categorical) features
4. impute missing data in the categoricals using `False`
5. select numerical features
6. apply `Box-Cox` transformation to the above
7. turn the above into a dataframe
8. perform end-of-tail imputation (`EndTailImputer`) on the missing data
9. bin/discretize the features (numerical)

In [10]:
# Separate the target from the predictors, then hold out a stratified 20% test set:
y = cleaned_covid_flu['Diagnosis']
X = cleaned_covid_flu.drop(columns=['Diagnosis'])

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=0,
)

In [11]:
# define constituent pipelines:

# RiskFactors column -> one dummy column per risk factor -> keep at most 20 of them,
# chosen by decision-tree feature importance.
risk_factor_pipeline = Pipeline(steps=[
    ('select_risk_factor', FunctionTransformer(lambda df: df['RiskFactors'])),
    ('dummify', DummifyRiskFactor()),
    # Seed the tree: equally good splits are broken at random, so an unseeded tree
    # can select different risk-factor columns from one run to the next.
    ('tree_selector', SelectFromModel(estimator=DecisionTreeClassifier(random_state=0), max_features=20)),
])

# Binary columns -> missing values treated as False.
binary_pipeline = Pipeline(steps=[
    ('select_categorical_features', FunctionTransformer(lambda df: df[binary_feats])),
    # the previous step already narrowed the frame to binary_feats
    ('fillna', FunctionTransformer(lambda df: df.fillna(False))),
])

# Numerical columns -> Box-Cox -> end-of-tail imputation -> 10 ordinal k-means bins.
numerical_pipeline = Pipeline(steps=[
    ('select_numerical_features', FunctionTransformer(lambda df: df[numerical_feats])),
    ('box-cox', PowerTransformer(method='box-cox')),
    # PowerTransformer returns a plain array; wrap it back into a DataFrame for the imputer step
    ('turn_into_df', FunctionTransformer(lambda matrix: pd.DataFrame(matrix))),
    ('end_of_tail', EndTailImputer()),
    ('ordinal_bins', KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='kmeans')),
])

In [12]:
# Run the three sub-pipelines on the same input and join their outputs into one feature matrix:
simple_fe = FeatureUnion(
    transformer_list=[
        ('risk_factors', risk_factor_pipeline),
        ('binary_pipeline', binary_pipeline),
        ('numerical_pipeline', numerical_pipeline),
    ]
)

&nbsp;

### Run the Feature Engineering pipeline on a simple grid search:

In [13]:
# obtain best model:
# simple_grid_search is not defined in this notebook; presumably it comes from the
# `helper_functions` star import -- TODO confirm. Judging by the recorded output it
# times the fit and prints a classification report for the test split.
best_model = simple_grid_search(x_train, y_train, x_test, y_test, simple_fe)

Parsing took 0.26 seconds
Training took 6.55 seconds
              precision    recall  f1-score   support

     COVID19       0.86      0.83      0.84        82
        H1N1       0.94      0.95      0.94       215

    accuracy                           0.92       297
   macro avg       0.90      0.89      0.89       297
weighted avg       0.92      0.92      0.92       297

Overall took 6.56 seconds


&nbsp;

### Run the Feature Engineering pipeline using TPOT AutoML:

In [14]:
# TPOT_search is not defined in this notebook; presumably it comes from the
# `helper_functions` star import -- TODO confirm what it returns.
# NOTE: the recorded run below was cut off after about 30 minutes, so re-running this cell is slow.
preds = TPOT_search(x_train, y_train, x_test, y_test, simple_fe)
preds

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.940928270042194

Generation 2 - Current best internal CV score: 0.9417721518987342

Generation 3 - Current best internal CV score: 0.9417721518987342

Generation 4 - Current best internal CV score: 0.9426160337552743

Generation 5 - Current best internal CV score: 0.9426160337552743

Generation 6 - Current best internal CV score: 0.9426160337552743

Generation 7 - Current best internal CV score: 0.9459915611814346

Generation 8 - Current best internal CV score: 0.9459915611814346

Generation 9 - Current best internal CV score: 0.9459915611814346

30.07 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: MLPClassifier(RobustScaler(OneHotEncoder(MinMaxScaler(input_matrix), minimum_fraction=0.25, sparse=False, threshold=10)), alpha=0.0001, learning_rate_init=0.01)
0.9259259259259259

              precision    recall  f1-