In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
from featuretools import selection
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from category_encoders import OrdinalEncoder, OneHotEncoder
from category_encoders import BackwardDifferenceEncoder, HelmertEncoder, BinaryEncoder
from category_encoders import CountEncoder
from category_encoders import LeaveOneOutEncoder, TargetEncoder, JamesSteinEncoder, MEstimateEncoder, WOEEncoder, CatBoostEncoder
from category_encoders.wrapper import NestedCVWrapper

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, mutual_info_classif

from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

import h2o
from h2o.sklearn import H2OAutoMLClassifier
from h2o.sklearn import H2OGradientBoostingClassifier

from tpot import TPOTClassifier
from gama import GamaClassifier

from sklearn.feature_selection import RFECV

from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector
from nni.algorithms.feature_engineering.gbdt_selector import GBDTSelector

from imblearn.over_sampling import RandomOverSampler, SMOTE

import time
pd.set_option('display.max_rows', None)

In [2]:
train_feat = pd.read_csv('training_set_features.csv')
train_lab = pd.read_csv('training_set_labels.csv')
test_feat = pd.read_csv('test_set_features.csv')

In [4]:
train_feat.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [4]:
train_feat['test'] = 0
test_feat['test'] = 1

In [5]:
train_feat = pd.concat([train_feat, test_feat], axis=0)

In [6]:
train_feat[['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance']] = train_feat[['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance']].astype('bool') 

In [7]:
train_feat.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,test
0,0,1.0,0.0,False,False,False,False,False,True,True,...,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0
1,1,3.0,2.0,False,True,False,True,False,True,True,...,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0
2,2,1.0,1.0,False,True,False,False,False,False,False,...,Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0
3,3,1.0,1.0,False,True,False,True,True,False,False,...,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0
4,4,2.0,1.0,False,True,False,True,True,False,True,...,Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0


In [8]:
train_feat.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds         bool
behavioral_avoidance              bool
behavioral_face_mask              bool
behavioral_wash_hands             bool
behavioral_large_gatherings       bool
behavioral_outside_home           bool
behavioral_touch_face             bool
doctor_recc_h1n1                  bool
doctor_recc_seasonal              bool
chronic_med_condition             bool
child_under_6_months              bool
health_worker                     bool
health_insurance                  bool
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [9]:
train_lab.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


## Preparing data

In [10]:
train_feat.shape

(53415, 37)

In [11]:
ft.list_primitives()

Unnamed: 0,name,type,dask_compatible,spark_compatible,description,valid_inputs,return_type
0,num_consecutive_less_mean,aggregation,False,False,Determines the length of the longest subsequen...,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = IntegerNullable)...
1,max_consecutive_negatives,aggregation,False,False,Determines the maximum number of consecutive n...,"<ColumnSchema (Logical Type = Integer)>, <Colu...",<ColumnSchema (Logical Type = Integer) (Semant...
2,count_below_mean,aggregation,False,False,Determines the number of values that are below...,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = IntegerNullable)...
3,mode,aggregation,False,False,Determines the most commonly repeated value.,<ColumnSchema (Semantic Tags = ['category'])>,
4,max_consecutive_zeros,aggregation,False,False,Determines the maximum number of consecutive z...,"<ColumnSchema (Logical Type = Integer)>, <Colu...",<ColumnSchema (Logical Type = Integer) (Semant...
5,all,aggregation,True,False,Calculates if all values are 'True' in a list.,"<ColumnSchema (Logical Type = Boolean)>, <Colu...",<ColumnSchema (Logical Type = Boolean)>
6,max_consecutive_true,aggregation,False,False,Determines the maximum number of consecutive T...,<ColumnSchema (Logical Type = Boolean)>,<ColumnSchema (Logical Type = Integer) (Semant...
7,num_consecutive_greater_mean,aggregation,False,False,Determines the length of the longest subsequen...,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = IntegerNullable)...
8,trend,aggregation,False,False,Calculates the trend of a column over time.,<ColumnSchema (Logical Type = Datetime) (Seman...,<ColumnSchema (Semantic Tags = ['numeric'])>
9,skew,aggregation,False,False,Computes the extent to which a distribution di...,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>


In [10]:
es = ft.EntitySet(id = 'train')
es = es.add_dataframe(
      dataframe_name="data",
      dataframe=train_feat,
      index=None,
)



In [11]:
agg_primitives = ['entropy', 'n_most_common', 'count_greater_than', 'any', 'all', 'num_true', 'median', 'num_consecutive_greater_mean', 'count_inside_nth_std', 'percent_true']
trans_primitives = ['cum_sum', 'cum_max', 'cum_min', 'diff', 'percentile', 'multiply_numeric', 'add_numeric', 'multiply_boolean', 'multiply_numeric_boolean', 'and', 'or']

dfs_feat, dfs_defs = ft.dfs(
    entityset=es,
    target_dataframe_name = 'data',
    trans_primitives=trans_primitives,
    agg_primitives=agg_primitives, 
    max_features=2000,
    chunk_size=4000,
    verbose=True,                            
    max_depth=3,
    n_jobs=-1,
    ignore_columns={'data': ['respondent_id', 'test']}
)

  agg_primitives: ['all', 'any', 'count_greater_than', 'count_inside_nth_std', 'entropy', 'median', 'n_most_common', 'num_consecutive_greater_mean', 'num_true', 'percent_true']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


Built 539 features
EntitySet scattered to 8 workers in 26 seconds                                                                         
Elapsed: 00:25 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [12]:
dfs_feat = pd.concat([train_feat[['respondent_id', 'test']], dfs_feat], axis=1)

In [14]:
dfs_feat.head()

Unnamed: 0,respondent_id,test,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,PERCENTILE(h1n1_concern),PERCENTILE(h1n1_knowledge),PERCENTILE(household_adults),PERCENTILE(household_children),PERCENTILE(opinion_h1n1_risk),PERCENTILE(opinion_h1n1_sick_from_vacc),PERCENTILE(opinion_h1n1_vacc_effective),PERCENTILE(opinion_seas_risk),PERCENTILE(opinion_seas_sick_from_vacc),PERCENTILE(opinion_seas_vacc_effective)
0,0,0,1,0,False,False,False,False,False,True,...,0.275198,0.046533,0.151112,0.35175,0.155251,0.514512,0.194239,0.113679,0.595116,0.08797
1,1,0,3,2,False,True,False,True,False,True,...,0.914413,0.821342,0.151112,0.35175,0.833552,0.805936,0.864801,0.399345,0.839525,0.399345
2,2,0,1,1,False,True,False,False,False,False,...,0.275198,0.367866,0.903704,0.35175,0.155251,0.17033,0.194239,0.113679,0.595116,0.399345
3,3,0,1,1,False,True,False,True,True,False,...,0.275198,0.367866,0.151112,0.35175,0.710031,0.958819,0.194239,0.743712,0.223924,0.809823
4,4,0,2,1,False,True,False,True,True,False,...,0.628602,0.367866,0.57617,0.35175,0.710031,0.514512,0.194239,0.113679,0.839525,0.154892


In [13]:
dfs_feat = selection.remove_low_information_features(dfs_feat)
dfs_feat = selection.remove_highly_null_features(dfs_feat)
dfs_feat = selection.remove_single_value_features(dfs_feat)
dfs_feat = dfs_feat.drop(columns=[col for col in dfs_feat.columns if 'inf' in [str(i) for i in dfs_feat[col].tolist()]])

In [14]:
X = dfs_feat[dfs_feat['test']==0].drop(['respondent_id', 'test'], axis=1)
X_test = dfs_feat[dfs_feat['test']==1].drop(['respondent_id', 'test'], axis=1)
y = train_lab.drop('respondent_id', axis=1)

In [14]:
y.iloc[:,0].value_counts()

0    21033
1     5674
Name: h1n1_vaccine, dtype: int64

In [15]:
y.iloc[:,1].value_counts()

0    14272
1    12435
Name: seasonal_vaccine, dtype: int64

In [15]:
skf = StratifiedKFold(5, random_state=42, shuffle=True)

In [16]:
df_metrics = {'method': [], 'roc_auc: h1n1_vaccine':[], 'roc_auc: seasonal_vaccine':[], 'shape': [], 'time': []}

## Defining functions

In [17]:
def function_class_pipeline(classifier):
    class_pipeline = Pipeline([
    ('classifier', classifier)
    ])
    return class_pipeline

In [18]:
def function_cross_val_score(class_pipeline, X, y, skf):
    scores = cross_val_score(
        class_pipeline, X, y, 
        cv=skf, scoring='roc_auc'
    )
    return scores

In [19]:
def function_add_metrics(method, scores_h1n1, scores_seasonal, time, X):
    df_metrics['method'] += [method]
    df_metrics['roc_auc: h1n1_vaccine'] += [round(scores_h1n1.mean(), 3)]
    df_metrics['roc_auc: seasonal_vaccine'] += [round(scores_seasonal.mean(), 3)]
    df_metrics['shape'] += [X.shape[1]]
    df_metrics['time'] += [round(time, 3)]
    
    return df_metrics

## BaseLines

### BaseLine LGBM without featuretools and oversampling

In [24]:
# Time measurement
start_time = time.time()

# Preparation pipeline
prepare_pipeline_base1 = Pipeline([
    ('label-encoder', JamesSteinEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

X_clean = train_feat[train_feat['test']==0].drop(['respondent_id', 'test'], axis=1)

X_prep_lgbm_wft_h1n1 = prepare_pipeline_base1.fit_transform(X_clean, y.iloc[:,0])
X_prep_lgbm_wft_seasonal = prepare_pipeline_base1.fit_transform(X_clean, y.iloc[:,1])

# Classification pipeline LightGBM
lgbm_pipeline_base_wft = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(lgbm_pipeline_base_wft, X_prep_lgbm_wft_h1n1, y.iloc[:,0], skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_base_wft, X_prep_lgbm_wft_seasonal, y.iloc[:,1], skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('lgbm_pipeline_base_without_featuretools', scores_h1n1, scores_seasonal, end_time, X_prep_lgbm_wft_h1n1)

Cross-validated ROC_AUC for h1n1: 0.828 +/- 0.006
Cross-validated ROC_AUC for seasonal: 0.856 +/- 0.004


### BaseLine LGBM with featuretools

In [28]:
# Time measurement
start_time = time.time()

# Preparation pipeline
prepare_pipeline_base1 = Pipeline([
    ('label-encoder', JamesSteinEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_prep_lgbm_h1n1 = prepare_pipeline_base1.fit_transform(X, y.iloc[:,0])
X_prep_lgbm_seasonal = prepare_pipeline_base1.fit_transform(X, y.iloc[:,1])

# Oversampling
oversample_h1n1 = SMOTE(random_state=5)
oversample_seasonal = SMOTE(random_state=5)

X_prep_lgbm_h1n1, y_h1n1 = oversample_h1n1.fit_resample(X_prep_lgbm_h1n1, y.iloc[:,0])
X_prep_lgbm_seasonal, y_seasonal = oversample_seasonal.fit_resample(X_prep_lgbm_seasonal, y.iloc[:,1])

# Classification pipeline LightGBM
lgbm_pipeline_base = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(lgbm_pipeline_base, X_prep_lgbm_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_base, X_prep_lgbm_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('lgbm_pipeline_base', scores_h1n1, scores_seasonal, end_time, X_prep_lgbm_h1n1)

Cross-validated ROC_AUC for h1n1: 0.954 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.871 +/- 0.007


### BaseLine RandomForest with featuretools

In [29]:
# Time measurement
start_time = time.time()

# Preparation pipeline
prepare_pipeline_base2 = Pipeline([
    ('label-encoder', CountEncoder()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', RobustScaler())
])

X_prep_rf_h1n1 = prepare_pipeline_base2.fit_transform(X, y.iloc[:,0])
X_prep_rf_seasonal = prepare_pipeline_base2.fit_transform(X, y.iloc[:,1])

# Oversampling
oversample_h1n1 = SMOTE(random_state=5)
oversample_seasonal = SMOTE(random_state=5)

X_prep_rf_h1n1, y_h1n1 = oversample_h1n1.fit_resample(X_prep_rf_h1n1, y.iloc[:,0])
X_prep_rf_seasonal, y_seasonal = oversample_seasonal.fit_resample(X_prep_rf_seasonal, y.iloc[:,1])

# Classification pipeline LightGBM
rf_pipeline_base = function_class_pipeline(RandomForestClassifier(n_estimators=300))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_base, X_prep_rf_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(rf_pipeline_base, X_prep_rf_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('rf_pipeline_base', scores_h1n1, scores_seasonal, end_time, X_prep_rf_h1n1)

Cross-validated ROC_AUC for h1n1: 0.957 +/- 0.001
Cross-validated ROC_AUC for seasonal: 0.852 +/- 0.005


## Filter methods

In [20]:
# Base preparing pipeline
prepare_pipeline = Pipeline([
    ('label-encoder', JamesSteinEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_prep_h1n1 = prepare_pipeline.fit_transform(X, y.iloc[:,0])
X_prep_seasonal = prepare_pipeline.fit_transform(X, y.iloc[:,1])

X_test_prep_h1n1 = prepare_pipeline.transform(X_test)
X_test_prep_seasonal = prepare_pipeline.transform(X_test)

In [21]:
# Oversampling
oversample_h1n1 = SMOTE(random_state=5)
oversample_seasonal = SMOTE(random_state=5)

X_prep_h1n1, y_h1n1 = oversample_h1n1.fit_resample(X_prep_h1n1, y.iloc[:,0])
X_prep_seasonal, y_seasonal = oversample_seasonal.fit_resample(X_prep_seasonal, y.iloc[:,1])

### Constant

In [33]:
# Time measurement
start_time = time.time()

selector_constant_h1n1 = VarianceThreshold(0.0)
selector_constant_seasonal = VarianceThreshold(0.0)

X_selected_constant_h1n1 = selector_constant_h1n1.fit_transform(X_prep_h1n1)
X_selected_constant_seasonal = selector_constant_seasonal.fit_transform(X_prep_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_constant_h1n1.shape))

# Classification pipelines
rf_pipeline_constant = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_constant = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_constant, X_selected_constant_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_constant, X_selected_constant_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_constant', scores_h1n1, scores_seasonal, end_time, X_selected_constant_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 524)
Cross-validated ROC_AUC for h1n1: 0.957 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


### Quasi-Constant

In [35]:
# Time measurement
start_time = time.time()

selector_quasi_constant_h1n1 = VarianceThreshold(.85 * (1 - .85))
selector_quasi_constant_seasonal = VarianceThreshold(.85 * (1 - .85))

X_selected_quasi_constant_h1n1 = selector_quasi_constant_h1n1.fit_transform(X_prep_h1n1)
X_selected_quasi_constant_seasonal = selector_quasi_constant_seasonal.fit_transform(X_prep_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_quasi_constant_h1n1.shape))

# Classification pipelines
rf_pipeline_quasi_constant = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_quasi_constant = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_quasi_constant, X_selected_quasi_constant_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_quasi_constant, X_selected_quasi_constant_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_quasi_constant', scores_h1n1, scores_seasonal, end_time, X_selected_quasi_constant_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 524)
Cross-validated ROC_AUC for h1n1: 0.957 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


### Mutual info

In [22]:
# Time measurement
start_time = time.time()

selector_mutual_h1n1 = SelectPercentile(mutual_info_classif, percentile=85)
selector_mutual_seasonal = SelectPercentile(mutual_info_classif, percentile=85)

X_selected_mutual_h1n1 = selector_mutual_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_mutual_seasonal = selector_mutual_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_mutual_h1n1.shape))

# Classification pipelines
rf_pipeline_mutual = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_mutual = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_mutual, X_selected_mutual_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_mutual, X_selected_mutual_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_mutual', scores_h1n1, scores_seasonal, end_time, X_selected_mutual_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 445)
Cross-validated ROC_AUC for h1n1: 0.957 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


## Wrapper methods

In [31]:
# Time measurement
start_time = time.time()

selector_wrapper_h1n1 = SequentialFeatureSelector(
                    LGBMClassifier(max_depth=5), 
                    k_features='best', 
                    forward=False, 
                    floating=False,
                    scoring='roc_auc',
                    cv=None,
                    verbose=1,
                    n_jobs=-1)

selector_wrapper_seasonal = SequentialFeatureSelector(
                    LGBMClassifier(max_depth=5), 
                    k_features='best', 
                    forward=False, 
                    floating=False,
                    scoring='roc_auc',
                    cv=None,
                    verbose=1,
                    n_jobs=-1)

X_selected_wrapper_h1n1 = selector_wrapper_h1n1.fit_transform(X_selected_mutual_h1n1[:5000], y_h1n1[:5000])
X_selected_wrapper_seasonal = selector_wrapper_seasonal.fit_transform(X_selected_mutual_h1n1[:5000], y_seasonal[:5000])

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_wrapper_h1n1.shape))

# Classification pipelines
rf_pipeline_wrapper = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_wrapper = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_wrapper, X_selected_wrapper_h1n1, y_h1n1[:5000], skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_wrapper, X_selected_wrapper_seasonal, y_seasonal[:5000], skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_wrapper', scores_h1n1, scores_seasonal, end_time, X_selected_wrapper_h1n1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done 445 out of 445 | elapsed:  2.1min finished
Features: 444/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 444 out of 444 | elapsed:  2.0min finished
Features: 443/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 443 out of 443 | elapsed:  2.0min finished
Features: 442/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Par

[Parallel(n_jobs=-1)]: Done 417 out of 417 | elapsed:  1.8min finished
Features: 416/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.3s
[Parallel(n_jobs=-1)]: Done 416 out of 416 | elapsed:  1.8min finished
Features: 415/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done 415 out of 415 | elapsed:  1.7min finished
Features: 414/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 414 out of 414 | elapsed:  1.6min finished
Features: 413/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 

[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 388 out of 388 | elapsed:  2.1min finished
Features: 387/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 387 out of 387 | elapsed:  2.1min finished
Features: 386/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-1)]: Done 386 out of 386 | elapsed:  2.1min finished
Features: 385/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 385 out of 385 | elapsed:  2.0min finished
Features:

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done 359 out of 359 | elapsed:  1.8min finished
Features: 358/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 358 out of 358 | elapsed:  1.7min finished
Features: 357/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 357 out of 357 | elapsed:  1.7min finished
Features: 356/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-

Features: 330/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 330 out of 330 | elapsed:  1.6min finished
Features: 329/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 329 out of 329 | elapsed:  1.5min finished
Features: 328/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 328 out of 328 | elapsed:  1.6min finished
Features: 327/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapse

[Parallel(n_jobs=-1)]: Done 302 out of 302 | elapsed:   58.9s finished
Features: 301/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 301 out of 301 | elapsed:   55.8s finished
Features: 300/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   55.4s finished
Features: 299/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 299 out of 299 | elapsed:   53.7s finished
Features: 298/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 

[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 273 out of 273 | elapsed:   46.4s finished
Features: 272/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 272 out of 272 | elapsed:   48.0s finished
Features: 271/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 271 out of 271 | elapsed:   46.0s finished
Features: 270/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  1.1min finished
Features:

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 244 out of 244 | elapsed:   49.4s finished
Features: 243/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:   51.7s finished
Features: 242/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 242 out of 242 | elapsed:   52.1s finished
Features: 241/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-

Features: 215/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 215 out of 215 | elapsed:   42.5s finished
Features: 214/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done 214 out of 214 | elapsed:   43.3s finished
Features: 213/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 213 out of 213 | elapsed:   43.6s finished
Features: 212/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapse

Features: 183/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 183 out of 183 | elapsed:   33.3s finished
Features: 182/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 182 out of 182 | elapsed:   32.6s finished
Features: 181/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 181 out of 181 | elapsed:   31.9s finished
Features: 180/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   31.7s finished
Features: 179/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:   19.8s finished
Features: 146/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 146 out of 146 | elapsed:   23.2s finished
Features: 145/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 145 out of 145 | elapsed:   24.5s finished
Features: 144/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   28.9s finished
Features: 143/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 143 out of 143 | elapsed:   30.1s finished
Features: 142/1[Parallel(n_jobs=-

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:   15.2s finished
Features: 109/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 109 out of 109 | elapsed:   15.0s finished
Features: 108/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   15.4s finished
Features: 107/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:   14.8s finished
Features: 106/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 106 out of 106

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  73 out of  73 | elapsed:    7.9s finished
Features: 72/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    7.9s finished
Features: 71/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  71 out of  71 | elapsed:    7.7s finished
Features: 70/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    7.7s finished
Features: 69/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  69 out of  69 | e

Features: 31/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    2.7s finished
Features: 30/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.6s finished
Features: 29/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:    2.6s finished
Features: 28/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:    2.5s finished
Features: 27/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.4s finished
Features: 26/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  26 | elapsed:    2.1s finished
Features: 25/1[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 436 out of 436 | elapsed:  1.8min finished
Features: 435/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 435 out of 435 | elapsed:  1.8min finished
Features: 434/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 434 out of 434 | elapsed:  1.8min finished
Features: 433/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-

Features: 407/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 407 out of 407 | elapsed:  1.6min finished
Features: 406/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 406 out of 406 | elapsed:  1.6min finished
Features: 405/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.7min finished
Features: 404/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapse

[Parallel(n_jobs=-1)]: Done 379 out of 379 | elapsed:  1.4min finished
Features: 378/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 378 out of 378 | elapsed:  1.4min finished
Features: 377/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 377 out of 377 | elapsed:  1.4min finished
Features: 376/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done 376 out of 376 | elapsed:  1.4min finished
Features: 375/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 

[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:  1.4min finished
Features: 349/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 349 out of 349 | elapsed:  1.4min finished
Features: 348/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 348 out of 348 | elapsed:  1.4min finished
Features: 347/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 347 out of 347 | elapsed:  1.6min finished
Features:

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 321 out of 321 | elapsed:  1.3min finished
Features: 320/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  1.3min finished
Features: 319/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 319 out of 319 | elapsed:  1.2min finished
Features: 318/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-

Features: 292/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 292 out of 292 | elapsed:  1.1min finished
Features: 291/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 291 out of 291 | elapsed:  1.0min finished
Features: 290/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 290 out of 290 | elapsed:   59.3s finished
Features: 289/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapse

[Parallel(n_jobs=-1)]: Done 264 out of 264 | elapsed:   49.0s finished
Features: 263/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 263 out of 263 | elapsed:   47.7s finished
Features: 262/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 262 out of 262 | elapsed:   46.9s finished
Features: 261/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 261 out of 261 | elapsed:   47.7s finished
Features: 260/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 

[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 235 out of 235 | elapsed:   41.6s finished
Features: 234/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 234 out of 234 | elapsed:   37.2s finished
Features: 233/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 233 out of 233 | elapsed:   37.4s finished
Features: 232/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 232 out of 232 | elapsed:   34.9s finished
Features:

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 206 out of 206 | elapsed:   40.9s finished
Features: 205/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   41.5s finished
Features: 204/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 204 out of 204 | elapsed:   42.7s finished
Features: 203/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-

Features: 171/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 171 out of 171 | elapsed:   30.2s finished
Features: 170/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 170 out of 170 | elapsed:   29.6s finished
Features: 169/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 169 out of 169 | elapsed:   29.5s finished
Features: 168/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 168 out of 168 | elapsed:   29.7s finished
Features: 167/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   22.3s finished
Features: 134/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 134 out of 134 | elapsed:   22.5s finished
Features: 133/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 133 out of 133 | elapsed:   21.6s finished
Features: 132/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 132 out of 132 | elapsed:   21.1s finished
Features: 131/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 131 out of 131 | elapsed:   20.9s finished
Features: 130/1[Parallel(n_jobs=-

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  98 out of  98 | elapsed:   14.2s finished
Features: 97/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  97 out of  97 | elapsed:   14.4s finished
Features: 96/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   13.9s finished
Features: 95/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed:   13.7s finished
Features: 94/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  94 out of  94 | e

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  61 out of  61 | elapsed:    8.8s finished
Features: 60/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.8s finished
Features: 59/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  59 out of  59 | elapsed:    9.2s finished
Features: 58/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  58 out of  58 | elapsed:    8.3s finished
Features: 57/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  57 out of  57 | e

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    1.8s finished
Features: 14/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:    1.7s finished
Features: 13/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    1.6s finished
Features: 12/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    1.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.4s finished
Features: 11/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    1.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    1.4s finished
Features: 10/1[Parallel(n_jobs=-1)]: Using

Base X shape: (42066, 524)
Selected X shape: (5000, 165)
Cross-validated ROC_AUC for h1n1: 0.803 +/- 0.009
Cross-validated ROC_AUC for seasonal: 0.834 +/- 0.010


## Emebeded methods

### L1-based (LinearSVC)

In [37]:
# Time measurement
start_time = time.time()

selector_emb_svc_h1n1 = SelectFromModel(LinearSVC(penalty='l1', dual=False))
selector_emb_svc_seasonal = SelectFromModel(LinearSVC(penalty='l1', dual=False))

X_selected_emb_svc_h1n1 = selector_emb_svc_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_emb_svc_seasonal = selector_emb_svc_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_emb_svc_h1n1.shape))

# Classification pipelines
rf_pipeline_emb_svc = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_emb_svc = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_emb_svc, X_selected_emb_svc_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_emb_svc, X_selected_emb_svc_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_emb_svc', scores_h1n1, scores_seasonal, end_time, X_selected_emb_svc_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 456)
Cross-validated ROC_AUC for h1n1: 0.957 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


### L1-based (LogisticRegression)

In [38]:
# Time measurement
start_time = time.time()

selector_emb_log_h1n1 = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear'))
selector_emb_log_seasonal = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear'))

X_selected_emb_log_h1n1 = selector_emb_log_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_emb_log_seasonal = selector_emb_log_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_emb_log_h1n1.shape))

# Classification pipelines
rf_pipeline_emb_log = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_emb_log = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_emb_log, X_selected_emb_log_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_emb_log, X_selected_emb_log_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_emb_log', scores_h1n1, scores_seasonal, end_time, X_selected_emb_log_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 401)
Cross-validated ROC_AUC for h1n1: 0.956 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


### ExtraTreesClassifier

In [39]:
# Time measurement
start_time = time.time()

selector_emb_trees_h1n1 = SelectFromModel(ExtraTreesClassifier(n_estimators=200))
selector_emb_trees_seasonal = SelectFromModel(ExtraTreesClassifier(n_estimators=200))

X_selected_emb_trees_h1n1 = selector_emb_trees_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_emb_trees_seasonal = selector_emb_trees_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_emb_trees_h1n1.shape))

# Classification pipelines
rf_pipeline_emb_trees = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_emb_trees = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_emb_trees, X_selected_emb_trees_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_emb_trees, X_selected_emb_trees_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_emb_trees', scores_h1n1, scores_seasonal, end_time, X_selected_emb_trees_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 182)
Cross-validated ROC_AUC for h1n1: 0.953 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.869 +/- 0.007


## RFECV

In [29]:
# Time measurement
start_time = time.time()

selector_rfecv_h1n1 = RFECV(estimator=LinearSVC(penalty='l1', dual=False), step=2, cv=None, scoring='roc_auc', verbose=1,  min_features_to_select=350)
selector_rfecv_seasonal = RFECV(estimator=LinearSVC(penalty='l1', dual=False), step=2, cv=None, scoring='roc_auc', verbose=1,  min_features_to_select=350)

X_selected_rfecv_h1n1 = selector_rfecv_h1n1.fit_transform(X_selected_mutual_h1n1[:5000], y_h1n1[:5000])
X_selected_rfecv_seasonal = selector_rfecv_seasonal.fit_transform(X_selected_mutual_seasonal[:5000], y_seasonal[:5000])

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_rfecv_h1n1.shape))

# Classification pipelines
rf_pipeline_rfecv = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_rfecv = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_rfecv, X_selected_rfecv_h1n1, y_h1n1[:5000], skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_rfecv, X_selected_rfecv_seasonal, y_seasonal[:5000], skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_rfecv', scores_h1n1, scores_seasonal, end_time, X_selected_rfecv_h1n1)

Fitting estimator with 445 features.
Fitting estimator with 443 features.
Fitting estimator with 441 features.
Fitting estimator with 439 features.
Fitting estimator with 437 features.
Fitting estimator with 435 features.
Fitting estimator with 433 features.
Fitting estimator with 431 features.
Fitting estimator with 429 features.
Fitting estimator with 427 features.
Fitting estimator with 425 features.
Fitting estimator with 423 features.
Fitting estimator with 421 features.
Fitting estimator with 419 features.
Fitting estimator with 417 features.
Fitting estimator with 415 features.
Fitting estimator with 413 features.
Fitting estimator with 411 features.
Fitting estimator with 409 features.
Fitting estimator with 407 features.
Fitting estimator with 405 features.
Fitting estimator with 403 features.
Fitting estimator with 401 features.
Fitting estimator with 399 features.
Fitting estimator with 397 features.
Fitting estimator with 395 features.
Fitting estimator with 393 features.
F

Fitting estimator with 385 features.
Fitting estimator with 383 features.
Fitting estimator with 381 features.
Fitting estimator with 379 features.
Fitting estimator with 377 features.
Fitting estimator with 375 features.
Fitting estimator with 373 features.
Fitting estimator with 371 features.
Fitting estimator with 369 features.
Fitting estimator with 367 features.
Fitting estimator with 365 features.
Fitting estimator with 363 features.
Fitting estimator with 361 features.
Fitting estimator with 359 features.
Fitting estimator with 357 features.
Fitting estimator with 355 features.
Fitting estimator with 353 features.
Fitting estimator with 351 features.
Fitting estimator with 445 features.
Fitting estimator with 443 features.
Fitting estimator with 441 features.
Fitting estimator with 439 features.
Fitting estimator with 437 features.
Fitting estimator with 435 features.
Fitting estimator with 433 features.
Fitting estimator with 431 features.
Fitting estimator with 429 features.
F

Fitting estimator with 417 features.
Fitting estimator with 415 features.
Fitting estimator with 413 features.
Fitting estimator with 411 features.
Fitting estimator with 409 features.
Fitting estimator with 407 features.
Fitting estimator with 405 features.
Fitting estimator with 403 features.
Fitting estimator with 401 features.
Fitting estimator with 399 features.
Fitting estimator with 397 features.
Fitting estimator with 395 features.
Fitting estimator with 393 features.
Fitting estimator with 391 features.
Fitting estimator with 389 features.
Fitting estimator with 387 features.
Fitting estimator with 385 features.
Fitting estimator with 383 features.
Fitting estimator with 381 features.
Fitting estimator with 379 features.
Fitting estimator with 377 features.
Fitting estimator with 375 features.
Fitting estimator with 373 features.
Fitting estimator with 371 features.
Fitting estimator with 369 features.
Fitting estimator with 367 features.
Fitting estimator with 365 features.
F

## Permutation Importance

In [40]:
from eli5.sklearn import PermutationImportance
roc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Time measurement
start_time = time.time()

selector_perm_h1n1 = SelectFromModel(PermutationImportance(LogisticRegression(penalty="l1", solver='liblinear'), scoring=roc_scorer, cv=None), threshold=0.00001)
selector_perm_seasonal = SelectFromModel(PermutationImportance(LogisticRegression(penalty="l1", solver='liblinear'), scoring=roc_scorer, cv=None), threshold=0.00001)

X_selected_perm_h1n1 = selector_perm_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_perm_seasonal = selector_perm_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_perm_h1n1.shape))

# Classification pipelines
rf_pipeline_perm = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_perm = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_perm, X_selected_perm_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_perm, X_selected_perm_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_perm', scores_h1n1, scores_seasonal, end_time, X_selected_perm_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 365)
Cross-validated ROC_AUC for h1n1: 0.956 +/- 0.002
Cross-validated ROC_AUC for seasonal: 0.872 +/- 0.007


## FeatureGradientSelector

In [41]:
# Time measurement
start_time = time.time()

selector_fgs_h1n1 = FeatureGradientSelector(n_features=400)
selector_fgs_seasonal = FeatureGradientSelector(n_features=400)

X_selected_fgs_h1n1 = selector_fgs_h1n1.fit_transform(X_prep_h1n1, y_h1n1)
X_selected_fgs_seasonal = selector_fgs_seasonal.fit_transform(X_prep_seasonal, y_seasonal)

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_fgs_h1n1.shape))

# Classification pipeline LightGBM
rf_pipeline_fgs = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_fgs = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_fgs, X_selected_fgs_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_fgs, X_selected_fgs_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_fgs', scores_h1n1, scores_seasonal, end_time, X_selected_fgs_h1n1)

Base X shape: (42066, 524)
Selected X shape: (42066, 400)
Cross-validated ROC_AUC for h1n1: 0.954 +/- 0.001
Cross-validated ROC_AUC for seasonal: 0.871 +/- 0.006


## GBDTSelector

In [42]:
# Time measurement
start_time = time.time()

selector_gbdt_h1n1 = GBDTSelector()
selector_gbdt_seasonal = GBDTSelector()

selector_gbdt_h1n1.fit(X_prep_h1n1, y_h1n1, 
                          lgb_params={'boosting_type':'gbdt','metric':'AUC','verbosity':0},
                          eval_ratio=0.25,
                          early_stopping_rounds=5,
                          importance_type='gain',
                          num_boost_round=50)

selector_gbdt_seasonal.fit(X_prep_seasonal, y_seasonal, 
                          lgb_params={'boosting_type':'gbdt','metric':'AUC','verbosity':0},
                          eval_ratio=0.25,
                          early_stopping_rounds=5,
                          importance_type='gain',
                          num_boost_round=50)

X_selected_gbdt_h1n1 = X_prep_h1n1[:, selector_gbdt_h1n1.get_selected_features(500)]
X_selected_gbdt_seasonal = X_prep_seasonal[:, selector_gbdt_seasonal.get_selected_features(500)]

print('Base X shape: {}\nSelected X shape: {}'.format(X_prep_h1n1.shape, X_selected_gbdt_h1n1.shape))

# Classification pipeline LightGBM
rf_pipeline_gbdt = function_class_pipeline(RandomForestClassifier(n_estimators=300))
lgbm_pipeline_gbdt = function_class_pipeline(LGBMClassifier(max_depth=5))

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(rf_pipeline_gbdt, X_selected_gbdt_h1n1, y_h1n1, skf)
scores_seasonal = function_cross_val_score(lgbm_pipeline_gbdt, X_selected_gbdt_seasonal, y_seasonal, skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))

# Add metrics to dataframe
df_metrics = function_add_metrics('pipeline_gbdt', scores_h1n1, scores_seasonal, end_time, X_selected_gbdt_h1n1)



You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.892702
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.906475
[3]	valid_0's auc: 0.910557
[4]	valid_0's auc: 0.916212
[5]	valid_0's auc: 0.920163
[6]	valid_0's auc: 0.922393
[7]	valid_0's auc: 0.925565
[8]	valid_0's auc: 0.926776
[9]	valid_0's auc: 0.93027
[10]	valid_0's auc: 0.932598
[11]	valid_0's auc: 0.934268
[12]	valid_0's auc: 0.936215
[13]	valid_0's auc: 0.937345
[14]	valid_0's auc: 0.938701
[15]	valid_0's auc: 0.940148
[16]	valid_0's auc: 0.940935
[17]	valid_0's auc: 0.941742
[18]	valid_0's auc: 0.942455
[19]	valid_0's auc: 0.943679
[20]	valid_0's auc: 0.944197
[21]	valid_0's auc: 0.944995
[22]	valid_0's auc: 0.945631
[23]	valid_0's auc: 0.946089
[24]	valid_0's auc: 0.946613
[25]	valid_0's auc: 0.946931
[26]	valid_0's auc: 0.947341
[27]	valid_0's auc: 0.947649
[28]	valid_0's auc: 0.94804
[29]	valid_0's auc: 0.948491
[30]	valid_0's auc: 0.948897
[31]	valid_0's auc: 0.9

## Results

In [46]:
# examples of important features
columns_features = selector_mutual_h1n1.get_support(indices=True)
df_important_features = X.iloc[:, columns_features]

[i for i in features_df_new.columns.tolist()]

['h1n1_concern',
 'h1n1_knowledge',
 'behavioral_avoidance',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face',
 'doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'health_worker',
 'opinion_h1n1_vacc_effective',
 'opinion_h1n1_risk',
 'opinion_h1n1_sick_from_vacc',
 'opinion_seas_vacc_effective',
 'opinion_seas_risk',
 'opinion_seas_sick_from_vacc',
 'age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'hhs_geo_region',
 'census_msa',
 'household_adults',
 'household_children',
 'employment_industry',
 'employment_occupation',
 'h1n1_concern + h1n1_knowledge',
 'h1n1_concern + household_adults',
 'h1n1_concern + household_children',
 'h1n1_concern + opinion_h1n1_risk',
 'h1n1_concern + opinion_h1n1_sick_from_vacc',
 'h1n1_concern + opinion_h1n1_vacc_effective',
 'h1n1_concern + opinion_seas_risk',
 'h1n1_concern + opinion_seas_sick_from_

In [47]:
df_metrics = pd.DataFrame(df_metrics)

In [50]:
# best pipelines
df_metrics.sort_values(['roc_auc: h1n1_vaccine', 'roc_auc: seasonal_vaccine', 'shape'], ascending=[False, False, True]).reset_index().drop('index', axis=1)

Unnamed: 0,method,roc_auc: h1n1_vaccine,roc_auc: seasonal_vaccine,shape,time
0,pipeline_mutual,0.957,0.872,445,546.257
1,pipeline_emb_svc,0.957,0.872,456,722.578
2,pipeline_gbdt,0.957,0.872,500,480.837
3,pipeline_constant,0.957,0.872,524,571.131
4,pipeline_quasi_constant,0.957,0.872,524,610.448
5,rf_pipeline_base,0.957,0.852,524,792.537
6,pipeline_perm,0.956,0.872,365,4325.102
7,pipeline_emb_log,0.956,0.872,401,821.271
8,lgbm_pipeline_base,0.954,0.872,524,80.383
9,pipeline_fgs,0.954,0.871,400,428.308


## AutoML

### TPOT

In [27]:
# Time measurement
start_time = time.time()

X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(
    X_selected_mutual_h1n1,
    y_h1n1,
    test_size=0.3,
    random_state=20,
    stratify=y_h1n1
)

X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(
    X_selected_mutual_seasonal,
    y_seasonal,
    test_size=0.3,
    random_state=20,
    stratify=y_seasonal
)

tpot_pipeline_optimizer_h1n1 = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=5,
    random_state=42,
    verbosity=2,
    max_eval_time_mins=1,
    n_jobs=-1,
)

tpot_pipeline_optimizer_seasonal = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=5,
    random_state=42,
    verbosity=2,
    max_eval_time_mins=1,
    n_jobs=-1,
)

# Cross-validated roc auc
tpot_pipeline_optimizer_h1n1.fit(X_train_h1n1, y_train_h1n1)
tpot_pipeline_optimizer_seasonal.fit(X_train_seasonal, y_train_seasonal)

end_time = time.time() - start_time

print('ROC_AUC for h1n1: %0.3f' % (roc_auc_score(y_test_h1n1, tpot_pipeline_optimizer_h1n1.predict(X_test_h1n1))))
print('ROC_AUC for seasonal: %0.3f' % (roc_auc_score(y_test_seasonal, tpot_pipeline_optimizer_seasonal.predict(X_test_seasonal))))

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.838517993173656

Generation 2 - Current best internal CV score: 0.838517993173656

Generation 3 - Current best internal CV score: 0.838517993173656

Generation 4 - Current best internal CV score: 0.8407253257130141

Generation 5 - Current best internal CV score: 0.8407253257130141

Best pipeline: DecisionTreeClassifier(StandardScaler(input_matrix), criterion=gini, max_depth=9, min_samples_leaf=9, min_samples_split=18)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.775975975975976

Generation 2 - Current best internal CV score: 0.775975975975976

Generation 3 - Current best internal CV score: 0.775975975975976

Generation 4 - Current best internal CV score: 0.775975975975976

Generation 5 - Current best internal CV score: 0.775975975975976

Best pipeline: LinearSVC(SelectFwe(input_matrix, alpha=0.036000000000000004), C=0.0001, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
ROC_AUC for h1n1: 0.843
ROC_AUC for seasonal: 0.769


### GAMA

In [34]:
# Time measurement
start_time = time.time()

gaml_pipeline_optimizer_h1n1 =  GamaClassifier(scoring='roc_auc', max_total_time=360, store="nothing", n_jobs=-1)
gaml_pipeline_optimizer_seasonal =  GamaClassifier(scoring='roc_auc', max_total_time=360, store="nothing", n_jobs=-1)

# Cross-validated roc auc
gaml_pipeline_optimizer_h1n1.fit(X_train_h1n1, y_train_h1n1)
gaml_pipeline_optimizer_seasonal.fit(X_train_seasonal, y_train_seasonal)

end_time = time.time() - start_time

print('ROC_AUC for h1n1: %0.3f' % (roc_auc_score(y_test_h1n1, gaml_pipeline_optimizer_h1n1.predict(X_test_h1n1))))
print('ROC_AUC for seasonal: %0.3f' % (roc_auc_score(y_test_seasonal, gaml_pipeline_optimizer_seasonal.predict(X_test_seasonal))))

ROC_AUC for h1n1: 0.776
ROC_AUC for seasonal: 0.758


### AutoSklearn (not working)

In [None]:
'''
# Time measurement
start_time = time.time()

autoskl_pipeline_optimizer = AutoSklearnClassifier()

# Cross-validated roc auc
scores_h1n1 = function_cross_val_score(autoskl_pipeline_optimizer, X_prep_h1n1, y.iloc[:,0], skf)
scores_seasonal = function_cross_val_score(autoskl_pipeline_optimizer, X_prep_seasonal, y.iloc[:,1], skf)

end_time = time.time() - start_time

print('Cross-validated ROC_AUC for h1n1: %0.3f +/- %0.3f' % (scores_h1n1.mean(), scores_h1n1.std()))
print('Cross-validated ROC_AUC for seasonal: %0.3f +/- %0.3f' % (scores_seasonal.mean(), scores_seasonal.std()))
'''

## Prediction

### permutation_importance pipeline: ROC_AUC in the competition - 0.7972

In [57]:
# Save results from best pipeline
X_test_selected_perm_h1n1 = selector_perm_h1n1.transform(X_test_prep_h1n1)
X_test_selected_perm_seasonal = selector_perm_seasonal.transform(X_test_prep_seasonal)

df_pred = pd.DataFrame()

df_pred['respondent_id'] = test_feat['respondent_id']

pipeline_perm.fit(X_selected_perm_h1n1, y_h1n1)
df_pred['h1n1_vaccine'] = lgbm_pipeline_perm.predict_proba(X_test_selected_perm_h1n1)[:, 1]

pipeline_perm.fit(X_selected_perm_seasonal, y_seasonal)
df_pred['seasonal_vaccine'] = lgbm_pipeline_perm.predict_proba(X_test_selected_perm_seasonal)[:, 1]

df_pred.to_csv('predictions_v1.csv', sep=',', index=False)

# Test ROC_AUC in the competition - 0.7972

### mutual_info pipeline: ROC_AUC in the competition - 0.8263

In [37]:
# Save results from best pipeline
X_test_selected_mutual_h1n1 = selector_mutual_h1n1.transform(X_test_prep_h1n1)
X_test_selected_mutual_seasonal = selector_mutual_seasonal.transform(X_test_prep_seasonal)

df_pred = pd.DataFrame()

df_pred['respondent_id'] = test_feat['respondent_id']

rf_pipeline_mutual.fit(X_selected_mutual_h1n1, y_h1n1)
df_pred['h1n1_vaccine'] = rf_pipeline_mutual.predict_proba(X_test_selected_mutual_h1n1)[:, 1]

lgbm_pipeline_mutual.fit(X_selected_mutual_seasonal, y_seasonal)
df_pred['seasonal_vaccine'] = lgbm_pipeline_mutual.predict_proba(X_test_selected_mutual_seasonal)[:, 1]

df_pred.to_csv('predictions_v2.csv', sep=',', index=False)