In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.linear_model import LogisticRegressionCV

In [2]:
# Read the feature table and the companion table that lists, per variable,
# its name (`variable_name`) and measurement level (`data_type`).
# NOTE(review): the file name suggests missing values are stored as NaN - confirm.
df = pd.read_csv('nesda_anxiety_with_missing_NaN.csv')
df_dtype = pd.read_csv('nesda_anxiety_data_types_with_missing_removed.csv')

# Separate the target from the features: `pop` hands back the column and
# removes it from `df` in the same step, leaving only predictors behind.
y = df.pop('persistance_anxiety').values

### Let's drop all the summary scores and only use the individual scores (for now)

In [3]:
# Boolean mask over the columns of `df`: True where the column name contains
# the literal substring 'summary' (plain substring match, not a regex).
id_summary = df.columns.str.contains('summary', regex=False)

n_summary = id_summary.sum()
n_columns = df.shape[1]

print(df.columns[id_summary])
print('n_summary: {}'.format(n_summary))
print('n_remaining: {}'.format(n_columns - n_summary))

Index(['IA205_summary', 'IA226_domainpa_summary', 'IA226_domainna_summary',
       'IA226_domainsa_summary', 'IA229_summary', 'IA232_summary',
       'IA233_summary', 'IA235_total_summary', 'IA235_domainatyp_summary',
       'IA235_domainmel_summary', 'IA236_total_summary',
       'IA236_domainsom_summary', 'IA236_domainsub_summary',
       'IA238_total_summary', 'IA238_domainbi_summary',
       'IA238_domainso_summary', 'IA238_domainag_summary',
       'IA252_lifeevents_summary', 'IA252_childhoodtrauma_summary',
       'IA254_summary', 'IA255_summary', 'IA256_items_summary',
       'IA257_current_summary', 'IA257_lifetime_summary',
       'IA259_current_summary', 'IA306_domain01_summary',
       'IA306_domain02_summary', 'IA306_domain03_summary',
       'IA306_domain04_summary', 'IA306_domain05h_summary',
       'IA306_domain05w_summary', 'IA306_domain06_summary',
       'IA306_total_summary', 'IA354_domainfirstline_summary',
       'IA354_domainsecondline_summary', 'IA355_domainfirst

In [4]:
# Names of the summary-score columns selected by the mask; kept in a variable
# because the same names are used again to filter `df_dtype`.
col_summary = df.columns[id_summary].values

print('Dimensions WITH summary scores')
print(df.shape)

# Remove the summary-score columns from `df` itself (no copy is kept).
df.drop(columns=col_summary, inplace=True)

# The label spelling below is left exactly as it was so the printed text
# stays the same.
print('Dimensons WITHOUT summary scores')
print(df.shape)

Dimensions WITH summary scores
(994, 708)
Dimensons WITHOUT summary scores
(994, 637)


In [5]:
# Drop the metadata rows that describe the removed summary columns, then
# renumber the remaining rows from 0 so positions are contiguous again.
is_summary_row = df_dtype.variable_name.isin(col_summary)
df_dtype = df_dtype[~is_summary_row].reset_index(drop=True)

In [6]:
# Sanity check: every column of `df` must have its metadata row in `df_dtype`,
# in the same order. Plain lists are compared (rather than an elementwise
# pandas comparison) so that a difference in length also ends up as this
# assertion with its message, instead of an error raised by the comparison.
assert list(df.columns) == list(df_dtype.variable_name), 'Order of column_names is broken'

In [7]:
# --- Experiment settings ------------------------------------------------------
SEED = 42        # seeds both the forest and the CV splitter so reruns give the same scores
N_SPLITS = 100   # number of random stratified train/test splits
TEST_SIZE = 0.2  # fraction of the samples held out in each split
N_JOBS = 15      # parallel workers used by cross_validate; adjust to the machine

# --- Column groups by measurement level ---------------------------------------
# Plain lists of column names, looked up in df_dtype. Columns whose data_type
# is none of these three values are not passed to any transformer.
nominal_columns = df_dtype.variable_name[(df_dtype.data_type == 'Nominal').values].tolist()
ordinal_columns = df_dtype.variable_name[(df_dtype.data_type == 'Ordinal').values].tolist()
scale_columns = df_dtype.variable_name[(df_dtype.data_type == 'Scale').values].tolist()

# --- Imputation per measurement level -----------------------------------------
# Nominal: fill with the most frequent value, then one-hot encode; categories
# not seen while fitting are ignored at transform time.
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output`;
# switch the keyword when upgrading the environment.
categorical_pipeline = Pipeline(steps=[
    ('categorical_impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
])

# Ordinal: fill with the column median.
ordinal_pipeline = Pipeline(steps=[
    ('ordinal_impute', SimpleImputer(strategy='median')),
])

# Scale: fill with the column mean.
interval_pipeline = Pipeline(steps=[
    ('interval_impute', SimpleImputer(strategy='mean')),
])

imputer = ColumnTransformer(transformers=[
    ('categorical_impute', categorical_pipeline, nominal_columns),
    ('ordinal_impute', ordinal_pipeline, ordinal_columns),
    ('interval_impute', interval_pipeline, scale_columns)
])

# --- Model --------------------------------------------------------------------
# The imputer is part of the pipeline, so it is fitted together with the forest
# on each training split by cross_validate.
pipe_classification = Pipeline(steps=[
    ('imputation', imputer),
    ('random_forest', RandomForestClassifier(n_estimators=500, class_weight='balanced',
                                             random_state=SEED))
])

# --- Evaluation ---------------------------------------------------------------
cv_splitter = StratifiedShuffleSplit(n_splits=N_SPLITS, test_size=TEST_SIZE, random_state=SEED)
scores = cross_validate(estimator=pipe_classification, X=df, y=y,
                        scoring=('balanced_accuracy', 'recall', 'roc_auc'),
                        cv=cv_splitter, n_jobs=N_JOBS, verbose=1)

# Report per-split scores plus mean/SD. Recall is also computed and is
# available as scores['test_recall'], but is not printed here.
balanced_accuracy = scores['test_balanced_accuracy']
roc_auc = scores['test_roc_auc']

print('RandomForest on individual scores with mean/median/mode imputation')
print('balanced accuracy: {}'.format(balanced_accuracy))
print('Mean/SD: {}/{}'.format(balanced_accuracy.mean(), balanced_accuracy.std()))
print()

print('AUC: {}'.format(roc_auc))
print('Mean/SD: {}/{}'.format(roc_auc.mean(), roc_auc.std()))

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:  1.3min


RandomForest on individual scores with mean/median/mode imputation
balanced accuracy: [0.59879935 0.61619862 0.6717033  0.56817257 0.62311762 0.68879731
 0.60373423 0.63583639 0.64626577 0.65262515 0.67114367 0.57916158
 0.59305047 0.66391941 0.63817664 0.63181726 0.60576923 0.63354701
 0.61589337 0.63064713 0.61935287 0.62138787 0.67607855 0.61706349
 0.64860602 0.67200855 0.59620472 0.62484737 0.55428368 0.57885633
 0.67287342 0.6011396  0.62108262 0.59188034 0.63583639 0.63614164
 0.59361009 0.66478429 0.61731787 0.620523   0.61731787 0.58206146
 0.62891738 0.66188441 0.58552096 0.69485144 0.64799552 0.629477
 0.59305047 0.68905169 0.62774725 0.60200448 0.55631868 0.66651404
 0.65638991 0.55082418 0.65638991 0.64713065 0.63731176 0.67663818
 0.66910867 0.6280525  0.6474359  0.61385836 0.63410663 0.64250102
 0.61126374 0.59132072 0.64077127 0.66015466 0.65120065 0.60200448
 0.63095238 0.66101954 0.62632275 0.61126374 0.62311762 0.60546398
 0.66534392 0.63904151 0.61472324 0.59274522 

[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:  1.8min finished
