In [12]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import metrics

import tensorflow as tf

In [2]:
df_train_clin = pd.read_csv("./Data/train_clinical_data.csv")
df_train_pept = pd.read_csv("./Data/train_peptides.csv")
df_train_prot = pd.read_csv("./Data/train_proteins.csv")

In [3]:
# Create Targets

patients = {}
for e in range(1,5):
    for m in [0,6,12,24]:
        df_train_clin[f'updrs_{e}_plus_{m}_months'] = 0

for patient in df_train_clin.patient_id.unique():
    temp = df_train_clin[df_train_clin.patient_id == patient]
    month_list = []
    month_windows = [0,6,12,24]
    for month in temp.visit_month.values:
        month_list.append([month, month + 6, month + 12, month + 24])
    for month in range(len(month_list)):
        for x in range(1,5):
            arr = temp[temp.visit_month.isin(month_list[month])][f'updrs_{x}'].fillna(0).to_list()
            if len(arr) == 4:
                for e, i in enumerate(arr):
                    m = month_list[month][0]
                    temp.loc[temp.visit_month == m,[f'updrs_{x}_plus_{month_windows[e]}_months']] = i
            else:
                temp = temp[~temp.visit_month.isin(month_list[month])]
    patients[patient] = temp

In [4]:
formatted_clin = pd.concat(patients.values(), ignore_index=True).set_index('visit_id').iloc[:,7:]
formatted_clin.head()

Unnamed: 0_level_0,updrs_1_plus_0_months,updrs_1_plus_6_months,updrs_1_plus_12_months,updrs_1_plus_24_months,updrs_2_plus_0_months,updrs_2_plus_6_months,updrs_2_plus_12_months,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
55_0,10,8,10,16,6,10,10,9,15,34,41,49,0,0,0,0
55_6,8,10,7,14,10,10,13,13,34,41,38,49,0,0,0,0
55_12,10,7,16,17,10,13,9,18,41,38,49,51,0,0,0,0
55_18,7,16,14,12,13,9,13,20,38,49,49,41,0,0,0,0
55_24,16,14,17,17,9,13,18,16,49,49,51,52,0,0,0,0


In [5]:
protfeatures = df_train_prot.pivot(index='visit_id', columns='UniProt', values='NPX')
protfeatures.head()

UniProt,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.4
10053_12,10464.2,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,,14408.4,,,28537.0,171733.0,65668.1,,9295.65,25697.8
10053_18,13235.7,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,317477.0,38667.2,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.7
10138_12,12600.2,494581.0,9165.06,27193.5,22506.1,6015.9,156313.0,54546.4,204013.0,56725.0,...,557904.0,44556.9,155619.0,14647.9,36927.7,229232.0,106564.0,26077.7,21441.8,7642.42
10138_24,12003.2,522138.0,4498.51,17189.8,29112.4,2665.15,151169.0,52338.1,240892.0,85767.1,...,,47836.7,177619.0,17061.1,25510.4,176722.0,59471.4,12639.2,15091.4,6168.55


In [6]:
df = protfeatures.merge(formatted_clin, left_index=True,right_index=True,how='right')
print(f'\nNA values: {df[protfeatures.columns].isna().sum().sum()/(len(df)*len(protfeatures.columns)):.2%}')
df['visit_month'] = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).values
df.head()


NA values: 53.64%


Unnamed: 0_level_0,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months,visit_month
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55_0,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,177775.0,62898.2,333376.0,166850.0,...,9,15,34,41,49,0,0,0,0,0
55_6,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,165638.0,62567.5,277833.0,170345.0,...,13,34,41,38,49,0,0,0,0,6
55_12,15257.6,815083.0,41650.9,39763.3,30703.6,4343.6,151073.0,66963.1,332401.0,151194.0,...,18,41,38,49,51,0,0,0,0,12
55_18,,,,,,,,,,,...,20,38,49,49,41,0,0,0,0,18
55_24,,,,,,,,,,,...,16,49,49,51,52,0,0,0,0,24


In [7]:
visit_month_list = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).unique().tolist()
protein_list = protfeatures.columns.to_list()

In [8]:
X = df[protfeatures.columns.to_list() + ["visit_month"]]
y = df[formatted_clin.columns]
print('\nX and y shapes:')
X.shape, y.shape


X and y shapes:


((954, 228), (954, 16))

In [9]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

X.visit_month = X.visit_month.astype('float')
y = y.astype('float')

feature_trans = ColumnTransformer([
    (
        'numerical',
        make_pipeline(IterativeImputer(), StandardScaler()),
        make_column_selector(dtype_include='number')
    ),
])

X_transformed = feature_trans.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.visit_month = X.visit_month.astype('float')


In [13]:
import tensorflow.keras.backend as K

def smape_loss(y_true, y_pred):
    epsilon = 0.1
    numer = K.abs(y_pred - y_true)
    denom = K.maximum(K.abs(y_true) + K.abs(y_pred) + epsilon, 0.5 + epsilon)
    smape = numer / (denom/2)
    smape = tf.where(tf.math.is_nan(smape), tf.zeros_like(smape), smape)
    return smape


def calculate_smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    
    numer = np.round(np.abs(y_pred-y_true),0)
    denom = np.round(np.abs(y_true) + np.abs(y_pred),0)

    return 1/len(y_true) * np.sum(np.nan_to_num(numer / (denom/2))) *100

In [14]:
import wandb
from wandb.keras import WandbCallback

In [15]:
sweep_config = {
    'method': 'random',
    'metric': {
        'name': 'smape_loss',
        'goal': 'minimize'
    },
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 5
    },
    'parameters': {
        'layers': {
            'values': [32, 64, 96, 128, 256]
        },
        'learning_rate': {
            'values': [0.01, 0.005, 0.001, 0.0005, 0.0001]
        },
        'epochs': {
            'values': [100, 200, 300, 400, 500, 600, 700, 800]
        },
        'dropout': {
            'values': [0.2, 0.4, 0.6, 0.8]
        }
    }
}

In [16]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muj_fasci[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
def sweep_train(config_defaults=None):
    config_defaults = {
        'layers': 128,
        'learning_rate': 0.01,
        'dropout': 0.6,
        'epochs': 500
    }

    wandb.config.architecture_name = "DNN"
    wandb.config.dataset_name = "AMP"

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(wandb.config.layers, input_shape=[X.shape[1]], activation=tf.nn.relu),
        tf.keras.layers.Dropout(wandb.config.dropout)
    ])