In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [10]:
# Folder holding the input CSV files. The commented-out line is the Kaggle
# input location; the active value points at a local copy.
# data_dir = "/kaggle/input/amp-parkinsons-disease-progression-prediction"
data_dir = './data'

# Read the four tables used by the rest of the notebook.
train_proteins = pd.read_csv(f'{data_dir}/train_proteins.csv')
train_peptides = pd.read_csv(f'{data_dir}/train_peptides.csv')
train_clinical = pd.read_csv(f'{data_dir}/train_clinical_data.csv')
supplemental = pd.read_csv(f'{data_dir}/supplemental_clinical_data.csv')

# Sanity check: print the (rows, columns) shape of each table, in load order.
print(*(frame.shape for frame in (train_proteins, train_peptides, train_clinical, supplemental)))

(232741, 5) (981834, 6) (2615, 8) (2223, 8)


In [3]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Elements where both the true and the predicted value are zero contribute 0
    instead of triggering a division by zero. The result is 100 times the mean
    of the per-element contributions.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, tuple, NumPy array, or pandas Series).
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        SMAPE in percent. NaN values in the inputs propagate to the result,
        and an empty input yields NaN (the mean of an empty array).

    Notes
    -----
    Both inputs are converted to float NumPy arrays first, so plain Python
    lists work and values are always compared positionally (pandas index
    labels are ignored).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Start from zeros and divide only where the denominator is non-zero, so
    # the 0/0 case keeps its zero contribution.
    smap = np.zeros_like(num)
    np.divide(num, dem, out=smap, where=dem != 0)

    return 100 * np.mean(smap)

# def smape(A, F):
#     return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [6]:
# Preview the first rows of the supplemental clinical table.
supplemental.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,On
4,155_0,155,0,,,0.0,,


In [13]:
# Stack the main clinical visits and the supplemental visits into a single
# training frame. ignore_index=True gives the result a fresh 0..n-1 index;
# without it the row labels of both inputs are kept and repeat, which makes
# any later label-based access (.loc, index alignment) ambiguous.
train_df = pd.concat([train_clinical, supplemental], ignore_index=True)

# Replace every missing UPDRS score with the median of its column, computed
# over the combined frame.
for i in range(1, 5):
    col = f'updrs_{i}'
    train_df[col] = train_df[col].fillna(train_df[col].median())

train_df.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,0.0,
1,55_3,55,3,10.0,7.0,25.0,0.0,
2,55_6,55,6,8.0,10.0,34.0,0.0,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [33]:
# Fit one constant prediction per (visit_month, target) by grid search on
# SMAPE, forward-fill months that have no visits, then score the resulting
# lookup table on the full training frame.
train = train_df
targets = ['updrs_1','updrs_2', 'updrs_3', 'updrs_4']
months = train.visit_month.unique()

# Bookkeeping for a sweep over `test_list`. No such sweep exists in this cell:
# the final lines previously assigned `best_test_val = j`, but `j` is never
# defined, so the cell raised NameError on a fresh kernel. The names are kept
# defined in case later cells read them; `best_test_val` keeps its sentinel.
# NOTE(review): if the sweep is reinstated, wrap the code below in
# `for j in test_list:` and set `best_test_val = j` when the score improves.
test_list = [200, 150, 100, 50, 10, 5, 1, 0] #Intentionally leaving the best for last
best_test_val = 1000
best_test_score = 1000

# Constant prediction values tried for every (month, target) pair.
candidates = np.arange(0, 50, 0.1)

estimates = {}
for m in months:
    # Filter the rows of this month once instead of once per target.
    month_rows = train[train.visit_month == m]
    for target in targets:
        # All labels recorded for this month / target.
        t = month_rows[target].dropna().values

        # SMAPE of predicting each candidate constant for every label.
        scores = [smape(t, np.full(len(t), c)) for c in candidates]

        # Keep the candidate with the lowest SMAPE; argmin returns the first
        # minimum, so ties resolve to the smallest candidate.
        estimates[(m, target)] = candidates[int(np.argmin(scores))]

# Fill in gaps in the estimates: a month with no visits reuses the estimate
# of the previous month. This relies on month 0 being present in `months`
# (otherwise the lookup of month -1 raises KeyError).
for i in range(sorted(months)[-1] + 1):
    for target in targets:
        if (i, target) not in estimates:
            estimates[(i, target)] = estimates[(i - 1, target)]

validation_x = []
validation_y = []

# Collect every non-negative label together with its (month, target) key,
# in row order. NaN labels fail the `>= 0` comparison and are skipped.
for month, *values in train[['visit_month', *targets]].itertuples(index=False, name=None):
    for target, value in zip(targets, values):
        if value >= 0:
            validation_x.append((month, target))
            validation_y.append(value)

# Score the lookup table on the full dataset.
predictions = np.array([estimates[key] for key in validation_x])
temp_score = smape(np.array(validation_y), predictions)
if temp_score <= best_test_score:
    best_test_score = temp_score