## Competition Metric
### *This function calculates SMAPE, the metric used to score our predictions in this competition*

In [13]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0 instead of dividing by
    zero. The result is 100 times the mean of those contributions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Lists, tuples, pandas Series and numpy
        arrays are all accepted; they are converted to float arrays first, so
        two plain lists no longer fail on the subtraction.

    Returns
    -------
    float
        SMAPE in percent; ``nan`` when the inputs are empty.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    if num.size == 0:
        # The mean of nothing is undefined; return nan without a numpy warning.
        return float('nan')

    # Start from zeros so that positions with a zero denominator stay at 0.
    smap = np.zeros(num.shape)
    pos_ind = dem != 0
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

# Data Exploration
#### There are some NaN values in our target columns. NaN values would break the SMAPE function, so they are dropped when searching for the best constant estimate.

In [14]:
import pandas as pd
import numpy as np

# Load the clinical training data. To experiment with the supplemental data
# instead, swap the file name for 'supplemental_clinical_data.csv'.
train = pd.read_csv(
    '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv'
)


In [15]:
# Count the missing values in each column of the training frame.
train.isna().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                   1
updrs_2                                   2
updrs_3                                  25
updrs_4                                1038
upd23b_clinical_state_on_medication    1327
dtype: int64

In [16]:
# Preview the first rows of the training frame as loaded.
train.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


### This loop finds, for every visit month and every target variable, the constant estimate that gives the lowest (best) SMAPE on the training set. Months with fewer than 100 observations fall back to the median.

In [17]:
# Encode the medication state numerically: 'On' -> 1, 'Off' -> 2, missing -> 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which pandas deprecates and which leaves `train` unchanged under Copy-on-Write.
train['upd23b_clinical_state_on_medication'] = (
    train['upd23b_clinical_state_on_medication']
    .replace({'On': 1, 'Off': 2})
    .fillna(0)
)

In [18]:
# Preview the first rows again to check the encoded medication-state column.
train.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,0.0
1,55_3,55,3,10.0,7.0,25.0,,0.0
2,55_6,55,6,8.0,10.0,34.0,,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0


In [19]:
# Build `estimates`: a dict mapping (visit_month, target) -> constant prediction.
estimates = {}
months = train.visit_month.unique()
targets = ['updrs_1', 'updrs_2', 'updrs_3']

# Constant predictions tried for each (month, target) group.
candidates = np.arange(0, 30, 0.1)

for m in months:
    month_rows = train[train.visit_month == m]
    for target in targets:
        t = month_rows[target].dropna().values
        if len(t) == 0:
            # Nothing observed for this group; leave it unset so the
            # forward fill below can borrow the previous month's estimate
            # instead of storing a NaN median.
            continue
        if len(t) >= 100:
            # Grid search: score every candidate constant and keep the first
            # one with the lowest SMAPE. Taking the true minimum (rather than
            # only accepting scores below 100) matters because SMAPE can reach
            # 200, so a group where every candidate scores >= 100 still gets
            # its best candidate.
            scores = [smape(t, np.full(len(t), c)) for c in candidates]
            best_threshold = candidates[int(np.argmin(scores))]
        else:
            # Too few samples for the search; use the median instead.
            best_threshold = np.median(t)
        if target == 'updrs_2':
            # NOTE(review): fixed +5 offset applied to every updrs_2 estimate;
            # the rationale is not visible in this notebook -- confirm.
            best_threshold += 5
        estimates[(m, target)] = best_threshold

# Forward fill: any month from 0 up to the last observed month that has no
# estimate inherits the most recent earlier estimate for the same target.
# Months before the first available estimate are left unset rather than
# raising a KeyError.
last_month = int(max(months))
for target in targets:
    last_known = None
    for month in range(last_month + 1):
        if (month, target) in estimates:
            last_known = estimates[(month, target)]
        elif last_known is not None:
            estimates[(month, target)] = last_known

### This cell scores the estimates on the training data; the result should approximate the score on the test data.

In [20]:
# Collect one ((visit_month, target), observed value) pair for every non-missing
# target value, visiting rows and targets in the same order as a nested loop
# would. The `>= 0` comparison is False for NaN, so missing ratings are skipped.
# Comprehension variables stay local, so the builtin `id` is not shadowed.
scored_pairs = [
    ((visit.visit_month, name), visit[name])
    for _, visit in train.iterrows()
    for name in targets
    if visit[name] >= 0
]
validation_x = [key for key, _ in scored_pairs]
validation_y = [value for _, value in scored_pairs]

# Look up the constant estimate for each key and score it against the truth.
smape(validation_y, pd.Series(validation_x).map(estimates).values)

76.15783680983034

In [21]:
#76.36180750178882-- 70
#75.51539629900658-- 71
#74.64358918477852-- 70
#80.02508514819662-- 103

# Applying Optimal Value Estimates to Test Data

import amp_pd_peptide
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# The API will deliver four dataframes in this specific order:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    # Build the (month, target) lookup key for each line in sample_submission.
    # After splitting prediction_id on '_', fields 1 and 5 are summed to get the
    # month and fields 2-3 are joined to get the target name.
    # NOTE(review): presumably the id looks like
    # '<patient>_<visit_month>_updrs_<n>_plus_<k>_months' -- confirm.
    # A separate name is used so the global `targets` list is not overwritten.
    id_parts = sample_submission.prediction_id.str.split('_')
    prediction_keys = id_parts.apply(
        lambda x: (int(x[1]) + int(x[5]), '_'.join(x[2:4]))
    )

    # Keys with no stored estimate map to NaN and are predicted as 0.
    sample_submission['rating'] = prediction_keys.map(estimates).fillna(0)

    # Saves predictions to csv file
    env.predict(sample_submission)

# Read the written submission back and display it as a final check.
submission = pd.read_csv('/kaggle/working/submission.csv')
submission