In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import model_selection
import tensorflow as tf
import tensorflow.keras.backend as K
import warnings
warnings.filterwarnings('ignore')
import os
for dirname, _, filenames in os.walk('kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

kaggle/input\amp-parkinsons-disease-progression-prediction\public_timeseries_testing_util.py
kaggle/input\amp-parkinsons-disease-progression-prediction\supplemental_clinical_data.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\train_clinical_data.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\train_peptides.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\train_proteins.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\amp_pd_peptide\competition.cpython-37m-x86_64-linux-gnu.so
kaggle/input\amp-parkinsons-disease-progression-prediction\amp_pd_peptide\__init__.py
kaggle/input\amp-parkinsons-disease-progression-prediction\example_test_files\sample_submission.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\example_test_files\test.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\example_test_files\test_peptides.csv
kaggle/input\amp-parkinsons-disease-progression-prediction\example_test_files\test_proteins

In [2]:
#Data Analysis
df_train_cli = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
df_train_pep = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
df_train_pro = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")

In [3]:
#Missing Values
print(f'Unique Clinical Patient #: {df_train_cli["patient_id"].nunique()}')
print("--------------------------------------------------------")
print(f'Null Values found in Clinical Data:')
for col in df_train_cli.columns:
    print(f'{col}: {df_train_cli[col].isna().sum()}')
print("--------------------------------------------------------")

Unique Clinical Patient #: 248
--------------------------------------------------------
Null Values found in Clinical Data:
visit_id: 0
patient_id: 0
visit_month: 0
updrs_1: 1
updrs_2: 2
updrs_3: 25
updrs_4: 1038
upd23b_clinical_state_on_medication: 1327
--------------------------------------------------------


In [4]:
df_train_cli.updrs_3 = df_train_cli.updrs_3.interpolate(method='linear', axis=0)

In [5]:
#Target Values Preparation
patients = {}
for e in range(1,5):
    for m in [0,6,12,24]:
        df_train_cli[f'updrs_{e}_plus_{m}_months'] = 0

for patient in df_train_cli.patient_id.unique():
    temp = df_train_cli[df_train_cli.patient_id == patient]
    month_list = []
    month_windows = [0,6,12,24]
    for month in temp.visit_month.values:
        month_list.append([month, month + 6, month + 12, month + 24])
    for month in range(len(month_list)):
        for x in range(1,5):
            if x == 3:
                arr = temp[temp.visit_month.isin(month_list[month])][f'updrs_{x}'].to_list()
            else:
                arr = temp[temp.visit_month.isin(month_list[month])][f'updrs_{x}'].fillna(0).to_list()
            if len(arr) == 4:
                for e, i in enumerate(arr):
                    m = month_list[month][0]
                    temp.loc[temp.visit_month == m, [f'updrs_{x}_plus_{month_windows[e]}_months']] = i
            else:
                temp = temp[~temp.visit_month.isin(month_list[month])]
    patients[patient] = temp

In [6]:
formatted_clin = pd.concat(patients.values(), ignore_index=True).set_index('visit_id').iloc[:,7:]
formatted_clin.head()

Unnamed: 0_level_0,updrs_1_plus_0_months,updrs_1_plus_6_months,updrs_1_plus_12_months,updrs_1_plus_24_months,updrs_2_plus_0_months,updrs_2_plus_6_months,updrs_2_plus_12_months,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
55_0,10,8,10,16,6,10,10,9,15.0,34.0,41.0,49.0,0,0,0,0
55_6,8,10,7,14,10,10,13,13,34.0,41.0,38.0,49.0,0,0,0,0
55_12,10,7,16,17,10,13,9,18,41.0,38.0,49.0,51.0,0,0,0,0
55_18,7,16,14,12,13,9,13,20,38.0,49.0,49.0,41.0,0,0,0,0
55_24,16,14,17,17,9,13,18,16,49.0,49.0,51.0,52.0,0,0,0,0


In [7]:
prot_features = df_train_pro.pivot(index='visit_id', columns='UniProt', values='NPX').fillna(0)

df = prot_features.merge(formatted_clin, left_index=True,right_index=True,how='right')
print(f'\nNA values: {df[prot_features.columns].isna().sum().sum()/(len(df)*len(prot_features.columns)):.2%}')
df['visit_month'] = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).values

visit_month_list = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).unique().tolist()
protein_list = prot_features.columns.to_list()
print(df.dropna().shape)


NA values: 49.69%
(480, 244)


In [8]:
# create a new variables
df_train_pep['UniProt_Peptide'] = df_train_pep['UniProt'] + '_' + df_train_pep['Peptide']
pep_features = df_train_pep.pivot(index='visit_id', columns='UniProt_Peptide', values='PeptideAbundance').fillna(0)


df = pep_features.merge(df, left_index=True,right_index=True,how='right')
print(f'\nNA values: {df[pep_features.columns].isna().sum().sum()/(len(df)*len(pep_features.columns)):.2%}')
df['visit_month'] = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).values

visit_month_list = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).unique().tolist()
peptide_list = pep_features.columns.to_list()


NA values: 49.69%


In [9]:
from sklearn.impute import KNNImputer
import pandas as pd

# Assuming your dataframe is named 'df'
# Create an imputer object
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Fit the imputer to your data and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the result back to a DataFrame (if necessary), 
# as the transform method returns a numpy array
df_imputed = pd.DataFrame(df_imputed, columns=df.columns, index=df.index)

# Round to 1 decimal place
df_imputed = df_imputed.round(1)

df_imputed

Unnamed: 0_level_0,O00391_NEQEQPLGQWHLS,O00533_GNPEPTFSWTK,O00533_IEIPSSVQQVPTIIK,O00533_KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,O00533_SMEQNGPGLEYR,O00533_TLKIENVSYQDKGNYR,O00533_VIAVNEVGR,O00533_VMTPAVYAPYDVK,O00533_VNGSPVDNHPFAGDVVFPR,O00584_ELDLNSVLLK,...,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months,visit_month
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55_0,11254.3,102060.0,174185.0,27278.9,30838.7,23216.5,170878.0,148771.0,55202.1,27229.3,...,9.0,15.0,34.0,41.0,49.0,0.0,0.0,0.0,0.0,0.0
55_6,13163.6,90643.1,147434.0,24320.6,25532.9,21884.6,152910.0,118982.0,48758.2,23305.4,...,13.0,34.0,41.0,38.0,49.0,0.0,0.0,0.0,0.0,6.0
55_12,15257.6,114433.0,194848.0,34090.4,30140.5,29528.6,184855.0,172592.0,54596.9,28367.0,...,18.0,41.0,38.0,49.0,51.0,0.0,0.0,0.0,0.0,12.0
55_18,11389.3,90054.5,156586.0,29460.9,27039.0,22117.8,127038.6,126362.6,50137.6,23684.1,...,20.0,38.0,49.0,49.0,41.0,0.0,0.0,0.0,0.0,18.0
55_24,7673.8,65241.7,112237.8,19907.6,20857.6,16431.4,87047.6,82939.4,29787.5,20587.8,...,16.0,49.0,49.0,51.0,52.0,0.0,0.0,0.0,0.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65043_12,14134.9,126575.0,296011.0,44624.9,44012.3,37695.2,186724.0,180328.0,68679.6,28990.8,...,7.0,14.0,13.0,8.5,5.0,0.0,0.0,0.0,0.0,12.0
65043_18,11988.4,119825.4,226779.8,41469.2,40194.6,29252.1,146679.2,159279.1,61314.7,22948.1,...,10.0,13.0,8.5,4.0,15.0,0.0,0.0,0.0,0.0,18.0
65043_24,14659.5,159675.0,296530.0,56647.4,45766.8,43523.0,201256.0,190398.0,68225.3,29540.2,...,6.0,8.5,4.0,5.0,13.0,0.0,0.0,0.0,0.0,24.0
65043_30,9858.5,95860.3,190500.2,36430.9,29600.2,25436.8,128837.7,132796.0,54811.7,23882.0,...,8.0,4.0,5.0,15.0,11.0,0.0,0.0,0.0,1.0,30.0


In [10]:
df_imputed.isna().sum()

O00391_NEQEQPLGQWHLS                             0
O00533_GNPEPTFSWTK                               0
O00533_IEIPSSVQQVPTIIK                           0
O00533_KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK    0
O00533_SMEQNGPGLEYR                              0
                                                ..
updrs_4_plus_0_months                            0
updrs_4_plus_6_months                            0
updrs_4_plus_12_months                           0
updrs_4_plus_24_months                           0
visit_month                                      0
Length: 1212, dtype: int64

In [11]:
#df.dropna(inplace = True)
#df

In [12]:
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import datetime
!nvidia-smi

Tue May 16 15:59:07 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.41                 Driver Version: 531.41       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 3000               WDDM | 00000000:01:00.0 Off |                  N/A |
| N/A   71C    P8               13W /  N/A|   1340MiB /  6144MiB |      5%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
def smape_loss(y_true, y_pred):
    epsilon = 0.1
    numer = K.abs(y_pred - y_true)
    denom = K.maximum(K.abs(y_true) + K.abs(y_pred) + epsilon, 0.5 + epsilon)
    smape = numer / (denom/2)
    smape = tf.where(tf.math.is_nan(smape), tf.zeros_like(smape), smape)
    return smape


def smape_obj(preds, y_true):
    actuals = y_true
    summ = np.abs(actuals) + np.abs(preds)
    smape = np.where(summ == 0, 0, (np.abs(actuals - preds) / summ)) * 200
    grad = np.where(actuals >= preds, -200 / summ, 200 / summ)
    hess = np.where(summ == 0, 0, 400 / (summ * summ))
    return grad, hess

def calculate_smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    
    numer = np.round(np.abs(y_pred-y_true),0)
    denom = np.round(np.abs(y_true) + np.abs(y_pred),0)

    return 1/len(y_true) * np.sum(np.nan_to_num(numer / (denom/2))) *100

In [14]:
target = formatted_clin.columns.to_list()
X = df_imputed.drop(target, axis = 1)
y = df_imputed[target]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1, random_state=2)

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [300, 500, 900, 1000],
    'learning_rate': [0.1, 0.03, 0.02, 0.3],
    'max_depth': [3, 4, 5, 8],
    'tree_method': ['hist'], # Changed from 'gpu_hist' to 'hist'
}

# Initialize the model
model = XGBRegressor(objective='reg:squarederror')

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

Fitting 3 folds for each of 64 candidates, totalling 192 fits


In [None]:
def create_params(objective, n_estimators, learning_rate, max_depth, tree_method, gpu_id):
    return {
        'objective': objective,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'tree_method': tree_method,
        'gpu_id': gpu_id,
    }

# List of parameter sets
params_list = [
    {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 3},
    {'n_estimators': 500, 'learning_rate': 0.03, 'max_depth': 4},
    {'n_estimators': 900, 'learning_rate': 0.02, 'max_depth': 5},
    {'n_estimators': 1000, 'learning_rate': 0.3, 'max_depth': 8},
]

# Create the full list of parameter dictionaries
params_full_list = []
for _ in range(len(target) // len(params_list)):
    params_full_list.extend([create_params('reg:squarederror', tree_method='gpu_hist', gpu_id=0, **params) for params in params_list])

params_dic = dict(zip(target, params_full_list))

In [None]:
# parametres setting
params_1 = {
    'objective': 'reg:squarederror',
    'n_estimators': 800, 
    'learning_rate': 0.05, 'max_depth': 4,
    'tree_method': 'gpu_hist', 'gpu_id': 0
}

params_2 = {
    'objective': 'reg:squarederror', 
    'n_estimators': 800, 
    'learning_rate': 0.1, 'max_depth': 4,
    'tree_method': 'gpu_hist', 'gpu_id': 0
}

params_3 = {
    'objective': 'reg:squarederror',
    'n_estimators': 900, 
    'learning_rate': 0.05, 'max_depth': 4,
    'tree_method': 'gpu_hist', 'gpu_id': 0
}

params_4 = {
    'objective':'reg:squarederror',
    'n_estimators': 1000, 
    'learning_rate': 0.08, 'max_depth': 4,
    'tree_method': 'gpu_hist', 'gpu_id': 0
}

params_dic = dict(zip(target,
                      [params_1, params_2, params_3, params_4,
                       params_1, params_2, params_3, params_4,
                       params_1, params_2, params_3, params_4,
                       params_1, params_2, params_3, params_4]))

In [None]:
def model_train_operation(X, y, u, params,
                          feature_number=0):

    model = xgb.XGBRegressor(random_state=4, **params)
    selected_X = X.columns.to_list()
    if feature_number != 0:
        selector = SelectKBest(score_func = f_regression, k = feature_number)
        X_new = selector.fit_transform(X, y[u])
        selected_indices = selector.get_support(indices = True)
        selected_X = X.columns[selected_indices]
        selected_X = selected_X.tolist()
        if 'visit_month' not in selected_X:
            selected_X.append('visit_month') 
        X = X[selected_X]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.1, random_state=2)
    
    model.fit(X_train, y_train[u])
    
    y_pred = model.predict(X_test)
#     print(y_pred)
    smape = calculate_smape(y_test[u], y_pred)
    mse = mean_squared_error(y_test[u], y_pred)
    return model, smape, mse, selected_X

In [None]:
df_test_cli = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv")
df_test_pep = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv")
df_test_pro = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv")
df_test_sub = pd.read_csv("kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv")

In [None]:
def data_test_transform(df_clin_test, df_test_pep, df_test_pro):
    df_test_pep['UniProt_Peptide'] = df_test_pep['UniProt'] +'_' +df_test_pep['Peptide']
    # 1. transform test_pep
    pep_test = df_test_pep.pivot_table(index=['visit_id','visit_month'],
                                       columns='UniProt_Peptide',
                                       values='PeptideAbundance').reset_index()
    pep_test = pd.concat([pep_test[['visit_id', 'visit_month']],
                          pep_test.drop(['visit_id', 'visit_month'], axis=1)], axis=1).fillna(0)
    
    # 2. transform test_pro
    pro_test = df_test_pro.pivot_table(index=['visit_id','visit_month'],
                                         columns='UniProt',values='NPX',fill_value=0).reset_index()
    pro_test = pd.concat([pro_test[['visit_id', 'visit_month']],
                          pro_test.drop(['visit_id', 'visit_month'], axis=1)], axis=1).fillna(0)
    df_test = pd.merge(pro_test, pep_test, on=['visit_id', 'visit_month'])
    return df_test

def data_columns_remain(test, train):
    train_copy = train.copy(deep=True)
    common_cols = test.columns.intersection(train_copy.columns)
    test = test[common_cols]
    for col in train_copy.columns:
        if col not in common_cols:
            test[col] = 0
        
    test = test[train_copy.columns]
    return test, train_copy

In [None]:
df_test = data_test_transform(df_test_cli, df_test_pep, df_test_pro)
df_test.set_index('visit_id', inplace=True)
df_test, df_train = data_columns_remain(df_test, X)

In [None]:
from scipy.optimize import minimize
def smape_plus_1(y_true, y_pred):
    y_true_plus_1 = y_true + 1
    y_pred_plus_1 = y_pred + 1
    metric = np.zeros(len(y_true_plus_1))
    
    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = ((np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2)
    
    mask_not_zeros = (y_true_plus_1 != 0) | (y_pred_plus_1 != 0)
    metric[mask_not_zeros] = numerator[mask_not_zeros] / denominator[mask_not_zeros]
    
    return 100 * np.nanmean(metric)

train_clinical_data = pd.read_csv('kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
train_clinical_data['source'] = 'standard'

supplemental_clinical_data = pd.read_csv('kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
supplemental_clinical_data['source'] = 'supplemental'

train_clinical_all = pd.concat([train_clinical_data, supplemental_clinical_data])
train_clinical_all = train_clinical_all[~train_clinical_all.visit_month.isin([3, 5, 9])]

train_clinical_all['pred_month'] = train_clinical_all['visit_month']

for plus_month in [6, 12, 24]:
    train_shift = train_clinical_all[['patient_id', 'visit_month', 'pred_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']].copy()
    train_shift['visit_month'] -= plus_month
    train_shift.rename(columns={f'updrs_{i}': f'updrs_{i}_plus_{plus_month}' for i in range(1, 5)}, inplace=True)
    train_shift.rename(columns={'pred_month': f'pred_month_plus_{plus_month}'}, inplace=True)
    train_clinical_all = train_clinical_all.merge(train_shift, how='left', on=['patient_id', 'visit_month'])

train_clinical_all.rename(columns={f'updrs_{i}': f'updrs_{i}_plus_0' for i in range(1, 5)}, inplace=True)
train_clinical_all.rename(columns={'pred_month': f'pred_month_plus_0'}, inplace=True)


def calculate_predicitons(pred_month, trend):
    return np.round(trend[0] + pred_month * trend[1])

def function_to_minimize(x):    
    metric = smape_plus_1(
        y_true=y_true_array, 
        y_pred=calculate_predicitons(
            pred_month=pred_month_array,
            trend=x
        )
    )
    return metric

target_to_trend = {}
for i in range(1, 5):
    target_i = f'updrs_{i}'
    columns_with_target = [f'{target_i}_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    columns_with_pred_month = [f'pred_month_plus_{plus_month}' for plus_month in [0, 6, 12, 24]]
    y_true_array = train_clinical_all[columns_with_target].values.ravel()
    pred_month_array = train_clinical_all[columns_with_pred_month].values.ravel()
    trend = list(minimize(
        fun=function_to_minimize,
        x0=[0, 0.0048],
        method='Powell'
    ).x)
    target_to_trend[target_i] = trend

In [None]:
models_dict = {}
mse_scores = {}
feature_dict = dict(zip(target, [x for x in range(len(target))]))


feature_number_dict = dict(zip(target, [600, 700, 700, 650,
                                        600, 850, 700, 470,
                                        600, 0, 700, 460,
                                        100, 150, 100, 150]))

In [None]:
df_test = data_test_transform(df_test_cli, df_test_pep, df_test_pro)
df_test.set_index('visit_id', inplace=True)
df_test, df_train = data_columns_remain(df_test, X)
mse_score = {}
mse_score_test = {}
smape_score = {}
smape_score_test= {}
train_target = target[8:12] # 
for i,u in enumerate(train_target):
    
     params = params_dic[u]
     model_1, smape, mse, select_x = model_train_operation(df_train, y, u, params)
     mse_score[u] = mse
     smape_score[u] = smape

     feature_number = feature_number_dict[u]
     model_2, smape_test, mse_test, select_X = model_train_operation(df_train, y, u, params,
                                                                     feature_number = feature_number)
     mse_score_test[u] = mse_test
     smape_score_test[u] = smape_test
     print(f'--||{u}||, \n mse: ||{mse_score[u]: 2f}||vs||{mse_test:2f}||,\n smape: ||{smape_score[u]: 2f}||vs||{smape_test:2f}|| \n feature number {len(select_X)}----------')
        


In [None]:
print(np.mean(list(smape_score.values())))
print(np.mean(list(smape_score_test.values())))

In [None]:
import amp_pd_peptide
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test() 

for iteration, (df_test_cli, df_test_pep, df_test_pro, sample_submission) in enumerate(iter_test):
    
            
    sample_submission['patient_id'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[0]))
    sample_submission['visit_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[1]))
    sample_submission['target_name'] = sample_submission['prediction_id'].map(lambda x: 'updrs_' + x.split('_')[3])
    sample_submission['plus_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[5]))
    sample_submission['pred_month'] = sample_submission['visit_month'] + sample_submission['plus_month']
    
    for i in range(1, 5):
        target_i = f'updrs_{i}'
        mask_target = sample_submission['target_name'] == target_i
        sample_submission.loc[mask_target, 'rating'] = calculate_predicitons(
            pred_month=sample_submission.loc[mask_target, 'pred_month'],
            trend=target_to_trend[target_i]
        )
    
    
    df_test = data_test_transform(df_test_cli, df_test_pep, df_test_pro)
    df_test.set_index('visit_id', inplace=True)
    df_test, df_train = data_columns_remain(df_test, X)
    test_target = target[1:2] + target[8:11] 
    for i,u in enumerate(test_target):
        
        params = params_dic[u]
        feature_number = feature_number_dict[u]
        model, smape_test, mse_test, select_X = model_train_operation(df_train, y, u, params,
                                                         feature_number = feature_number)
        for j, visit_id in enumerate(df_test.index):

            y_pred = model.predict(df_test[select_X][df_test.index==visit_id])
            prediction_id = visit_id+'_'+u
            rating = max(y_pred[0],0)
            print(f'prediction_id: {prediction_id}, rating: {rating}')
            if j%2 ==0:
                sample_submission['rating'][sample_submission['prediction_id']==prediction_id] = 0.7* sample_submission['rating'][sample_submission['prediction_id']==prediction_id] + 0.3 * float(rating)
            else:
                sample_submission['rating'][sample_submission['prediction_id']==prediction_id] = 0.3* sample_submission['rating'][sample_submission['prediction_id']==prediction_id] + 0.7 * float(rating)
    env.predict(sample_submission[['prediction_id', 'rating']])

In [None]:
'''
Best Params for Model:
# List of parameter sets
params_list = [
    {'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 3},
    {'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4},
    {'n_estimators': 800, 'learning_rate': 0.05, 'max_depth': 5},
    {'n_estimators': 1500, 'learning_rate': 0.2, 'max_depth': 6},
]

benchmark results for comparison:
--||updrs_3_plus_0_months||, 
 mse: || 95.949662||vs||101.588181||,
 smape: || 47.592463||vs||49.326212|| 
 feature number 600----------
--||updrs_3_plus_6_months||, 
 mse: || 82.085828||vs||82.085828||,
 smape: || 39.656596||vs||39.656596|| 
 feature number 1196----------
--||updrs_3_plus_12_months||, 
 mse: || 91.589313||vs||92.995778||,
 smape: || 46.314771||vs||47.608374|| 
 feature number 700----------
--||updrs_3_plus_24_months||, 
 mse: || 128.184774||vs||131.270712||,
 smape: || 47.902624||vs||52.357836|| 
 feature number 460----------

 SMAPE scores:
 45.36661340169451
 47.23725456155037

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
prediction_id: 50423_0_updrs_1_plus_6_months, rating: 8.102071762084961
prediction_id: 50423_0_updrs_3_plus_0_months, rating: 17.76900291442871
prediction_id: 50423_0_updrs_3_plus_6_months, rating: 25.231693267822266
prediction_id: 50423_0_updrs_3_plus_12_months, rating: 26.493465423583984
prediction_id: 3342_6_updrs_1_plus_6_months, rating: 6.029064178466797
prediction_id: 3342_6_updrs_3_plus_0_months, rating: 23.474700927734375
prediction_id: 3342_6_updrs_3_plus_6_months, rating: 21.904701232910156
prediction_id: 3342_6_updrs_3_plus_12_months, rating: 23.7000675201416

'''