In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

from keras.models import Sequential
from keras.layers import Dense, LSTM

import missingno as msno

# Intialize

In [3]:
train_peptides = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/train_peptides.csv')
train_proteins = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/train_proteins.csv')
train_clinical_data = pd.read_excel('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/train_clinical_data.xlsx')

In [4]:
# Function to prepare dataset with all the steps mentioned above:
def prepare_dataset(train_proteins, train_peptides):
    # Step 1: Grouping 
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].mean().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].mean().reset_index()
    
    # Step 2: Pivoting
    df_protein = df_protein_grouped.pivot(index='visit_id',columns = 'UniProt', values = 'NPX').rename_axis(columns=None).reset_index()
    df_peptide = df_peptide_grouped.pivot(index='visit_id',columns = 'Peptide', values = 'PeptideAbundance').rename_axis(columns=None).reset_index()
    
    # Step 3: Merging
    pro_pep_df = df_protein.merge(df_peptide, on = ['visit_id'], how = 'left')
    
    return pro_pep_df

In [5]:
pro_pep_df = prepare_dataset(train_proteins, train_peptides)

In [7]:
pro_pep_df

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.30
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,289888.0,8615.27,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,185428.0,5554.53,,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,137611.0,6310.09,,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30


In [8]:
merged_df = pd.merge(train_clinical_data, pro_pep_df, on=['visit_id'])
merged_df

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55_36,55,36,17.0,18.0,51.0,0.0,On,13530.8,753832.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942_6,942,6,8.0,2.0,21.0,,,11218.7,399518.0,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,64674_84,64674,84,11.0,15.0,45.0,4.0,Off,,190487.0,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1064,65043_0,65043,0,2.0,6.0,16.0,,,13472.4,927954.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1065,65043_12,65043,12,4.0,7.0,14.0,0.0,Off,14134.9,984651.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1066,65043_24,65043,24,4.0,8.0,,0.0,,14659.5,1062020.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [6]:


# Get protein and peptide features & fill with mean
pro_pep = pro_pep_df.columns.tolist()[1:]
for i in pro_pep:
  merged_df[i] = merged_df.groupby('patient_id')[i].transform(lambda x: x.fillna(x.mean()))

In [None]:
merged_df['upd23b_clinical_state_on_medication'] = merged_df['upd23b_clinical_state_on_medication'].fillna('off')

# Filling UPDRS with Mean - Patient Specific
merged_df['updrs_4'] = merged_df.groupby('patient_id')['updrs_4'].transform(lambda x: x.fillna(x.mean()))
merged_df['updrs_3'] = merged_df.groupby('patient_id')['updrs_3'].transform(lambda x: x.fillna(x.mean()))

In [None]:
# Fill remaining with mean

merged_df.fillna(merged_df.mean(), inplace=True)

  merged_df.fillna(merged_df.mean(), inplace=True)


In [None]:
pd.set_option('display.max_rows', None)

missing_values = merged_df.isna().mean()*100
print(missing_values.sort_values(ascending=False))

visit_id                                                                                   0.0
MASGAANVVGPK                                                                               0.0
M(UniMod_35)YLGYEYVTAIR                                                                    0.0
M(UniMod_35)VQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR                      0.0
M(UniMod_35)TVTDQVNC(UniMod_4)PK                                                           0.0
M(UniMod_35)LTPEHVFIHPGWK                                                                  0.0
M(UniMod_35)ELERPGGNEITR                                                                   0.0
M(UniMod_35)C(UniMod_4)PQLQQYEMHGPEGLR                                                     0.0
M(UniMod_35)ASGAANVVGPK                                                                    0.0
M(UniMod_35)ADEAGSEADHEGTHSTKR                                                             0.0
LYQQHGAGLFDVTR                                    

# After Cleaning

In [None]:
data = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/cleaned_data.csv')

In [None]:
data = data.drop(columns=['visit_id','patient_id'], axis=1)

In [None]:
le = LabelEncoder()
data['visit_month'] = le.fit_transform(data['visit_month'])
data['upd23b_clinical_state_on_medication'] = le.fit_transform(data['upd23b_clinical_state_on_medication'])
data

Unnamed: 0,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,O00584,O14498,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,0,10.0,6.0,15.000000,0.0,2,11254.300000,732430.0,39585.8,41526.9,...,365475.0,35528.00,97005.600000,23122.5,60912.6,408698.0,70369.35,29758.800000,23833.7,18953.5
1,2,8.0,10.0,34.000000,0.0,2,13163.600000,630465.0,35220.8,41295.0,...,405676.0,30332.60,109174.000000,23499.8,51655.8,369870.0,70369.35,22935.200000,17722.5,16642.7
2,3,10.0,10.0,41.000000,0.0,1,15257.600000,815083.0,41650.9,39763.3,...,303953.0,43026.20,114921.000000,21860.1,61598.2,318553.0,65762.60,29193.400000,28536.1,19290.9
3,7,17.0,18.0,51.000000,0.0,1,13530.800000,753832.0,43048.9,43503.6,...,303597.0,48188.40,109794.000000,23930.6,70223.5,377550.0,74976.10,31732.600000,22186.5,21717.1
4,2,8.0,2.0,21.000000,0.0,2,11218.700000,399518.0,20581.0,31290.9,...,253373.0,27431.80,93796.700000,17450.9,21299.1,306621.0,82335.50,24018.700000,18939.5,15251.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,12,11.0,15.0,45.000000,4.0,0,11434.196118,190487.0,24907.9,18543.1,...,260021.0,7139.93,104277.000000,10500.0,21944.2,136725.0,62217.50,15135.233333,10287.7,13848.2
1064,0,2.0,6.0,16.000000,0.0,2,13472.400000,927954.0,42661.5,43663.2,...,186414.0,25897.80,120462.333333,21480.7,57364.0,416142.0,37584.60,19411.811427,28346.5,35617.5
1065,3,4.0,7.0,14.000000,0.0,0,14134.900000,984651.0,28990.8,42440.9,...,301343.0,22343.40,105626.000000,20500.8,54011.2,380072.0,40588.90,19411.811427,17035.7,37064.2
1066,5,4.0,8.0,14.333333,0.0,2,14659.500000,1062020.0,46440.4,38293.0,...,300439.0,52143.60,139291.000000,19449.2,66569.9,300948.0,36150.40,19411.811427,21286.3,39587.9


In [None]:
X = data.drop(columns =  ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'] , axis=1)
y = data[['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((854, 228), (214, 228))

In [None]:
# Nomalize the data

scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
def smape(y_true, y_pred):
    """
    Calculate the symmetric mean absolute percentage error (SMAPE) for two 1D arrays.
    """
    diff = np.abs(y_true - y_pred)
    denom = np.abs(y_true) + np.abs(y_pred)
    return 100 / len(y_true) * np.sum(2 * diff / denom)


In [None]:
test = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test.csv')
test_peptides = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test_peptides.csv')
test_proteins = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test_proteins.csv')
sample_submission = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/sample_submission.csv')

# Sample Model & Submission

In [None]:

y_pred = pd.DataFrame()
gbr_smape = {}

for i, col in enumerate(y_train.columns):
  gbr = GradientBoostingRegressor()
  gbr.fit(X_train, y_train[col])
  y_pred[col] = gbr.predict(X_test)
  gbr_smape[col] = smape(y_test[col], y_pred[col])

gbr_smape

{'updrs_1': 12.531666864031187,
 'updrs_2': 21.675981515140656,
 'updrs_3': 19.284581294196553,
 'updrs_4': 28.100953605444566}

In [None]:
# Prepare test data

test_pivoted = test_proteins.pivot(index=['patient_id', 'visit_month'], columns='UniProt', values='NPX')
test_pivoted = test_pivoted.reset_index()
# test_merged = pd.merge(test, test_pivoted, on=['patient_id', 'visit_month'])
test_pivoted = test_pivoted[X_train.columns.tolist()]
test_pivoted.fillna(test_pivoted.mean(), inplace=True)

KeyError: ignored

In [None]:
len(X_train.columns

228

In [None]:
sample_submission

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,0,6
60,50423_6_updrs_4_plus_0_months,0,6
61,50423_6_updrs_4_plus_6_months,0,6
62,50423_6_updrs_4_plus_12_months,0,6


In [None]:
def smape(actual, predicted):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE) between two multi-dimensional arrays.
    """
    dividend = np.abs(actual - predicted)
    denominator = np.abs(actual) + np.abs(predicted)
    smape_array = np.divide(dividend, denominator, out=np.zeros_like(dividend), where=denominator != 0)
    smape_sum = np.sum(smape_array)
    smape_score = (2 * smape_sum) / (actual.shape[0] * actual.shape[1]) * 100
    return smape_score


In [None]:
gbr = GradientBoostingRegressor()
mor = MultiOutputRegressor(gbr)

# train the model
mor.fit(X_train, y_train)

y_pred = mor.predict(X_test)
y_pred_train = mor.predict(X_train)

print(smape(y_test, y_pred))
print(smape(y_train, y_pred_train))

updrs_1    15.661052
updrs_2    24.056019
updrs_3    21.056475
updrs_4    34.800388
dtype: float64
updrs_1    10.354230
updrs_2    17.759130
updrs_3    14.607174
updrs_4    32.287237
dtype: float64


In [None]:
rfr = RandomForestRegressor()
mor = MultiOutputRegressor(rfr)

# train the model
mor.fit(X_train, y_train)

y_pred = mor.predict(X_test)
y_pred_train = mor.predict(X_train)

print(smape(y_test, y_pred))
print(smape(y_train, y_pred_train))

updrs_1    16.069005
updrs_2    23.892785
updrs_3    20.935020
updrs_4    34.243214
dtype: float64
updrs_1     8.555159
updrs_2    16.263610
updrs_3    13.197641
updrs_4    30.541849
dtype: float64


In [None]:
sdr = SGDRegressor(random_state=42)
mor = MultiOutputRegressor(sdr)

# train the model
mor.fit(X_train, y_train)

y_pred = mor.predict(X_test)
y_pred_train = mor.predict(X_train)

print(smape(y_test, y_pred))
print(smape(y_train, y_pred_train))

updrs_1    16.640629
updrs_2    24.339941
updrs_3    21.549462
updrs_4    36.819811
dtype: float64
updrs_1    15.268031
updrs_2    22.167355
updrs_3    18.971935
updrs_4    36.868467
dtype: float64


In [None]:
# Define the hyperparameters to search over for each model
param_grid_gbr = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.05, 0.01]
}

param_grid_rfr = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 8]
}

param_grid_sgd = {
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [1000, 5000, 10000],
    'penalty': ['l1', 'l2', 'elasticnet']
}

# Define the models to be used
gbr = GradientBoostingRegressor(random_state=0)
rfr = RandomForestRegressor(random_state=0)
sgd = SGDRegressor(random_state=0)

# Create a list of dictionaries for each model to iterate over
models = [
    {'name': 'GBR', 'model': gbr, 'param_grid': param_grid_gbr},
    {'name': 'RFR', 'model': rfr, 'param_grid': param_grid_rfr},
    {'name': 'SGD', 'model': sgd, 'param_grid': param_grid_sgd}
]

# Iterate over each UPDRS score and each model to perform hyperparameter tuning
for i in range(4):
    y_train_i = y_train.iloc[:, i].ravel()
    y_test_i = y_test.iloc[:, i].ravel()
    for model in models:
        mor = model['model']
        param_grid = model['param_grid']

        # Define the grid search with mean squared error as the scoring metric
        gs = GridSearchCV(mor, param_grid, scoring='neg_mean_squared_error', cv=5)

        # Fit the grid search to the training data
        gs.fit(X_train, y_train_i)

        # Get the best estimator from the grid search
        best_estimator = gs.best_estimator_

        # Train the model with the best hyperparameters
        best_estimator.fit(X_train, y_train_i)

        # Make predictions on the test and training data
        y_pred = best_estimator.predict(X_test)
        y_pred_train = best_estimator.predict(X_train)

        # Print the results
        print('UPDRS', i+1, model['name'])
        print('Best hyperparameters:', gs.best_params_)
        print('Test MSE:', mean_squared_error(y_test_i, y_pred))
        print('Train MSE:', mean_squared_error(y_train_i, y_pred_train))
        print('\n-----------')


NameError: ignored

In [None]:
# Kaggle Submission
# Feature Importance -> feature selection
# Include peptide data
# Use deep learning -> 1D - Sequential, CNN
# ARIMA, RNN and LSTM

In [None]:
# Test data

test = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test.csv')
test_peptides = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test_peptides.csv')
test_proteins = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/test_proteins.csv')
sample_submission = pd.read_csv('/content/drive/Shareddrives/ML@SJSU Members/Hands-on 2023/Parkinsons Disease/Data/sample_submission.csv')

In [None]:
test_proteins

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,group_key
0,50423_0,0,50423,O00391,33127.90,0
1,50423_0,0,50423,O00533,490742.00,0
2,50423_0,0,50423,O00584,43615.30,0
3,50423_0,0,50423,O14773,16486.60,0
4,50423_0,0,50423,O14791,2882.42,0
...,...,...,...,...,...,...
448,3342_6,6,3342,Q9UHG2,325226.00,6
449,3342_6,6,3342,Q9UKV8,64411.50,6
450,3342_6,6,3342,Q9UNU6,25117.50,6
451,3342_6,6,3342,Q9Y646,51473.30,6


In [None]:
sample_submission

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,0,6
60,50423_6_updrs_4_plus_0_months,0,6
61,50423_6_updrs_4_plus_6_months,0,6
62,50423_6_updrs_4_plus_12_months,0,6
