Goal of the Competition

The goal of this competition is to predict MDS-UPDRS scores, which measure progression in patients with Parkinson's disease. The Movement Disorder Society-Sponsored Revision of the Unified Parkinson's Disease Rating Scale (MDS-UPDRS) is a comprehensive assessment of both motor and non-motor symptoms associated with Parkinson's. You will develop a model trained on data of protein and peptide levels over time in subjects with Parkinson's disease versus normal age-matched control subjects.

Your work could help provide important breakthrough information about which molecules change as Parkinson’s disease progresses.

In [6]:
# Notebook-wide imports.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/numpy -- consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


In [18]:
# Load the clinical training data (UPDRS scores keyed by visit_id / patient_id / visit_month).
# input_dir is resolved as <parent of the current working directory>/KAGGLE-AMP-...,
# so the notebook has to be started from that folder or from one of its siblings.
input_dir = os.path.join(os.path.dirname(os.getcwd()), 'KAGGLE-AMP-PARKINSONS-PROGRESSION-PREDICTION')
# Directory used for intermediate CSV files produced by this notebook.
temp_data_dir = os.path.join(input_dir, 'temp_data')
print(input_dir)
train_clinical = pd.read_csv(os.path.join(input_dir, 'train_clinical_data.csv'))
# Only the last expression of a cell is displayed, so a bare `.shape` in the
# middle of the cell is silently discarded -- print it explicitly.
print(train_clinical.shape)
train_clinical.info()
train_clinical.head()


/Users/administrator/Documents/Parkinsons/KAGGLE-AMP-PARKINSONS-PROGRESSION-PREDICTION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2615 entries, 0 to 2614
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             2615 non-null   object 
 1   patient_id                           2615 non-null   int64  
 2   visit_month                          2615 non-null   int64  
 3   updrs_1                              2614 non-null   float64
 4   updrs_2                              2613 non-null   float64
 5   updrs_3                              2590 non-null   float64
 6   updrs_4                              1577 non-null   float64
 7   upd23b_clinical_state_on_medication  1288 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 163.6+ KB


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [19]:
# Load the peptide-level abundance table from input_dir.
train_peptides = pd.read_csv(os.path.join(input_dir, 'train_peptides.csv'))
# Only the last expression of a cell is displayed, so a bare `.shape` in the
# middle of the cell is silently discarded -- print it explicitly.
print(train_peptides.shape)
train_peptides.info()
train_peptides.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981834 entries, 0 to 981833
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   visit_id          981834 non-null  object 
 1   visit_month       981834 non-null  int64  
 2   patient_id        981834 non-null  int64  
 3   UniProt           981834 non-null  object 
 4   Peptide           981834 non-null  object 
 5   PeptideAbundance  981834 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 44.9+ MB


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [20]:
# Load the protein-level expression table (NPX per visit and UniProt ID) from input_dir.
train_proteins = pd.read_csv(os.path.join(input_dir, 'train_proteins.csv'))
# Only the last expression of a cell is displayed, so a bare `.shape` in the
# middle of the cell is silently discarded -- print it explicitly.
print(train_proteins.shape)
train_proteins.info()
train_proteins.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232741 entries, 0 to 232740
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   visit_id     232741 non-null  object 
 1   visit_month  232741 non-null  int64  
 2   patient_id   232741 non-null  int64  
 3   UniProt      232741 non-null  object 
 4   NPX          232741 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 8.9+ MB


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [None]:
# Load the supplemental clinical data file from input_dir.
train_supplemental = pd.read_csv(os.path.join(input_dir, 'supplemental_clinical_data.csv'))
# Only the last expression of a cell is displayed, so a bare `.shape` in the
# middle of the cell is silently discarded -- print it explicitly.
print(train_supplemental.shape)
train_supplemental.info()
train_supplemental.head()

In [None]:
# Merge the protein and peptide tables on patient, visit, visit month and UniProt ID.
# pd.merge defaults to an inner join, so only key combinations present in
# both tables are kept.
train_proteins_peptides = pd.merge(train_proteins, train_peptides, on=['patient_id', 'visit_id', 'visit_month', 'UniProt'])
# A bare `.shape` mid-cell is discarded by Jupyter; print it so it is visible.
print(train_proteins_peptides.shape)
train_proteins_peptides.info()
# Cache the merged table in temp_data_dir -- the directory the notebook reads
# this file back from -- instead of a path relative to the current working
# directory, and create the directory first so the write cannot fail on a
# fresh checkout.
os.makedirs(temp_data_dir, exist_ok=True)
train_proteins_peptides.to_csv(os.path.join(temp_data_dir, 'train_proteins_peptides.csv'), index=False)

In [None]:
# Reload the cached protein/peptide merge from temp_data_dir.
train_proteins_peptides_df = pd.read_csv(os.path.join(temp_data_dir, 'train_proteins_peptides.csv'))
# A bare `.shape` mid-cell is discarded by Jupyter; print it so it is visible.
print(train_proteins_peptides_df.shape)

# Merge the clinical data with the protein/peptide data on patient, visit and
# visit month. pd.merge defaults to an inner join, so visits that appear in
# only one of the two tables are dropped.
train_clinical_proteins_peptides = pd.merge(train_clinical, train_proteins_peptides_df, on=['patient_id', 'visit_id', 'visit_month'])
print(train_clinical_proteins_peptides.shape)
train_clinical_proteins_peptides.info()
# Write next to the other intermediate files (temp_data_dir) rather than to a
# path relative to the current working directory; create the directory if needed.
os.makedirs(temp_data_dir, exist_ok=True)
train_clinical_proteins_peptides.to_csv(os.path.join(temp_data_dir, 'train_clinical_proteins_peptides.csv'), index=False)
# Keep the preview as the cell's last expression so it is actually rendered
# (previously `.head(75)` sat mid-cell and its result was discarded).
train_clinical_proteins_peptides.head()
