In [1]:
import pandas as pd
import plotly.express as px

# Load Data

In [2]:
# Load the peptide-level training measurements and print a structural summary.
train_peptides_df = pd.read_csv('./data/train_peptides.csv')
print(train_peptides_df.shape)

# Number of rows recorded for each (patient, visit) pair.
print(train_peptides_df.groupby(by=['patient_id', 'visit_id']).size())

# Distinct patients and distinct visits present in the table.
print("num patients", train_peptides_df['patient_id'].nunique())
print("num visits total", train_peptides_df['visit_id'].nunique())

# Summary statistics of the visit month column.
print(train_peptides_df['visit_month'].describe())

# Preview the first ten rows as the cell output.
train_peptides_df.head(n=10)

(981834, 6)
patient_id  visit_id
55          55_0        931
            55_12       943
            55_36       936
            55_6        936
942         942_12      889
                       ... 
64674       64674_84    893
65043       65043_0     912
            65043_12    938
            65043_24    944
            65043_48    937
Length: 1113, dtype: int64
num patients 248
num visits total 1113
count    981834.000000
mean         26.105061
std          22.913897
min           0.000000
25%           6.000000
50%          24.000000
75%          48.000000
max         108.000000
Name: visit_month, dtype: float64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7
5,55_0,0,55,O00533,TLKIENVSYQDKGNYR,23216.5
6,55_0,0,55,O00533,VIAVNEVGR,170878.0
7,55_0,0,55,O00533,VMTPAVYAPYDVK,148771.0
8,55_0,0,55,O00533,VNGSPVDNHPFAGDVVFPR,55202.1
9,55_0,0,55,O00584,ELDLNSVLLK,27229.3


In [3]:
# Load the protein-level training measurements.
train_proteins_df = pd.read_csv('./data/train_proteins.csv')
print(train_proteins_df.shape)

# Calculate NPX frequency per visit_id: each row's NPX divided by the total
# NPX of all rows sharing its visit_id.
# The built-in 'sum' aggregation is broadcast back to the original rows by
# transform, so the division is a single vectorised operation instead of a
# Python lambda evaluated once per visit group.
train_proteins_df['npx_frequency'] = (
    train_proteins_df['NPX']
    / train_proteins_df.groupby('visit_id')['NPX'].transform('sum')
)

# Preview the first ten rows, including the new npx_frequency column.
train_proteins_df.head(10)

(232741, 5)


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,npx_frequency
0,55_0,0,55,O00391,11254.3,1.6e-05
1,55_0,0,55,O00533,732430.0,0.001052
2,55_0,0,55,O00584,39585.8,5.7e-05
3,55_0,0,55,O14498,41526.9,6e-05
4,55_0,0,55,O14773,31238.0,4.5e-05
5,55_0,0,55,O14791,4202.71,6e-06
6,55_0,0,55,O15240,177775.0,0.000255
7,55_0,0,55,O15394,62898.2,9e-05
8,55_0,0,55,O43505,333376.0,0.000479
9,55_0,0,55,O60888,166850.0,0.00024


In [4]:
# Load the clinical training table (one row per visit with the UPDRS scores
# and the medication state column shown in the preview below).
train_clinical_data_df = pd.read_csv('./data/train_clinical_data.csv')

# Report (rows, columns) before previewing.
print(train_clinical_data_df.shape)

# Preview the first ten rows as the cell output.
train_clinical_data_df.head(n=10)

(2615, 8)


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On
5,55_18,55,18,7.0,13.0,38.0,0.0,On
6,55_24,55,24,16.0,9.0,49.0,0.0,On
7,55_30,55,30,14.0,13.0,49.0,0.0,On
8,55_36,55,36,17.0,18.0,51.0,0.0,On
9,55_42,55,42,12.0,20.0,41.0,0.0,On


In [5]:
# Count how often each recorded medication state occurs
# (value_counts leaves missing values out by default).
train_clinical_data_df['upd23b_clinical_state_on_medication'].value_counts()

upd23b_clinical_state_on_medication
On     775
Off    513
Name: count, dtype: int64

In [6]:
# Load the example test file that lists the rows to be predicted.
test_df = pd.read_csv('example_test_files/test.csv')

# Report (rows, columns) before previewing.
print(test_df.shape)

# Preview the first ten rows as the cell output.
test_df.head(n=10)

(16, 6)


Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id,group_key
0,3342_0,0,3342,updrs_1,3342_0_updrs_1,0
1,3342_0,0,3342,updrs_2,3342_0_updrs_2,0
2,3342_0,0,3342,updrs_3,3342_0_updrs_3,0
3,3342_0,0,3342,updrs_4,3342_0_updrs_4,0
4,50423_0,0,50423,updrs_1,50423_0_updrs_1,0
5,50423_0,0,50423,updrs_2,50423_0_updrs_2,0
6,50423_0,0,50423,updrs_3,50423_0_updrs_3,0
7,50423_0,0,50423,updrs_4,50423_0_updrs_4,0
8,3342_6,6,3342,updrs_1,3342_6_updrs_1,6
9,3342_6,6,3342,updrs_2,3342_6_updrs_2,6


In [7]:
# Load the example peptide measurements that accompany the test file.
test_peptides_df = pd.read_csv('example_test_files/test_peptides.csv')
print(test_peptides_df.shape)

# Which visits have peptide measurements in the example data.
print(test_peptides_df['visit_id'].unique())

# Preview the first rows (default of five) as the cell output.
test_peptides_df.head()

(2057, 7)
['50423_0' '3342_6']


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,group_key
0,50423_0,0,50423,O00391,AHFSPSNIILDFPAAGSAAR,22226.3,0
1,50423_0,0,50423,O00391,NEQEQPLGQWHLS,10901.6,0
2,50423_0,0,50423,O00533,GNPEPTFSWTK,51499.4,0
3,50423_0,0,50423,O00533,IEIPSSVQQVPTIIK,125492.0,0
4,50423_0,0,50423,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,23174.2,0


In [8]:
# Load the example protein measurements that accompany the test file.
test_proteins_df = pd.read_csv('example_test_files/test_proteins.csv')
print(test_proteins_df.shape)

# Which visits have protein measurements in the example data.
# This inspects the proteins frame loaded above; the earlier version printed
# the visit ids of test_peptides_df by mistake.
print(test_proteins_df.visit_id.unique())

# Preview the first rows (default of five) as the cell output.
test_proteins_df.head()

(453, 6)
['50423_0' '3342_6']


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,group_key
0,50423_0,0,50423,O00391,33127.9,0
1,50423_0,0,50423,O00533,490742.0,0
2,50423_0,0,50423,O00584,43615.3,0
3,50423_0,0,50423,O14773,16486.6,0
4,50423_0,0,50423,O14791,2882.42,0


In [9]:
# Load the example submission file to inspect the expected output format.
sample_submission_df = pd.read_csv('example_test_files/sample_submission.csv')

# Report (rows, columns) before previewing.
print(sample_submission_df.shape)

# Preview the first ten rows as the cell output.
sample_submission_df.head(n=10)


(64, 3)


Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
5,3342_0_updrs_2_plus_6_months,0,0
6,3342_0_updrs_2_plus_12_months,0,0
7,3342_0_updrs_2_plus_24_months,0,0
8,3342_0_updrs_3_plus_0_months,0,0
9,3342_0_updrs_3_plus_6_months,0,0


# Explore Labels

In [10]:
# Read the clinical training table into its own frame for label exploration.
clinical_df = pd.read_csv('./data/train_clinical_data.csv')

# Preview the first rows (default of five) as the cell output.
clinical_df.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [11]:
# Distribution of visit months across all clinical records.
px.histogram(clinical_df['visit_month'])

In [55]:
# Label visits without a recorded medication state as 'Unknown' so they show
# up as their own category.
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on a column accessor is chained assignment, which
# pandas warns about and which does not update clinical_df under Copy-on-Write.
clinical_df['upd23b_clinical_state_on_medication'] = (
    clinical_df['upd23b_clinical_state_on_medication'].fillna('Unknown')
)

# Distribution of medication states, including the 'Unknown' category.
px.histogram(clinical_df['upd23b_clinical_state_on_medication'])

In [59]:
# Mean and standard deviation of every UPDRS score per visit month, split by
# the medication state recorded for the visit. groupby drops rows whose key is
# NaN by default, so visits with a missing medication state are only included
# if that column has been filled beforehand.
clincal_df_grouped = clinical_df.groupby(['visit_month', 'upd23b_clinical_state_on_medication']).agg({
    'updrs_1': ['mean', 'std'],
    'updrs_2': ['mean', 'std'],
    'updrs_3': ['mean', 'std'],
    'updrs_4': ['mean', 'std'],
}).reset_index()

# Flatten the two-level (score, statistic) column index into single names.
clincal_df_grouped.columns = ['visit_month', 'upd23b_clinical_state_on_medication', 'updrs_1_mean', 'updrs_1_std', 'updrs_2_mean', 'updrs_2_std', 'updrs_3_mean', 'updrs_3_std', 'updrs_4_mean', 'updrs_4_std']

# Reformat df for plotting: one long table for the means and one for the
# standard deviations, each keyed by (visit_month, medication state, updrs).
clincal_df_grouped_mean = clincal_df_grouped.melt(id_vars=['visit_month', 'upd23b_clinical_state_on_medication'], value_vars=['updrs_1_mean', 'updrs_2_mean', 'updrs_3_mean', 'updrs_4_mean'], var_name='updrs', value_name='mean')
# Strip the statistic suffix so both long tables share the same 'updrs' labels.
clincal_df_grouped_mean['updrs'] = clincal_df_grouped_mean['updrs'].str.replace('_mean', '', regex=False)
clincal_df_grouped_std = clincal_df_grouped.melt(id_vars=['visit_month', 'upd23b_clinical_state_on_medication'], value_vars=['updrs_1_std', 'updrs_2_std', 'updrs_3_std', 'updrs_4_std'], var_name='updrs', value_name='std')
clincal_df_grouped_std['updrs'] = clincal_df_grouped_std['updrs'].str.replace('_std', '', regex=False)

# Join mean and std back together so each row carries both statistics.
clinical_df_grouped_final = pd.merge(clincal_df_grouped_mean, clincal_df_grouped_std, on=['visit_month', 'updrs', 'upd23b_clinical_state_on_medication'])

# Plot the merged long-format frame. The earlier version passed
# `clinical_df_grouped`, a name that is never defined in this notebook and
# therefore raises NameError on a fresh kernel.
fig = px.line(clinical_df_grouped_final, x='visit_month', y='mean', color='updrs', error_y='std', facet_col='upd23b_clinical_state_on_medication')
fig.update_layout(width=1500)
fig.show()

# Protein Distribution

In [62]:
# Read the protein training table into its own frame for distribution analysis.
proteins_df = pd.read_csv('./data/train_proteins.csv')

# Report the frame's (rows, columns) and the number of distinct UniProt ids.
print(proteins_df.shape, proteins_df['UniProt'].nunique())

# Preview the first rows (default of five) as the cell output.
proteins_df.head()

(232741, 5) 227


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [73]:
# Mean and standard deviation of NPX for every (visit month, protein) pair.
# Named aggregation yields the flat NPX_mean / NPX_std columns directly.
proteins_df_grouped = (
    proteins_df
    .groupby(['visit_month', 'UniProt'])
    .agg(NPX_mean=('NPX', 'mean'), NPX_std=('NPX', 'std'))
    .reset_index()
)

# Make a barplot with error bars: one coloured bar segment per protein at each
# visit month, with the standard deviation as the error bar.
fig = px.bar(
    proteins_df_grouped,
    x='visit_month',
    y='NPX_mean',
    color='UniProt',
    error_y='NPX_std',
)
fig.show()