In [2]:
import pandas as pd

# Load the raw protein measurements and the clinical records.
prot_df = pd.read_csv('data/train_proteins.csv')
clin_df = (
    pd.read_csv('data/train_clinical_data.csv')
    .rename(columns={'upd23b_clinical_state_on_medication': 'on_meds'})
)

# Encode the medication state as an integer flag: 'On' -> 1, 'Off' -> 0.
# Missing entries (and any label other than 'On'/'Off') become 0.
# Mapping first and casting explicitly avoids building a mixed int/str object
# column and relying on `replace` to downcast it afterwards.
clin_df['on_meds'] = (
    clin_df['on_meds']
    .map({'Off': 0, 'On': 1})
    .fillna(0)
    .astype(int)
)


In [3]:
import numpy as np  # not used in this cell; later cells reference `np`


def _zscore(column):
    """Standardise one column using the mean/std of its non-NaN values."""
    observed = column.dropna()
    return (column - observed.mean()) / observed.std(ddof=1)


# Reshape to one row per visit_id and one column per UniProt id (values: NPX),
# then standardise each protein column on its own.
pivoted = prot_df.pivot(index="visit_id", columns="UniProt", values="NPX").apply(_zscore)

# Entries that are still NaN are set to 0, which is the column mean after
# standardisation.
norm_prot_df = pivoted.fillna(0)


In [4]:
# Standardise the four UPDRS scores (z-score per column) and round the result
# to 3 decimal places.
updrs_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
updrs_scores = clin_df[updrs_columns]

# Column-wise statistics; pandas skips NaN entries by default.
updrs_mean = updrs_scores.mean()
updrs_std = updrs_scores.std()

# NOTE(review): norm_clin_df is not referenced by any later cell visible here;
# the models below are fit on the raw scores in clin_df - confirm that is intended.
norm_clin_df = ((updrs_scores - updrs_mean) / updrs_std).round(3)

In [5]:
# Join the clinical records with the normalised protein table on visit_id
# (pandas' default inner join: only visits present in both are kept).
merged_df = clin_df.merge(norm_prot_df, on='visit_id')

# Copy of the merged table without any row that contains a NaN.
merged_df_no_nan = merged_df.dropna()

# Numeric columns only, taken from the NaN-free copy.
merged_df_no_nan_numeric = merged_df_no_nan.select_dtypes(include=np.number)


In [6]:

# Fill the remaining NaN values patient by patient: carry the previous row's
# value forward, then the next row's value backward, and finally fall back to 0
# for columns a patient never has a value for.
# Grouped ffill/bfill replaces the per-patient Python loop and the deprecated
# `fillna(method=...)` calls; rows keep their original order and index.
fill_columns = [col for col in merged_df.columns if col != 'patient_id']

forward_filled = merged_df.groupby('patient_id', sort=False)[fill_columns].ffill()
fully_filled = forward_filled.groupby(merged_df['patient_id'], sort=False).bfill()

merged_df[fill_columns] = fully_filled.fillna(0)

In [7]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Features are the protein columns, i.e. the columns merged_df received from
# norm_prot_df. Selecting them by name replaces the positional slice
# `columns[8:]`, which only works while exactly eight other columns precede them.
feature_columns = merged_df.columns[merged_df.columns.isin(norm_prot_df.columns)]
target_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

X = merged_df[feature_columns]
y = merged_df[target_columns]

# Split the data into training and test sets.
# NOTE(review): this is a row-level random split, so rows belonging to the same
# patient_id can end up on both sides - consider a grouped split if the score is
# meant to reflect unseen patients.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters: settings shared by all targets plus per-target overrides.
shared_params = {'seed': 33, 'objective': 'reg:squarederror', 'learning_rate': 0.075}
best_params = {
    'updrs_1': {**shared_params, 'reg_alpha': 5, 'n_estimators': 500, 'max_depth': 6},
    'updrs_2': {**shared_params, 'reg_alpha': 10, 'n_estimators': 1000, 'max_depth': 4},
    'updrs_3': {**shared_params, 'reg_alpha': 10, 'n_estimators': 1000, 'max_depth': 4},
    'updrs_4': {**shared_params, 'reg_alpha': 10, 'n_estimators': 750, 'max_depth': 3},
}

# Train one regressor per target variable.
models = {}
for target in target_columns:
    model = xgb.XGBRegressor(**best_params[target])
    model.fit(X_train, y_train[target])
    models[target] = model

# Report the held-out mean squared error of each model.
for target in target_columns:
    y_pred = models[target].predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)
    print(f"MSE for {target}: {mse}")


MSE for updrs_1: 23.159666523779503
MSE for updrs_2: 30.929440594418807
MSE for updrs_3: 199.72471481095803
MSE for updrs_4: 5.923767159532673


In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# NOTE(review): this cell repeats the training/evaluation cell above it
# statement for statement; consider keeping only one of the two.

# Features are the columns merged_df received from norm_prot_df (the proteins).
# Picking them by name avoids the positional slice `columns[8:]`, which depends
# on exactly eight other columns coming first.
is_protein_column = merged_df.columns.isin(norm_prot_df.columns)
feature_columns = merged_df.columns[is_protein_column]
target_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

X = merged_df[feature_columns]
y = merged_df[target_columns]

# Hold out 20% of the rows for evaluation (fixed random_state for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters used for each target variable.
best_params = {
    'updrs_1': {'seed': 33, 'reg_alpha': 5, 'objective': 'reg:squarederror', 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.075},
    'updrs_2': {'seed': 33, 'reg_alpha': 10, 'objective': 'reg:squarederror', 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.075},
    'updrs_3': {'seed': 33, 'reg_alpha': 10, 'objective': 'reg:squarederror', 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.075},
    'updrs_4': {'seed': 33, 'reg_alpha': 10, 'objective': 'reg:squarederror', 'n_estimators': 750, 'max_depth': 3, 'learning_rate': 0.075},
}

# Fit one regressor per target and keep it under the target's name.
models = {}
for target in target_columns:
    model = xgb.XGBRegressor(**best_params[target])
    model.fit(X_train, y_train[target])
    models[target] = model

# Print the test-set mean squared error for every target.
for target in target_columns:
    y_pred = models[target].predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)
    print(f"MSE for {target}: {mse}")


MSE for updrs_1: 23.159666523779503
MSE for updrs_2: 30.929440594418807
MSE for updrs_3: 199.72471481095803
MSE for updrs_4: 5.923767159532673


In [None]:
import amp_pd_peptide
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# Each iteration yields a 4-tuple; the last element is the frame to fill in and
# hand back. The original body referenced `sample_prediction_df` and
# `sample_prediction`, which are never defined, so it raised NameError.
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    # Placeholder values only - the trained models are not applied here yet.
    sample_submission['rating'] = np.arange(len(sample_submission))  # make your predictions here
    env.predict(sample_submission)   # register your predictions