In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics import mean_absolute_percentage_error



In [11]:
#Read the data
train_clinical_all = pd.read_csv('kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
proteins = pd.read_csv('kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
proteins_features = pd.pivot_table(proteins, values='NPX', index='visit_id', columns='UniProt', aggfunc='sum')

train_clinical_all = train_clinical_all.merge(
    proteins_features,
    left_on='visit_id',
    right_index=True,
    how='left'
)

In [13]:
#Drop upd23b_clinical_state_on_medication
train_clinical_all = train_clinical_all.drop(['upd23b_clinical_state_on_medication'], axis=1)

In [14]:
#Fill missing values with KNN
from sklearn.impute import KNNImputer

# Make a copy of the dataframe
df_imputed = train_clinical_all.copy()

# Select only numeric columns
numeric_cols = df_imputed.select_dtypes(include=[np.number]).columns

# Create imputer
imputer = KNNImputer(n_neighbors=8, weights='uniform', metric='nan_euclidean')

# Fit on the dataset
imputer.fit(df_imputed[numeric_cols])

# Transform the dataset
df_imputed[numeric_cols] = imputer.transform(df_imputed[numeric_cols])

# Round to 1 decimal place
df_imputed = df_imputed.round(1)

df_imputed.head()




Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55_0,55.0,0.0,10.0,6.0,15.0,0.0,11254.3,732430.0,39585.8,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,69818.9,29758.8,23833.7,18953.5
1,55_3,55.0,3.0,10.0,7.0,25.0,0.0,11807.4,537091.0,30417.0,...,257034.8,34635.0,104548.4,19224.2,40922.0,313042.6,73995.5,24638.9,20291.0,16897.1
2,55_6,55.0,6.0,8.0,10.0,34.0,0.0,13163.6,630465.0,35220.8,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,76552.2,22935.2,17722.5,16642.7
3,55_9,55.0,9.0,8.0,9.0,30.0,0.0,11807.4,537091.0,30417.0,...,257034.8,34635.0,104548.4,19224.2,40922.0,313042.6,73995.5,24638.9,20291.0,16897.1
4,55_12,55.0,12.0,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,303953.0,43026.2,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9


In [15]:
target = train_clinical_all.columns.to_list()
X = df_imputed.drop(target, axis = 1)
y = df_imputed[target]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1, random_state=2)

In [20]:
#Implement Kfold cross validation
from sklearn.model_selection import KFold

#Define the Kfold Cross Validator
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

#Define the model evaluation metrics
def print_evaluation_scores(y_true, y_pred):
    print('Mean Squared Error: {}'.format(mean_squared_error(y_true, y_pred)))
    print('Mean Absolute Error: {}'.format(mean_absolute_error(y_true, y_pred)))
    print('R^2 Score: {}'.format(r2_score(y_true, y_pred)))
    print('Explained Variance Score: {}'.format(explained_variance_score(y_true, y_pred)))
   # print('Max Error: {}'.format(max_error(y_true, y_pred)))
    print('Median Absolute Error: {}'.format(median_absolute_error(y_true, y_pred)))
    #print('Mean Squared Log Error: {}'.format(mean_squared_log_error(y_true, y_pred)))
    print('Mean Poisson Deviance: {}'.format(mean_poisson_deviance(y_true, y_pred)))
    print('Mean Gamma Deviance: {}'.format(mean_gamma_deviance(y_true, y_pred)))
    #print('Mean Tweedie Deviance: {}'.format(mean_tweedie_deviance(y_true, y_pred)))
    print('Mean Absolute Percentage Error: {}'.format(mean_absolute_percentage_error(y_true, y_pred)))


#Define the model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

#Define the models
models = []
models.append(('LR', LinearRegression()))
models.append(('RF', RandomForestRegressor()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('Ridge', Ridge()))
models.append(('Lasso', Lasso()))
models.append(('ElasticNet', ElasticNet()))
models.append(('SGD', SGDRegressor()))
models.append(('SVR', SVR()))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('MLP', MLPRegressor()))

#Evaluate each model in turn
results = []
names = []
for name, model in models:
    print(name)
    for train_index, test_index in kfold.split(df_imputed):
        X_train, X_test = df_imputed.iloc[train_index], df_imputed.iloc[test_index]
        y_train, y_test = df_imputed.iloc[train_index], df_imputed.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print_evaluation_scores(y_test, y_pred)
        print('----------------------------------')
    print('==================================')
    print('\n')

    

LR
Mean Squared Error: 6.531731962415841e-16
Mean Absolute Error: 8.31169933802614e-09
R^2 Score: 1.0
Explained Variance Score: 1.0
Median Absolute Error: 6.019337460570916e-09


ValueError: Multioutput not supported in mean_tweedie_deviance