In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder

In [6]:
proteins_data = pd.read_csv('/content/drive/MyDrive/Parkinson/train_proteins.csv')
proteins_data.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [7]:
peptides_data = pd.read_csv('/content/drive/MyDrive/Parkinson/train_peptides.csv')
peptides_data.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [8]:
clinical_data = pd.read_csv('/content/drive/MyDrive/Parkinson/train_clinical_data(1).csv')
clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [9]:

clinical_data.shape

(2615, 8)

In [10]:
clinical_data.isnull().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                   1
updrs_2                                   2
updrs_3                                  25
updrs_4                                1038
upd23b_clinical_state_on_medication    1327
dtype: int64

In [11]:
supplemental_data = pd.read_csv('/content/drive/MyDrive/Parkinson/supplemental_clinical_data(1).csv')
supplemental_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,On
4,155_0,155,0,,,0.0,,


In [12]:
supplemental_data.shape

(2223, 8)

In [13]:
supplemental_data.isnull().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                 213
updrs_2                                 214
updrs_3                                   5
updrs_4                                 928
upd23b_clinical_state_on_medication    1101
dtype: int64

In [14]:
merge_data = proteins_data.merge(peptides_data[['visit_id', 'UniProt', 'Peptide','PeptideAbundance']], on = ['visit_id','UniProt'], how = 'left')
merge_data.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7


In [15]:
merge_data.isnull().sum()

visit_id            0
visit_month         0
patient_id          0
UniProt             0
NPX                 0
Peptide             0
PeptideAbundance    0
dtype: int64

In [16]:
clinical_data = clinical_data.melt(id_vars=['visit_id', 'patient_id', 'visit_month', 'upd23b_clinical_state_on_medication'],
                 var_name='updrs', value_name='rating')
clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,upd23b_clinical_state_on_medication,updrs,rating
0,55_0,55,0,,updrs_1,10.0
1,55_3,55,3,,updrs_1,10.0
2,55_6,55,6,,updrs_1,8.0
3,55_9,55,9,On,updrs_1,8.0
4,55_12,55,12,On,updrs_1,10.0


In [17]:
clinical_data.isnull().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
upd23b_clinical_state_on_medication    5308
updrs                                     0
rating                                 1066
dtype: int64

In [18]:
clinical_data.shape,clinical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10460 entries, 0 to 10459
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             10460 non-null  object 
 1   patient_id                           10460 non-null  int64  
 2   visit_month                          10460 non-null  int64  
 3   upd23b_clinical_state_on_medication  5152 non-null   object 
 4   updrs                                10460 non-null  object 
 5   rating                               9394 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 490.4+ KB


((10460, 6), None)

In [19]:

imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(clinical_data)
clinical_data = pd.DataFrame(data_imputed, columns=clinical_data.columns)


In [20]:
supplemental_data = supplemental_data.melt(id_vars=['visit_id', 'patient_id', 'visit_month', 'upd23b_clinical_state_on_medication'],
                 var_name='updrs', value_name='rating')
supplemental_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,upd23b_clinical_state_on_medication,updrs,rating
0,35_0,35,0,,updrs_1,5.0
1,35_36,35,36,,updrs_1,6.0
2,75_0,75,0,,updrs_1,4.0
3,75_36,75,36,On,updrs_1,1.0
4,155_0,155,0,,updrs_1,


In [21]:
supplemental_data.isnull().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
upd23b_clinical_state_on_medication    4404
updrs                                     0
rating                                 1360
dtype: int64

In [22]:
supplemental_data.shape,clinical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10460 entries, 0 to 10459
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   visit_id                             10460 non-null  object
 1   patient_id                           10460 non-null  object
 2   visit_month                          10460 non-null  object
 3   upd23b_clinical_state_on_medication  10460 non-null  object
 4   updrs                                10460 non-null  object
 5   rating                               10460 non-null  object
dtypes: object(6)
memory usage: 490.4+ KB


((8892, 6), None)

In [23]:

imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(supplemental_data)
supplemental_data = pd.DataFrame(data_imputed, columns=supplemental_data.columns)


In [24]:
supplemental_data.isnull().sum()

visit_id                               0
patient_id                             0
visit_month                            0
upd23b_clinical_state_on_medication    0
updrs                                  0
rating                                 0
dtype: int64

In [25]:
supplemental_data.columns,clinical_data.columns

(Index(['visit_id', 'patient_id', 'visit_month',
        'upd23b_clinical_state_on_medication', 'updrs', 'rating'],
       dtype='object'),
 Index(['visit_id', 'patient_id', 'visit_month',
        'upd23b_clinical_state_on_medication', 'updrs', 'rating'],
       dtype='object'))

In [26]:
merged_df2 = pd.concat([clinical_data, supplemental_data], ignore_index=True)

In [27]:
merged_df2.isnull().sum(),merged_df2.shape

(visit_id                               0
 patient_id                             0
 visit_month                            0
 upd23b_clinical_state_on_medication    0
 updrs                                  0
 rating                                 0
 dtype: int64,
 (19352, 6))

In [28]:
data = merged_df2.merge(merge_data[['visit_id', 'visit_month', 'patient_id', 'NPX', 'UniProt', 'Peptide','PeptideAbundance']], on = ['visit_id','visit_month', 'patient_id'], how = 'left').rename(columns={'upd23b_clinical_state_on_medication': 'on_medication'})
data.head()

Unnamed: 0,visit_id,patient_id,visit_month,on_medication,updrs,rating,NPX,UniProt,Peptide,PeptideAbundance
0,55_0,55,0,On,updrs_1,10.0,11254.3,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,GNPEPTFSWTK,102060.0
2,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,SMEQNGPGLEYR,30838.7


In [29]:
data.isnull().sum()

visit_id                0
patient_id              0
visit_month             0
on_medication           0
updrs                   0
rating                  0
NPX                 15080
UniProt             15080
Peptide             15080
PeptideAbundance    15080
dtype: int64

In [30]:
data.shape

(3782056, 10)

In [31]:

imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(data)
data = pd.DataFrame(data_imputed, columns=data.columns)


In [32]:
data.isnull().sum()

visit_id            0
patient_id          0
visit_month         0
on_medication       0
updrs               0
rating              0
NPX                 0
UniProt             0
Peptide             0
PeptideAbundance    0
dtype: int64

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3782056 entries, 0 to 3782055
Data columns (total 10 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   visit_id          object
 1   patient_id        object
 2   visit_month       object
 3   on_medication     object
 4   updrs             object
 5   rating            object
 6   NPX               object
 7   UniProt           object
 8   Peptide           object
 9   PeptideAbundance  object
dtypes: object(10)
memory usage: 288.5+ MB


In [34]:
data.head()

Unnamed: 0,visit_id,patient_id,visit_month,on_medication,updrs,rating,NPX,UniProt,Peptide,PeptideAbundance
0,55_0,55,0,On,updrs_1,10.0,11254.3,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,GNPEPTFSWTK,102060.0
2,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,55,0,On,updrs_1,10.0,732430.0,O00533,SMEQNGPGLEYR,30838.7


In [35]:
data.shape

(3782056, 10)

In [36]:
y = data['rating']
X = data[['on_medication', 'updrs', 'NPX', 'PeptideAbundance']]

In [37]:
label_encoder = LabelEncoder()

In [38]:
!pip install scikit-learn-intelex



In [39]:
!pip install hummingbird-ml



In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
import joblib
from sklearnex import patch_sklearn,config_context
from hummingbird.ml import convert, load
patch_sklearn()


X = data[['on_medication', 'updrs', 'NPX', 'PeptideAbundance']]
y = data['rating']


threshold = 5.0


y = (data['rating'] > threshold).astype(int)


label_encoder = LabelEncoder()
X['on_medication'] = label_encoder.fit_transform(X['on_medication'])
X['updrs'] = label_encoder.fit_transform(X['updrs'])


scaler = StandardScaler()
X = scaler.fit_transform(X)


models = {
    'Gradient Boosting': GradientBoostingClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=3,
        subsample=0.8,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        loss='log_loss',
    ),
    'Neural Network': MLPClassifier(
        hidden_layer_sizes=(300,),
        activation='relu',
        solver='adam',
        alpha=0.0001,
    ),
    'Random Forest': RandomForestClassifier(),
    'Stochastic Gradient Descent': SGDClassifier(
        loss='hinge',
        alpha=0.00001,
        learning_rate='constant',
        eta0=0.01,
    ),

}


n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results = {}

for model_name, model in models.items():
    auc_scores = []
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []

    for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        print(f"Running model: {model_name}, Fold: {fold_idx + 1}")


        model.fit(X_train, y_train)
        model = convert(model, 'pytorch')

        model.to('cuda')
        y_pred = model.predict(X_test)


        auc = roc_auc_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        auc_scores.append(auc)
        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)

    avg_auc = np.mean(auc_scores)
    avg_accuracy = np.mean(accuracy_scores)
    avg_f1 = np.mean(f1_scores)
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)

    results[model_name] = {
        'Average AUC': avg_auc,
        'Average Accuracy': avg_accuracy,
        'Average F1 Score': avg_f1,
        'Average Precision': avg_precision,
        'Average Recall': avg_recall
    }

    print(results)


best_model_name = max(results, key=lambda k: results[k]['Average AUC'])
best_model_results = results[best_model_name]

print(f"The best model is {best_model_name} with the following average results over {n_splits} folds:")
print(f"Average AUC: {best_model_results['Average AUC']}")
print(f"Average Accuracy: {best_model_results['Average Accuracy']}")
print(f"Average F1 Score: {best_model_results['Average F1 Score']}")
print(f"Average Precision: {best_model_results['Average Precision']}")
print(f"Average Recall: {best_model_results['Average Recall']}")



best_model = models[best_model_name]
best_model.fit(X, y)
model_filename = "best_model.joblib"
joblib.dump(best_model, model_filename)
print(f"The best model '{best_model_name}' has been saved to {model_filename}")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['on_medication'] = label_encoder.fit_transform(X['on_medication'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['updrs'] = label_encoder.fit_transform(X['updrs'])


Running model: Gradient Boosting, Fold: 1
