In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score,recall_score, f1_score, accuracy_score,confusion_matrix,multilabel_confusion_matrix
import lime
from lime import lime_tabular


In [27]:
df1 = pd.read_csv("MDS-UPDRS_Part_III-Group-A.csv")
df2 = pd.read_csv("MDS-UPDRS_Part_III-Group-B.csv")
df = pd.concat([df1, df2], axis=0)

In [28]:
df['NHY'].value_counts()

2.0      10404
0.0       7861
1.0       3394
3.0        967
101.0      263
4.0        173
5.0         60
Name: NHY, dtype: int64

In [29]:
df_normalised = df.replace(101.,0.)

In [30]:
threshold = 70  # Set the threshold for missing values percentage

# Calculate the percentage of missing values for each column
null_values = df_normalised.isna().mean() * 100

# Identify columns with missing values greater than the threshold
columns_to_drop = null_values[null_values > threshold].index

# Drop the identified columns
df_normalised = df_normalised.drop(columns=columns_to_drop)

# Display the DataFrame after dropping columns
df_normalised

Unnamed: 0.1,Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,PDMEDYN,...,NP3RTALU,NP3RTARL,NP3RTALL,NP3RTALJ,NP3RTCON,NP3TOT,DYSKPRES,NHY,ORIG_ENTRY,LAST_UPDATE
0,17595,671077401,57869,V04,NUPDRS3,12/2017,1.0,OFF,14.5000,1.0,...,1.0,2.0,3.0,0.0,0.0,54.0,0.0,2.0,01/2018,2018-01-11 15:46:02.0
1,21551,IANT214603,111429,BL,NUPDRDOSE3,11/2022,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12/2022,2022-12-20 00:00:00.0
2,23302,IANT310558,182340,BL,NUPDRDOSE3,02/2023,0.0,,,0.0,...,1.0,0.0,0.0,0.0,1.0,16.0,0.0,1.0,08/2023,2023-08-09 00:00:00.0
3,20789,IAON164829,101018,V02,NUPDRDOSE3,11/2021,1.0,ON,3.0833,1.0,...,0.0,0.0,0.0,0.0,0.0,15.0,0.0,2.0,11/2021,2021-11-16 00:00:00.0
4,16024,IANT163527,52587,V12,NUPDRDOSE3,04/2021,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,05/2021,2021-05-04 00:00:00.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,17893,IAOF164017,58510,V08,NUPDRDOSE3,03/2021,1.0,OFF,,1.0,...,,,,,,,,,03/2021,2021-03-23 00:00:00.0
11961,720,532644601,3069,V10,NUPDRS3,08/2015,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,08/2015,2020-06-24 12:34:50.0
11962,22646,IANT165575,153027,BL,NUPDRDOSE3,08/2022,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,18.0,0.0,1.0,09/2022,2022-09-01 00:00:00.0
11963,18553,478056701,60043,V02,NUPDRS3,11/2014,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11/2014,2020-01-24 15:33:29.0


In [31]:
# Identify numerical columns with missing values
numerical_columns_with_missing = df_normalised.select_dtypes(include='number').columns[df_normalised.select_dtypes(include='number').isnull().any()]

# Replace missing values with median for each numerical column
for column in numerical_columns_with_missing:
    median_value = df_normalised[column].median()  #Try using mode to see the change 
    df_normalised[column].fillna(median_value, inplace=True)

    
# Filling the null values with median

In [32]:
df_new = df_normalised.drop(['REC_ID','HRPOSTMED','PDMEDTM','EVENT_ID','Unnamed: 0','PDMEDDT','EXAMDT','PDSTATE','EXAMTM','INFODT', 'PDTRTMNT','ORIG_ENTRY', 'LAST_UPDATE'],axis = 1)

In [33]:
df_new = pd.get_dummies(df_new)

In [34]:
df_new.columns

Index(['PATNO', 'PDMEDYN', 'DBSYN', 'NP3SPCH', 'NP3FACXP', 'NP3RIGN',
       'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL',
       'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
       'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL',
       'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML',
       'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
       'NP3TOT', 'DYSKPRES', 'NHY', 'PAG_NAME_NUPDR3OF', 'PAG_NAME_NUPDR3ON',
       'PAG_NAME_NUPDRDOSE3', 'PAG_NAME_NUPDRS3', 'PAG_NAME_NUPDRS3A'],
      dtype='object')

In [35]:
df_new.shape

(23929, 44)

In [36]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_new, test_size=0.3, random_state=0)

In [37]:
x_train=train.drop('NHY',axis=1)
y_train=train['NHY']

x_test=test.drop('NHY',axis=1)
y_test=test['NHY']

In [38]:
y_train.value_counts() # Highly Imbalanced

2.0    7790
0.0    5698
1.0    2396
3.0     702
4.0     124
5.0      40
Name: NHY, dtype: int64

In [39]:
from sklearn.datasets import make_classification

In [40]:
#pip install catboost

In [41]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.multioutput import ClassifierChain
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
#from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier
#from catboost import CatBoostClassifier

In [52]:
estimators = []
#estimators.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=13) ))
#estimators.append(('Bagging Classifier', BaggingClassifier(random_state=13) ))
#estimators.append(('Bernoulli NB', BernoulliNB() ))
#estimators.append(('Decision Tree Classifier', DecisionTreeClassifier(random_state=13) ))
#estimators.append(('Dummy Classifier', DummyClassifier(random_state=13) ))
#estimators.append(('Extra Tree Classifier', ExtraTreeClassifier(random_state=13) ))
#estimators.append(('Extra Trees Classifier', ExtraTreesClassifier(random_state=13) ))
#estimators.append(('Gaussian NB', GaussianNB() ))
#estimators.append(('Gaussian Process Classifier', GaussianProcessClassifier(random_state=13) ))
estimators.append(('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=13) ))
#estimators.append(('Hist Gradient Boosting Classifier', HistGradientBoostingClassifier(random_state=13) ))
#estimators.append(('KNN', KNeighborsClassifier() ))
#estimators.append(('Label Propagation', LabelPropagation() ))
#estimators.append(('Label Spreading', LabelSpreading() ))
#estimators.append(('LogisticRegression', LogisticRegression(max_iter=1000, random_state=13)))
#estimators.append(('Logistic Regression CV', LogisticRegressionCV(max_iter=1000, random_state=13) ))
#estimators.append(('MLPClassifier', MLPClassifier(max_iter=2000,random_state=13) ))
#estimators.append(('Nearest Centroid', NearestCentroid() ))
#estimators.append(('Passive Aggressive Classifier', PassiveAggressiveClassifier(random_state=13) ))
#estimators.append(('Perceptron', Perceptron(random_state=13) ))
#estimators.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=3) ))
#estimators.append(('RandomForest', RandomForestClassifier(max_depth= 10, min_samples_leaf= 1, min_samples_split= 3, n_estimators= 170, random_state=13) ))
#estimators.append(('Ridge Classifier', RidgeClassifier(random_state=13) ))
#estimators.append(('Ridge Classifier CV', RidgeClassifierCV() ))
#estimators.append(('SGDClassifier', SGDClassifier(random_state=13) ))
#estimators.append(('SVC', SVC(random_state=13)))
estimators.append(('XGB', XGBClassifier(random_state=13) ))
#estimators.append(('CatBoost', CatBoostClassifier(logging_level='Silent', random_state=13) ))

In [53]:
XGB = GradientBoostingClassifier(random_state=13)

In [54]:
from sklearn.ensemble import StackingClassifier
SC = StackingClassifier(estimators=estimators,final_estimator=XGB,cv=6)
SC.fit(x_train, y_train)
y_pred = SC.predict(x_test)

print(f"\nStacking classifier training Accuracy: {SC.score(x_train, y_train):0.2f}")
print(f"Stacking classifier test Accuracy: {SC.score(x_test, y_test):0.2f}")




























Stacking classifier training Accuracy: 0.98
Stacking classifier test Accuracy: 0.92
