# Version 2.2 (Applied SMOT) With full dataset

# Please read this to understand the considerations of the Experiment

#### 1) In this experement the null values above 70% are dropped(for whole dataset). 
#### 2) For numerical features median is used to fill/handle the null values.
#### 3) 'REC_ID', 'HRPOSTMED', 'PDMEDTM', 'EVENT_ID', 'Unnamed:0', 'PDMEDDT', 'EXAMDT', 'PDSTATE', 'EXAMTM', 'INFODT' ,  'PDTRTMNT', 'ORIG_ENTRY',  'LAST_UPDATE'  These columns are dropped from the dataset

#### 4) pd.get_dummies(df_new) is used for encoding

#### 5) Finally for model traning various machine learning techniques are used to get the best model for this usecase

#### 6) It is observed that Gaussian Naive Bayes (GaussianNB) has highest accuracy of 88% out of all the models used

## For your Experiment
1) please use different ways of filling null values

2) Try handling the "PDSTATE" column and add that also with the dataset for traning (Siwani did by dropping the null values from entire dataset, which resulted in less dimention of the dataset / loss of many rows and information) However her model achived 78% accuracy

3) Try using mode to fill the null values ( as i have used median )

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
df1 = pd.read_csv("MDS-UPDRS_Part_III-Group-A.csv")
df2 = pd.read_csv("MDS-UPDRS_Part_III-Group-B.csv")
df = pd.concat([df1, df2], axis=0)

In [3]:
df_normalised = df.replace(101.,0.)

In [4]:
threshold = 70  # Set the threshold for missing values percentage

# Calculate the percentage of missing values for each column
null_values = df_normalised.isna().mean() * 100

# Identify columns with missing values greater than the threshold
columns_to_drop = null_values[null_values > threshold].index

# Drop the identified columns
df_normalised = df_normalised.drop(columns=columns_to_drop)

# Display the DataFrame after dropping columns
df_normalised

Unnamed: 0.1,Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,PDMEDYN,...,NP3RTALU,NP3RTARL,NP3RTALL,NP3RTALJ,NP3RTCON,NP3TOT,DYSKPRES,NHY,ORIG_ENTRY,LAST_UPDATE
0,17595,671077401,57869,V04,NUPDRS3,12/2017,1.0,OFF,14.5000,1.0,...,1.0,2.0,3.0,0.0,0.0,54.0,0.0,2.0,01/2018,2018-01-11 15:46:02.0
1,21551,IANT214603,111429,BL,NUPDRDOSE3,11/2022,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12/2022,2022-12-20 00:00:00.0
2,23302,IANT310558,182340,BL,NUPDRDOSE3,02/2023,0.0,,,0.0,...,1.0,0.0,0.0,0.0,1.0,16.0,0.0,1.0,08/2023,2023-08-09 00:00:00.0
3,20789,IAON164829,101018,V02,NUPDRDOSE3,11/2021,1.0,ON,3.0833,1.0,...,0.0,0.0,0.0,0.0,0.0,15.0,0.0,2.0,11/2021,2021-11-16 00:00:00.0
4,16024,IANT163527,52587,V12,NUPDRDOSE3,04/2021,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,05/2021,2021-05-04 00:00:00.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,17893,IAOF164017,58510,V08,NUPDRDOSE3,03/2021,1.0,OFF,,1.0,...,,,,,,,,,03/2021,2021-03-23 00:00:00.0
11961,720,532644601,3069,V10,NUPDRS3,08/2015,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,08/2015,2020-06-24 12:34:50.0
11962,22646,IANT165575,153027,BL,NUPDRDOSE3,08/2022,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,18.0,0.0,1.0,09/2022,2022-09-01 00:00:00.0
11963,18553,478056701,60043,V02,NUPDRS3,11/2014,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11/2014,2020-01-24 15:33:29.0


In [5]:
# Identify numerical columns with missing values
numerical_columns_with_missing = df_normalised.select_dtypes(include='number').columns[df_normalised.select_dtypes(include='number').isnull().any()]

# Replace missing values with median for each numerical column
for column in numerical_columns_with_missing:
    median_value = df_normalised[column].median()  #Try using mode to see the change 
    df_normalised[column].fillna(median_value, inplace=True)

    
# Filling the null values with median

In [6]:
df_new = df_normalised.drop(['REC_ID','HRPOSTMED','PDMEDTM','EVENT_ID','Unnamed: 0','PDMEDDT','EXAMDT','PDSTATE','EXAMTM','INFODT', 'PDTRTMNT','ORIG_ENTRY', 'LAST_UPDATE'],axis = 1)

In [7]:
df_new = pd.get_dummies(df_new)

In [8]:
df_new.columns

Index(['PATNO', 'PDMEDYN', 'DBSYN', 'NP3SPCH', 'NP3FACXP', 'NP3RIGN',
       'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL',
       'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
       'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL',
       'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML',
       'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
       'NP3TOT', 'DYSKPRES', 'NHY', 'PAG_NAME_NUPDR3OF', 'PAG_NAME_NUPDR3ON',
       'PAG_NAME_NUPDRDOSE3', 'PAG_NAME_NUPDRS3', 'PAG_NAME_NUPDRS3A'],
      dtype='object')

In [9]:
df_new['NHY'].value_counts()

2.0    11211
0.0     8124
1.0     3394
3.0      967
4.0      173
5.0       60
Name: NHY, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_new, test_size=0.3, random_state=0)

In [11]:
x_train=train.drop('NHY',axis=1)
y_train=train['NHY']

x_test=test.drop('NHY',axis=1)
y_test=test['NHY']

In [12]:
#!pip install imbalanced-learn

In [13]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
x_train, y_train = resample.fit_resample(x_train, y_train)


In [14]:
y_train.value_counts() # Balanced

2.0    7790
0.0    5698
1.0    2396
3.0     702
4.0     124
5.0      40
Name: NHY, dtype: int64

# Hyper parameter tuning

In [15]:
x_train.shape

(16750, 43)

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score,recall_score, f1_score, accuracy_score
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3 = LogisticRegression()
model4 = GradientBoostingClassifier()
gnb = GaussianNB().fit(x_train, y_train) 

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)
model4.fit(x_train,y_train)

gnb_predictions = gnb.predict(x_test) 
pred1=model1.predict_proba(x_test)
pred2=model2.predict_proba(x_test)
pred3=model3.predict_proba(x_test)
pred4=model4.predict_proba(x_test)

accuracy = gnb.score(x_test, y_test) 
sc1 = model1.score(x_test,y_test)
sc2 = model2.score(x_test,y_test)
sc3 = model3.score(x_test,y_test)
sc4 = model4.score(x_test,y_test)

print(sc1)
print(sc2)
print(sc3)
print(sc4) #GradientBoostingClassifier
print(accuracy) 


pred1=model1.predict(x_train)
testaccuracy1= accuracy_score(y_train, pred1)
print("Accuracy of DT train  : {} %".format(round((testaccuracy1*100),2)))

pred2=model2.predict(x_train)
testaccuracy2= accuracy_score(y_train, pred2)
print("Accuracy of KNN train  : {} %".format(round((testaccuracy2*100),2)))

pred3=model3.predict(x_train)
testaccuracy3= accuracy_score(y_train, pred3)
print("Accuracy of LR train  : {} %".format(round((testaccuracy3*100),2)))

pred4=model4.predict(x_train)
testaccuracy4= accuracy_score(y_train, pred4)
print("Accuracy of GBC train  : {} %".format(round((testaccuracy4*100),2)))


finalpred = (pred1+pred2+pred3)/3

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8834099456748851
0.8084691461206296
0.6817105446441009
0.8874495055021591
0.6789246413149463
Accuracy of DT train  : 99.97 %
Accuracy of KNN train  : 86.99 %
Accuracy of LR train  : 67.51 %
Accuracy of GBC train  : 90.0 %


In [17]:
import xgboost as xgb
XGB_model = xgb.XGBClassifier(learning_rate = 0.1, max_depth = 5, n_estimators = 10)
XGB_model.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [18]:
result_train = XGB_model.score(x_test,y_test)
print("Accuracy : {}".format(result_train))

Accuracy : 0.856247388215629
