# Parkinson's Disease Prediction

### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
import os



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\parmo\AppData\Local\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\parmo\AppData\Local\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\Users\parmo\AppData\Local\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  Fil

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

### Data Collection

In [2]:
# Show the kernel's current working directory; relative paths such as the
# data file name used below are resolved against it.
display(os.getcwd())

NameError: name 'os' is not defined

In [None]:
# Load the dataset (a CSV-formatted file named 'parkinsons.data') into a DataFrame.
df=pd.read_csv('parkinsons.data')

### Data Preprocessing

In [None]:
# Rich-render the DataFrame held in `df`.
display (df)

In [None]:
# Build and render a ydata-profiling report for the whole DataFrame.
import ydata_profiling as pf
display(pf.ProfileReport(df))

In [None]:
# Display the shape as a (rows, columns) tuple
display (df.shape)

In [None]:
# Number of rows (first entry of the shape tuple)
print(df.shape[0])

In [None]:
# Display the data type of every column
display(df.dtypes)

### Display Details 

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Summary statistics of the DataFrame's columns
display (df.describe())

In [None]:
# Count missing values per column (isnull is the pandas alias of isna)
display(df.isnull().sum())

In [None]:
# Print the column labels of the DataFrame
print (df.columns)

In [None]:
# The dependent variable is `status`, the health status of the subject:
#   1 -> Parkinson's, 0 -> healthy
print(df.status)

## Exploratory Data Analysis

### Histogram with Status column 

In [None]:
# Class balance of the target: the dataset has a high number of patients
# affected by Parkinson's disease (status == 1).
plt.figure(figsize=(10, 6))
df.status.hist()
plt.xlabel('Status')
plt.ylabel('Frequencies')
# The former bare plt.plot() call drew nothing and has been dropped.
plt.show()

### The patients affected with Parkinson's disease have high NHR which is the measure of the ratio of noise to tonal components in the voice.
 - Bar graph with status on the x-axis and NHR on the y-axis

In [None]:
# NHR per status group, drawn on an explicitly created 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='status', y='NHR', ax=ax);
plt.show()

#### The patients affected with Parkinson's disease have high HNR that is the measure of the ratio of noise to tonal components in the voice
- create a bar graph X-Axis status and Y-Axis HNR

In [None]:
# HNR per status group, drawn on an explicitly created 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='status', y='HNR', ax=ax)
plt.show()

#### The nonlinear dynamical complexity measure RPDE is high in the patients affected with Parkinson's disease.
- create bar plot of the X-Axis status and Y-Axis RPDE

In [None]:
# RPDE per status group, drawn on an explicitly created 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='status', y='RPDE', ax=ax);
plt.show()

### Distribution plot

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Distribution of up to rows*cols columns on a rows x cols grid. The first
# column (index 0) is skipped, exactly as the original index counter did.
rows = 3
cols = 7
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 4))

# Slicing never raises, so a frame with fewer columns than grid cells just
# leaves the trailing axes empty instead of failing with an IndexError.
plot_columns = df.columns[1:1 + rows * cols]

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn documents for distplot's default behaviour.
for axis, column in zip(ax.flat, plot_columns):
    sns.histplot(df[column], kde=True, stat='density',
                 kde_kws=dict(cut=3), ax=axis)

plt.tight_layout()
plt.show()

### Relational matrix

In [None]:
# Data type of every column
print(df.dtypes)

# Names of the columns whose dtype is not numeric, as a plain list
non_numeric_columns = list(df.select_dtypes(exclude=['number']).columns)
print("Non-numeric columns:", non_numeric_columns)


In [None]:
# Keep only the numeric columns (the correlation matrix needs numbers)
df_numeric = df.select_dtypes(include='number')


In [None]:
# Calculate the pairwise correlation matrix of the numeric columns
# (DataFrame.corr with its default settings)
corr = df_numeric.corr()

# Display the correlation matrix
display(corr)


In [None]:
# Bare expression: the notebook echoes the column labels as the cell output.
df.columns

### Heat Map

In [None]:
from matplotlib.pylab import rcParams

# Enlarge the default figure so the annotated cells stay legible.
rcParams['figure.figsize'] = (20, 15)

# Annotated correlation heat map; both axes are labelled with the column names.
tick_labels = corr.columns
sns.heatmap(
    corr,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap='cubehelix',
    annot=True,
)
plt.show()

In [None]:
# Restrict the frame to numeric columns before correlating
df_numeric = df.select_dtypes(include=[np.number])
numeric_corr = df_numeric.corr()

# Heat map of the numeric correlation matrix, annotated to two decimals
plt.figure(figsize=(20, 10))
sns.heatmap(numeric_corr, cmap='cubehelix', annot=True, fmt='.2f')
plt.show()


In [None]:
# Check the non-numeric columns (kept as a pandas Index here, not a list)
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns: ", non_numeric_columns)


In [None]:
from matplotlib.pylab import rcParams

# Plain (un-annotated) heat map of the correlation matrix on a 20x10 canvas.
rcParams['figure.figsize'] = (20, 10)
sns.heatmap(data=corr)
plt.show()

### Box Plot

In [None]:
# One box plot per numeric feature, split by `status`.
# Features are picked by name instead of by position: the old
# range(1, len(df.columns) - 1) loop assumed the target was the last column,
# left axes[0] unused, and could plot `status` against itself or skip a
# feature depending on where `status` sits in the column order.
feature_columns = [
    column
    for column in df.select_dtypes(include='number').columns
    if column != 'status'
]

fig, axes = plt.subplots(5, 5, figsize=(15, 15))
axes = axes.flatten()

# NOTE: the grid holds 25 plots; zip() stops at the shorter sequence, so any
# features beyond that would not be drawn.
for axis, column in zip(axes, feature_columns):
    sns.boxplot(x='status', y=column, data=df, orient='v', ax=axis)

# Hide grid cells that did not receive a plot.
for axis in axes[len(feature_columns):]:
    axis.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of the jitter measures, coloured by status.
plt.rcParams['figure.figsize'] = (15, 4)
jitter_columns = [
    'MDVP:Jitter(%)',
    'MDVP:Jitter(Abs)',
    'MDVP:RAP',
    'MDVP:PPQ',
    'Jitter:DDP',
]
sns.pairplot(df, hue='status', vars=jitter_columns)
plt.show()

In [None]:
# Pairwise scatter plots of the shimmer measures, coloured by status.
plt.rcParams['figure.figsize'] = (15, 4)
shimmer_columns = [
    'MDVP:Shimmer',
    'MDVP:Shimmer(dB)',
    'Shimmer:APQ3',
    'Shimmer:APQ5',
    'MDVP:APQ',
    'Shimmer:DDA',
]
sns.pairplot(df, hue='status', vars=shimmer_columns)
plt.show()

###  Deleting 'name' column

In [None]:
# Reload the raw file and discard the 'name' column in a single chained
# expression, so re-running the cell always yields the same frame.
df = pd.read_csv('parkinsons.data').drop(columns=['name'])
print("\nCSV Data after deleting the column 'name':\n")
print(df)

In [None]:
# Separate the predictors (X) from the target (Y), then hold out 20% of the
# rows as a test split with a fixed seed.
from sklearn.model_selection import train_test_split

X = df.drop(columns=['status'])
display(X.head())

Y = df['status']
display(Y.head())

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=40
)
print(X.shape, Y.shape)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with library defaults on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)

# Prediction and accuracy on the training split
train_preds = log_reg.predict(X_train)
print("Model Accuracy on train is:", accuracy_score(Y_train, train_preds))

# Prediction and accuracy on the held-out test split
test_preds = log_reg.predict(X_test)
print("Model Accuracy on test is:", accuracy_score(Y_test, test_preds))

print('-'*50)

### ROC Curve and Confusion Matrix

In [None]:
# ROC curve of the logistic regression on the test split. Column 1 of
# predict_proba is the probability assigned to the second class.
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)

plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Confusion matrices for the logistic regression, train split then test split
logreg_cm_train = confusion_matrix(Y_train, train_preds)
logreg_cm_test = confusion_matrix(Y_test, test_preds)
print("confusion_matrix train is:\n ", logreg_cm_train)
print("confusion_matrix test is:\n ", logreg_cm_test)

# Per-class precision / recall / F1 for each split
print("\n Classification Report Train is ")
print(classification_report(Y_train, train_preds))
print("\n Classification Report Test is")
print(classification_report(Y_test, test_preds))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Plot the logistic-regression confusion matrix for the training split.
cm = confusion_matrix(y_true=Y_train, y_pred=train_preds)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

In [None]:
# Plot the logistic-regression confusion matrix for the test split.
cm = confusion_matrix(Y_test, test_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split. The seed is fixed (same value
# as the train/test split above) so that the forest, and therefore every
# score derived from it, is identical on each Restart & Run All.
RF = RandomForestClassifier(random_state=40).fit(X_train, Y_train)

# Prediction on train
train_preds2 = RF.predict(X_train)

# Accuracy on train
print("Model accuracy on train is:", accuracy_score(Y_train, train_preds2))

# Prediction on test
test_preds2 = RF.predict(X_test)

# Accuracy on test
print("Model accuracy on test is:", accuracy_score(Y_test, test_preds2))

In [None]:
# Confusion matrices for the random forest, train split then test split
rf_cm_train = confusion_matrix(Y_train, train_preds2)
rf_cm_test = confusion_matrix(Y_test, test_preds2)
print("confusion_matrix train is:\n ", rf_cm_train)
print("confusion_matrix test is:\n ", rf_cm_test)

# Per-class precision / recall / F1 for each split
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds2))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds2))

In [None]:
# Wrong predictions of the random forest on the test split, shown as
# "<misclassified> / <total>". Matches plus mismatches is every test row.
rf_wrong = (Y_test != test_preds2).sum()
print(rf_wrong, '/', len(Y_test))

In [None]:
# Cohen's kappa between the true test labels and the random-forest predictions.
#   Kappa = 1      -> perfect agreement
#   Kappa close 0  -> agreement no better than expected by chance
#   Kappa < 0      -> agreement weaker than chance (rare)
rf_kappa = metrics.cohen_kappa_score(Y_test, test_preds2)
print('KappaScore is: ', rf_kappa)

### Display the test and Predicted Values 

In [None]:
# Put the random-forest predictions and the true test labels side by side.
# The two rows are labelled so the reader can tell which is which; the
# 2 x N layout is kept so that the transposed view below still works.
ddf = pd.DataFrame(data=[test_preds2, Y_test], index=['predicted', 'actual'])
display(ddf)

### Transpose and display

In [None]:
# Transposed view: one row per test sample instead of one column per sample.
display (ddf.T) 

### Decision Tree Classifier  

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit the tree ONCE, on the training split only. It used to be fitted on the
# full (X, Y), which contains the test rows, so the reported test accuracy
# measured memorisation rather than generalisation (data leakage).
# The seed is fixed so the fitted tree is reproducible.
DT = DecisionTreeClassifier(random_state=40).fit(X_train, Y_train)

# Prediction on train
train_preds3 = DT.predict(X_train)

# Accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds3))

# Prediction on test (same fitted model; it is never refitted on test data)
test_preds3 = DT.predict(X_test)

# Accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds3))
print('-'*50)

# Confusion Matrix
print("confusion_matrix train is:\n ", confusion_matrix(Y_train, train_preds3))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds3))
print('Wrong predictions out of total')
print('-'*50)

# Per-class precision / recall / F1 for each split
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds3))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds3))

#### Wrong Prediction and Kappa Score   
#### Wrong Predictions made.

In [None]:
# Wrong predictions of the decision tree on the test split, shown as
# "<misclassified> / <total>".
dt_wrong = (Y_test != test_preds3).sum()
print(dt_wrong, '/', len(Y_test))
print('-'*50)

In [None]:
# Cohen's kappa between the true test labels and the decision-tree predictions
dt_kappa = metrics.cohen_kappa_score(Y_test, test_preds3)
print('KappaScore is: ', dt_kappa)

####  Kappa = 1, perfect agreement exists. 

### Naïve Bayes Algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
import os,sys
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fit a Gaussian naive Bayes classifier on the training split
# (fit() returns the estimator itself, so it can be chained).
NB = GaussianNB().fit(X_train, Y_train)

# Predictions for both splits
train_preds4 = NB.predict(X_train)
test_preds4 = NB.predict(X_test)

# Accuracy on train, then on test
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds4))
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds4))
print('-'*50)

# Confusion matrices, train split then test split
nb_cm_train = confusion_matrix(Y_train, train_preds4)
nb_cm_test = confusion_matrix(Y_test, test_preds4)
print("confusion_matrix train is: \n", nb_cm_train)
print("confusion_matrix test is:\n ", nb_cm_test)
print('Wrong predictions out of total')
print('-'*50)

# Per-class precision / recall / F1 for each split
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds4))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds4))

# Wrong predictions on the test split, shown as "<misclassified> / <total>"
nb_wrong = (Y_test != test_preds4).sum()
print(nb_wrong, '/', len(Y_test))
print('-'*50)

# Cohen's kappa between the true test labels and the predictions
nb_kappa = metrics.cohen_kappa_score(Y_test, test_preds4)
print('KappaScore is: ', nb_kappa)

#### Wrong prediction and kappa score

In [None]:
# Wrong predictions made by the naive Bayes model on the test split, shown as
# "<misclassified> / <total>".
# .sum() must be applied to the element-wise mismatch mask. A misplaced
# parenthesis used to compare Y_test with the scalar test_preds4.sum() and
# print a tuple containing a whole boolean Series instead of a count.
print((Y_test != test_preds4).sum(), '/', ((Y_test == test_preds4).sum() + (Y_test != test_preds4).sum()))
print('-'*50)

# Kappa score
print('KappaScore is: ', metrics.cohen_kappa_score(Y_test, test_preds4))

### K Neighbours Classifier

In [None]:
# Imported here because this is the first cell that uses the class; without
# it a fresh kernel (Restart & Run All) fails with a NameError, since the
# notebook's only other import of it sits in a later cell.
from sklearn.neighbors import KNeighborsClassifier

# Sweep the number of neighbours over 2 .. Ks-1 and record test accuracy.
Ks = 10
mean_acc = []
ConfustionMx = []  # never filled in this cell; kept in case other code reads it
for n in range(2, Ks):

    # Train the model on the training split and predict the test split
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, Y_train)
    yhat = neigh.predict(X_test)
    mean_acc.append(metrics.accuracy_score(Y_test, yhat))
print('Neighbor Accuracy List')
print(mean_acc)

In [None]:
# Test accuracy as a function of K (green line), for the K values swept above.
k_values = range(2, Ks)
plt.plot(k_values, mean_acc, 'g')
plt.xlabel('Number of Neighbours (K)')
plt.ylabel('Accuracy ')
plt.tight_layout()
plt.show()

In [None]:
 from sklearn.neighbors  import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (library defaults) on the training split
KNN = KNeighborsClassifier()
KNN.fit(X_train, Y_train)

# Predictions for both splits
train_preds5 = KNN.predict(X_train)
test_preds5 = KNN.predict(X_test)

# Accuracy on train, then on test
print("Model accuracy on train is :", accuracy_score(Y_train, train_preds5))
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds5))
print('-'*50)

# Confusion matrices, train split then test split
knn_cm_train = confusion_matrix(Y_train, train_preds5)
knn_cm_test = confusion_matrix(Y_test, test_preds5)
print("confusion_matrix train is:\n ", knn_cm_train)
print("confusion_matrix test is:\n ", knn_cm_test)
print('Wrong predictions out of total')
print('-'*50)

# Per-class precision / recall / F1 for each split
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds5))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds5))

In [None]:
# Wrong predictions and kappa score for the KNN model

# Misclassified test samples, shown as "<misclassified> / <total>"
knn_wrong = (Y_test != test_preds5).sum()
print(knn_wrong, '/', len(Y_test))

print('-'*50)
# Cohen's kappa between the true test labels and the KNN predictions
knn_kappa = metrics.cohen_kappa_score(Y_test, test_preds5)
print('KappaScore is: ', knn_kappa)


### Support Vector Machine 

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the training split
SVM = SVC(kernel='linear')
SVM.fit(X_train, Y_train)

# Predictions for both splits
train_preds6 = SVM.predict(X_train)
test_preds6 = SVM.predict(X_test)

# Accuracy on train, then on test
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds6))
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds6))
print('-'*50)

# Confusion matrices, train split then test split
svm_cm_train = confusion_matrix(Y_train, train_preds6)
svm_cm_test = confusion_matrix(Y_test, test_preds6)
print("confusion_matrix train is: \n", svm_cm_train)
print("confusion_matrix test is:\n ", svm_cm_test)
print('Wrong predictions out of total')
print('-'*50)

# Recall on the test split
svm_recall = metrics.recall_score(Y_test, test_preds6)
print("recall", svm_recall)
print('-'*50)

# Per-class precision / recall / F1 for each split
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds6))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds6))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Candidate hyper-parameters: three values of C crossed with two kernels.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

# 5-fold cross-validated search over the grid, fitted on the training split.
# The fit call is the last expression, so the notebook echoes the estimator.
grid_SVC = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_SVC.fit(X_train, Y_train)

In [None]:
# Best estimator found by the grid search
best_svc = grid_SVC.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = best_svc.predict(X_test)

# Build the matrix from the tuned model's own test predictions. It used to be
# built from the training predictions of the earlier, untuned linear SVM, so
# the plot did not describe best_svc at all and y_pred went unused.
# `labels` pins the row/column order to the display labels used below.
cm = confusion_matrix(Y_test, y_pred, labels=best_svc.classes_)

# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_svc.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for SVM', y=1.1)
plt.show()


In [None]:
# Wrong predictions and kappa score for the linear SVM

# Misclassified test samples, shown as "<misclassified> / <total>"
svm_wrong = (Y_test != test_preds6).sum()
print(svm_wrong, '/', len(Y_test))
print('-'*50)
# Cohen's kappa between the true test labels and the SVM predictions
svm_kappa = metrics.cohen_kappa_score(Y_test, test_preds6)
print('KappaScore is: ', svm_kappa)

### Create Pickle File    

In [None]:
import pickle

# Save the fitted linear SVM to disk. The context manager closes the file
# handle (flushing the data) even if pickling raises; the bare open() calls
# used before never closed their handles.
with open('deploy_SVM.pkl', 'wb') as model_file:
    pickle.dump(SVM, model_file)

# Reload the model from the file written above.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickle files that come from a trusted source.
with open('deploy_SVM.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Check that the reloaded model predicts (here on the training split)
print(model.predict(X_train))
