In [None]:
#data processing
import pandas as pd
import numpy as np
import missingno
from collections import Counter

#Machine learning models 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#model evaluation and hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('../input/parkinsons-disease-classification/pd_speech_features.csv',index_col=0, delimiter=',', skiprows=1)

In [None]:
df.head()

In [None]:
df.apply(lambda x: sum(x.isnull()),axis=0)

In [None]:
df.describe()

In [None]:
df[['gender','class']].groupby('gender',as_index= False).mean().sort_values(by= 'class',ascending = False)

In [None]:
sns.barplot(x='gender',y='class',data = df)
plt.ylabel('Positivity Rate')
plt.title ('Gender Based Positive')

# Handling Outliers

In [None]:
def outliers(data, n, specs):
    out_i=[]
    for col in specs:
        quar1=np.percentile(data[col],25)
        quar3=np.percentile(data[col],75)
        quar=quar3-quar1
        outlier_step = 1.5 * quar 
        outlier_list_col = data[(data[col] < quar1 - outlier_step) | (data[col] > quar3 + outlier_step)].index
        out_i.extend(outlier_list_col) 
    out_i = Counter(out_i)
    multiple_outliers = list(key for key, value in out_i.items() if value > n) 
    return multiple_outliers
outliers_to_drop = outliers(df, 2, ['PPE', 'DFA', 'RPDE', 'numPulses','numPeriodsPulses','meanPeriodPulses','locPctJitter'])
outliers_to_drop

In [None]:
print("Before: {} rows".format(len(df)))
df = df.drop(outliers_to_drop, axis = 0).reset_index(drop = True)
print("After: {} rows".format(len(df)))

In [None]:
sns.heatmap(df[['class', 'gender', 'PPE', 'DFA', 'RPDE','numPulses','numPeriodsPulses','meanPeriodPulses','locPctJitter']].corr(), annot = True, fmt = '.2f', cmap = 'coolwarm')

# Skewness

## DFA

In [None]:
sns.distplot(df['DFA'], label = 'Skewness: %.2f'%(df['DFA'].skew()))
plt.legend(loc = 'best')
plt.title('DFA distribution')

In [None]:
sns.kdeplot(df['DFA'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['DFA'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('DFA')
plt.title('Relation of DFA and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'DFA')

# gender

In [None]:
sns.distplot(df['gender'], label = 'Skewness: %.2f'%(df['gender'].skew()))
plt.legend(loc = 'best')
plt.title('gender distribution')

In [None]:
sns.kdeplot(df['gender'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['gender'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('gender')
plt.title('Relation of gender and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'gender')

## RPDE

In [None]:
sns.distplot(df['RPDE'], label = 'Skewness: %.2f'%(df['RPDE'].skew()))
plt.legend(loc = 'best')
plt.title('RPDE distribution')

In [None]:
sns.kdeplot(df['RPDE'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['RPDE'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('RPDE')
plt.title('Relation of RPDE and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'RPDE')

## numPulses

In [None]:
sns.distplot(df['numPulses'], label = 'Skewness: %.2f'%(df['numPulses'].skew()))
plt.legend(loc = 'best')
plt.title('numPulses distribution')

In [None]:
sns.kdeplot(df['numPulses'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['numPulses'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('numPulses')
plt.title('Relation of numPulses and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'numPulses')

## numPeriodsPulses

In [None]:
sns.distplot(df['numPeriodsPulses'], label = 'Skewness: %.2f'%(df['numPeriodsPulses'].skew()))
plt.legend(loc = 'best')
plt.title('numPeriodsPulses distribution')

In [None]:
sns.kdeplot(df['numPeriodsPulses'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['numPeriodsPulses'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('numPeriodsPulses')
plt.title('Relation of numPeriodsPulses and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'numPeriodsPulses')

## locPctJitter


In [None]:
sns.distplot(df['locPctJitter'], label = 'Skewness: %.2f'%(df['locPctJitter'].skew()))
plt.legend(loc = 'best')
plt.title('locPctJitter distribution')

In [None]:
sns.kdeplot(df['locPctJitter'][df['class'] == 0], label = 'Do not have disease')
sns.kdeplot(df['locPctJitter'][df['class'] == 1], label = 'have disease')
plt.legend()
plt.xlabel('locPctJitter')
plt.title('Relation of locPctJitter and disease')

In [None]:
g = sns.FacetGrid(df, col = 'class')
g.map(sns.distplot, 'RPDE')

In [None]:
# Apply log transformation to locPctJitter column to reduce skewness

df['locPctJitter'] = df['locPctJitter'].map(lambda x: np.log(x) if x > 0 else 0)

In [None]:
sns.distplot(df['locPctJitter'], label = 'Skewness: %.2f'%(df['locPctJitter'].skew()))
plt.legend(loc = 'best')
plt.title('locPctJitter distribution')

In [None]:
dataX=df.drop('class',axis=1)
dataY=df['class']

In [None]:
X_train,X_test,Y_train,Y_test=train_test_split(dataX,dataY,test_size=0.15,random_state=42)

In [None]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('Y_train',Y_train.shape)
print('Y_test',Y_test.shape)

In [None]:
#Logistic Regression
lr=LogisticRegression()
lr.fit(X_train,Y_train)
pred_y=lr.predict(X_test)
acc_lr=round(lr.score(X_train,Y_train)*100,2)
acc_lr

In [None]:
#SVM
svm =SVC()
svm.fit(X_train, Y_train)
pred_y= svm.predict(X_test)
acc_svm= round(svm.score(X_train, Y_train)*100,2)
acc_svm

In [None]:
#K-nearest neighbours (KNN)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, Y_train)
pred_y = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

In [None]:
#Gaussian naive bayes
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_pred = gnb.predict(X_test)
acc_gnb = round(gnb.score(X_train, Y_train) * 100, 2)
acc_gnb

In [None]:
#Perceptron
percep = Perceptron()
percep.fit(X_train, Y_train)
pred_y = percep.predict(X_test)
acc_percep = round(percep.score(X_train, Y_train) * 100, 2)
acc_percep

In [None]:
#Linear SVC
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
pred_y = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

In [None]:
#Stochastic gradient descent
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
pred_y = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

In [None]:
#Decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [None]:
#Random forest
random_forest = RandomForestClassifier(n_estimators = 100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

# Accuracy Comparision

In [None]:
models = pd.DataFrame({'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
                                 'Random Forest', 'Naive Bayes', 'Perceptron', 'Stochastic Gradient Decent', 
                                 'Linear SVC', 'Decision Tree'],
                       'Score': [acc_svm, acc_knn, acc_lr, acc_random_forest, acc_gnb, acc_percep,
                                 acc_sgd, acc_linear_svc, acc_decision_tree]})

models.sort_values(by = 'Score', ascending = False, ignore_index = True)

# K fold Cross Validaton

In [None]:
# Create a list which contains classifiers 

classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(SVC())
classifiers.append(KNeighborsClassifier(n_neighbors = 5))
classifiers.append(GaussianNB())
classifiers.append(Perceptron())
classifiers.append(LinearSVC())
classifiers.append(SGDClassifier())
classifiers.append(DecisionTreeClassifier())
classifiers.append(RandomForestClassifier())

len(classifiers)

In [None]:
cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, Y_train, scoring = 'accuracy', cv = 9))

# Classifier cross validation

In [None]:
cv_mean = []
cv_std = []
for cv_result in cv_results:
    cv_mean.append(cv_result.mean())
    cv_std.append(cv_result.std())

In [None]:
cv_res = pd.DataFrame({'Cross Validation Mean': cv_mean, 'Cross Validation Std': cv_std, 'Algorithm': ['Logistic Regression', 'Support Vector Machines', 'KNN', 'Gausian Naive Bayes', 'Perceptron', 'Linear SVC', 'Stochastic Gradient Descent', 'Decision Tree', 'Random Forest']})
cv_res.sort_values(by = 'Cross Validation Mean', ascending = False, ignore_index = True)

In [None]:
sns.barplot('Cross Validation Mean', 'Algorithm', data = cv_res, order = cv_res.sort_values(by = 'Cross Validation Mean', ascending = False)['Algorithm'], palette = 'Set3', **{'xerr': cv_std})
plt.ylabel('Algorithm')
plt.title('Cross Validation Scores')

In [112]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 5 , stop = 15, num = 10)] # returns 10 numbers 

max_features = ['auto', 'log2']

max_depth = [int(x) for x in np.linspace(5, 10, num = 2)] 

max_depth.append(None)

bootstrap = [True, False]

r_grid = {'n_estimators': n_estimators,

               'max_features': max_features,

               'max_depth': max_depth,

               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [121]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state = 1)

In [122]:
rfr_random = RandomizedSearchCV(estimator=rfr, param_distributions=r_grid, n_iter = 20, scoring='neg_mean_absolute_error', cv = 3, verbose=2, random_state=42, n_jobs=-1, return_train_score=True)

rfr_random.fit(X_train, Y_train);

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [123]:
print(rfr_random.best_params_)

{'n_estimators': 6, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False}


In [135]:
#Random forest
random_forest = RandomForestClassifier(n_estimators= 6, max_features= np.log(2), max_depth= 10, bootstrap=False)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

100.0

In [136]:
cross_val_score(random_forest, X_train, Y_train, scoring = 'accuracy', cv = 10).mean()

0.8315584415584414

In [137]:
Y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1])

In [138]:
from sklearn import metrics


In [139]:
print(metrics.classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.70      0.64      0.67        25
           1       0.88      0.91      0.89        74

    accuracy                           0.84        99
   macro avg       0.79      0.77      0.78        99
weighted avg       0.83      0.84      0.84        99



## 

## 

## 