In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler 

In [2]:
# loading the data from csv file to a Pandas DataFrame
parkinsons_data = pd.read_csv('/kaggle/input/parkinsons/parkinsons.csv')

In [3]:
# printing the first 5 rows of the dataframe
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
# number of rows and columns in the dataframe
parkinsons_data.shape

(195, 24)

In [5]:
# getting more information about the dataset
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [6]:
# checking for missing values in each column
parkinsons_data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [7]:
# getting some statistical measures about the data
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [8]:
# distribution of target Variable
parkinsons_data['status'].value_counts()

status
1    147
0     48
Name: count, dtype: int64

In [9]:
# grouping the data bas3ed on the target variable
#parkinsons_data.groupby('status').mean()

In [10]:
#parkinsons_data.drop(columns=['name'], axis=1)
# grouping the data bas3ed on the target variable
parkinsons_data.head()
parkinsons_data.groupby(['status', 'name']).mean()

#parkinsons_data.groupby('status').mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,phon_R01_S07_1,197.076,206.896,192.055,0.00289,0.000010,0.00166,0.00168,0.00498,0.01098,0.097,...,0.00802,0.01689,0.00339,26.775,0.422229,0.741367,-7.348300,0.177551,1.743867,0.085569
0,phon_R01_S07_2,199.228,209.512,192.091,0.00241,0.000010,0.00134,0.00138,0.00402,0.01015,0.089,...,0.00762,0.01513,0.00167,30.940,0.432439,0.742055,-7.682587,0.173319,2.103106,0.068501
0,phon_R01_S07_3,198.383,215.203,193.104,0.00212,0.000010,0.00113,0.00135,0.00339,0.01263,0.111,...,0.00951,0.01919,0.00119,30.775,0.465946,0.738703,-7.067931,0.175181,1.512275,0.096320
0,phon_R01_S07_4,202.266,211.604,197.079,0.00180,0.000009,0.00093,0.00107,0.00278,0.00954,0.085,...,0.00719,0.01407,0.00072,32.684,0.368535,0.742133,-7.695734,0.178540,1.544609,0.056141
0,phon_R01_S07_5,203.184,211.526,196.160,0.00178,0.000009,0.00094,0.00106,0.00283,0.00958,0.085,...,0.00726,0.01403,0.00065,33.047,0.340068,0.741899,-7.964984,0.163519,1.423287,0.044539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,phon_R01_S44_2,148.790,158.359,138.990,0.00309,0.000020,0.00152,0.00186,0.00456,0.01574,0.142,...,0.01309,0.02518,0.00488,24.412,0.402591,0.762508,-6.311987,0.182459,2.251553,0.160306
1,phon_R01_S44_3,148.143,155.982,135.041,0.00392,0.000030,0.00204,0.00231,0.00612,0.01450,0.131,...,0.01263,0.02175,0.00540,23.683,0.398499,0.778349,-5.711205,0.240875,2.845109,0.192730
1,phon_R01_S44_4,150.440,163.441,144.736,0.00396,0.000030,0.00206,0.00233,0.00619,0.02551,0.237,...,0.02148,0.03964,0.00611,23.133,0.352396,0.759320,-6.261446,0.183218,2.264226,0.144105
1,phon_R01_S44_5,148.462,161.078,141.998,0.00397,0.000030,0.00202,0.00235,0.00605,0.01831,0.163,...,0.01559,0.02849,0.00639,22.866,0.408598,0.768845,-5.704053,0.216204,2.679185,0.197710


In [11]:
X = parkinsons_data.drop(columns=['name','status'], axis=1)
Y = parkinsons_data['status']

In [12]:
print(X)

     MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0        119.992       157.302        74.997         0.00784   
1        122.400       148.650       113.819         0.00968   
2        116.682       131.111       111.555         0.01050   
3        116.676       137.871       111.366         0.00997   
4        116.014       141.781       110.655         0.01284   
..           ...           ...           ...             ...   
190      174.188       230.978        94.261         0.00459   
191      209.516       253.017        89.488         0.00564   
192      174.688       240.005        74.287         0.01360   
193      198.764       396.961        74.904         0.00740   
194      214.289       260.277        77.973         0.00567   

     MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
0             0.00007   0.00370   0.00554     0.01109       0.04374   
1             0.00008   0.00465   0.00696     0.01394       0.06134   
2             0.00

In [13]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: status, Length: 195, dtype: int64


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [15]:
print(X.shape, X_train.shape, X_test.shape)

(195, 22) (156, 22) (39, 22)


In [16]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
print(X_train)

[[ 0.63239631 -0.02731081 -0.87985049 ... -0.97586547 -0.55160318
   0.07769494]
 [-1.05512719 -0.83337041 -0.9284778  ...  0.3981808  -0.61014073
   0.39291782]
 [ 0.02996187 -0.29531068 -1.12211107 ... -0.43937044 -0.62849605
  -0.50948408]
 ...
 [-0.9096785  -0.6637302  -0.160638   ...  1.22001022 -0.47404629
  -0.2159482 ]
 [-0.35977689  0.19731822 -0.79063679 ... -0.17896029 -0.47272835
   0.28181221]
 [ 1.01957066  0.19922317 -0.61914972 ... -0.716232    1.23632066
  -0.05829386]]


In [18]:
acc = [] # list to store all performance metric

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc

In [20]:
best_score = 0
kfolds=5

for c_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: #iterate over the values we need to try for the parameter C
    for gamma_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: #iterate over the values we need to try for the parameter gamma
        for k_parameter in ['rbf', 'linear', 'poly', 'sigmoid']: # iterate over the values we need to try for the kernel parameter
            svmModel = SVC(kernel=k_parameter, C=c_paramter, gamma=gamma_paramter) #define the model
            # perform cross-validation
            scores = cross_val_score(svmModel, X_train, Y_train, cv=kfolds, scoring='accuracy')
            # the training set will be split internally into training and cross validation

            # compute mean cross-validation accuracy
            score = np.mean(scores)
            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score #store the score 
                best_parameter_c = c_paramter #store the parameter c
                best_parameter_gamma = gamma_paramter #store the parameter gamma
                best_parameter_k = k_parameter
            

# rebuild a model with best parameters to get score 
SelectedSVMmodel = SVC(C=best_parameter_c, gamma=best_parameter_gamma, kernel=best_parameter_k).fit(X_train, Y_train)

test_score = SelectedSVMmodel.score(X_test, Y_test)
PredictedOutput = SelectedSVMmodel.predict(X_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on cross validation set is:", best_score)
print("Best parameter for c is: ", best_parameter_c)
print("Best parameter for gamma is: ", best_parameter_gamma)
print("Best parameter for kernel is: ", best_parameter_k)
print("Test accuracy with the best parameters is", test_score)
print("Test recall with the best parameters is", test_recall)
print("Test recall with the best parameter is", test_auc)

m = 'SVM'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on cross validation set is: 0.9356854838709678
Best parameter for c is:  100
Best parameter for gamma is:  0.1
Best parameter for kernel is:  rbf
Test accuracy with the best parameters is 0.9230769230769231
Test recall with the best parameters is 0.9032258064516129
Test recall with the best parameter is 0.9516129032258065


In [21]:
#model = svm.SVC(kernel='linear')
# training the SVM model with training data
#model.fit(X_train, Y_train)
# accuracy score on training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#print('Accuracy score of training data : ', training_data_accuracy)
# accuracy score on training data
#X_test_prediction = model.predict(X_test)
#test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#print('Accuracy score of test data : ', test_data_accuracy)
#m = 'SVM'
#acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

In [22]:
best_score=0
#kfolds=5 # set the number of folds

for c in [0.001, 0.1, 1, 10, 100]:
    logRegModel = LogisticRegression(C=c)
    # perform cross-validation
    scores = cross_val_score(logRegModel, X_train, Y_train, cv=kfolds, scoring='accuracy') # Get recall for each parameter setting
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # Find the best parameters and score
    if score > best_score:
        best_score = score
        best_parameters = c

# rebuild a model on the combined training and validation set
SelectedLogRegModel = LogisticRegression(C=best_parameters).fit(X_train, Y_train)

test_score = SelectedLogRegModel.score(X_test, Y_test)
PredictedOutput = SelectedLogRegModel.predict(X_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for regularization (C) is: ", best_parameters)
print("Test accuracy with best C parameter is", test_score)
print("Test recall with the best C parameter is", test_recall)
print("Test AUC with the best C parameter is", test_auc)
m = 'Logistic Regression (w/ imputation)'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.8590725806451613
Best parameter for regularization (C) is:  0.1
Test accuracy with best C parameter is 0.8205128205128205
Test recall with the best C parameter is 0.9032258064516129
Test AUC with the best C parameter is 0.7016129032258065


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [23]:

# Dataset after dropping missing value rows
best_score=0
kfolds=5 # set the number of folds

# Feature scaling
scaler = MinMaxScaler().fit(X_train)
X_trainval_scaled_dna = scaler.transform(X_train)
X_test_scaled_dna = scaler.transform(X_test)

for c in [0.001, 0.1, 1, 10, 100]:
    logRegModel = LogisticRegression(C=c)
    # perform cross-validation
    scores = cross_val_score(logRegModel, X_trainval_scaled_dna, Y_train, cv=kfolds, scoring='accuracy')
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # Find the best parameters and score
    if score > best_score:
        best_score = score
        best_parameters = c

# rebuild a model on the combined training and validation set
SelectedLogRegModel = LogisticRegression(C=best_parameters).fit(X_trainval_scaled_dna, Y_train)

test_score = SelectedLogRegModel.score(X_test_scaled_dna, Y_test)
PredictedOutput = SelectedLogRegModel.predict(X_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for regularization (C) is: ", best_parameters)
print("Test accuracy with best C parameter is", test_score)        
print("Test recall with the best C parameter is", test_recall)
print("Test AUC with the best C parameter is", test_auc)

m = 'Logistic Regression (w/ dropna)'
acc.append([m, test_score, test_recall, test_recall, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.8399193548387098
Best parameter for regularization (C) is:  1
Test accuracy with best C parameter is 0.8205128205128205
Test recall with the best C parameter is 0.41935483870967744
Test AUC with the best C parameter is 0.7096774193548387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#model = LogisticRegression()
#model.fit(X_train, Y_train)
# accuracy score on training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#print('Accuracy score of training data : ', training_data_accuracy)
# accuracy score on training data
#X_test_prediction = model.predict(X_test)
#test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#print('Accuracy score of test data : ', test_data_accuracy)
#m = 'LogisticRegression'
#acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])


In [25]:
best_score = 0

for md in range(1, 9): # iterate different maximum depth values
    # train the model
    treeModel = DecisionTreeClassifier(random_state=0, max_depth=md, criterion='gini')
    # perform cross-validation
    scores = cross_val_score(treeModel, X_train, Y_train, cv=kfolds, scoring='accuracy')
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # if we got a better score, store the score and parameters
    if score > best_score:
        best_score = score
        best_parameter = md

# Rebuild a model on the combined training and validation set        
SelectedDTModel = DecisionTreeClassifier(max_depth=best_parameter).fit(X_train, Y_train)

test_score = SelectedDTModel.score(X_test, Y_test)
PredictedOutput = SelectedDTModel.predict(X_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for the maximum depth is: ", best_parameter)
print("Test accuracy with best parameter is ", test_score)
print("Test recall with best parameters is ", test_recall)
print("Test AUC with the best parameter is ", test_auc)

m = 'Decision Tree'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.8655241935483872
Best parameter for the maximum depth is:  1
Test accuracy with best parameter is  0.7948717948717948
Test recall with best parameters is  0.9032258064516129
Test AUC with the best parameter is  0.6391129032258065


In [26]:
print("Feature importance: ")
np.array([X.columns.values.tolist(), list(SelectedDTModel.feature_importances_)]).T

Feature importance: 


array([['MDVP:Fo(Hz)', '0.0'],
       ['MDVP:Fhi(Hz)', '0.0'],
       ['MDVP:Flo(Hz)', '0.0'],
       ['MDVP:Jitter(%)', '0.0'],
       ['MDVP:Jitter(Abs)', '0.0'],
       ['MDVP:RAP', '0.0'],
       ['MDVP:PPQ', '0.0'],
       ['Jitter:DDP', '0.0'],
       ['MDVP:Shimmer', '0.0'],
       ['MDVP:Shimmer(dB)', '0.0'],
       ['Shimmer:APQ3', '0.0'],
       ['Shimmer:APQ5', '0.0'],
       ['MDVP:APQ', '0.0'],
       ['Shimmer:DDA', '0.0'],
       ['NHR', '0.0'],
       ['HNR', '0.0'],
       ['RPDE', '0.0'],
       ['DFA', '0.0'],
       ['spread1', '1.0'],
       ['spread2', '0.0'],
       ['D2', '0.0'],
       ['PPE', '0.0']], dtype='<U32')

In [27]:
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#SelectedDTModel = DecisionTreeClassifier(max_depth=best_parameter).fit(X_trainval_scaled, Y_trainval )
#model = DecisionTreeClassifier()
#model.fit(X_train, Y_train)
# accuracy score on training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#print('Accuracy score of training data : ', training_data_accuracy)
# accuracy score on training data
#X_test_prediction = model.predict(X_test)
#test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#print('Accuracy score of test data : ', test_data_accuracy)
#m = 'DecisionTreeClassifier'
#acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

In [28]:
#from sklearn.tree import export_graphviz
#import graphviz 
#dot_data=export_graphviz(SelectedDTModel, feature_names=X_train.values.tolist(),out_file=None)
#graph = graphviz.Source(dot_data)  
#graph 

In [29]:
best_score = 0

for M in range(2, 15, 2): # combines M trees
    for d in range(1, 9): # maximum number of features considered at each split
        for m in range(1, 9): # maximum depth of the tree
            # train the model
            # n_jobs(4) is the number of parallel computing
            forestModel = RandomForestClassifier(n_estimators=M, max_features=d, n_jobs=4,
                                          max_depth=m, random_state=0)
        
            # perform cross-validation
            scores = cross_val_score(forestModel, X_train, Y_train, cv=kfolds, scoring='accuracy')

            # compute mean cross-validation accuracy
            score = np.mean(scores)

            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score
                best_M = M
                best_d = d
                best_m = m

# Rebuild a model on the combined training and validation set        
SelectedRFModel = RandomForestClassifier(n_estimators=M, max_features=d,
                                          max_depth=m, random_state=0).fit(X_train, Y_train )

PredictedOutput = SelectedRFModel.predict(X_test)
test_score = SelectedRFModel.score(X_test, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameters of M, d, m are: ", best_M, best_d, best_m)
print("Test accuracy with the best parameters is", test_score)
print("Test recall with the best parameters is:", test_recall)
print("Test AUC with the best parameters is:", test_auc)

m = 'Random Forest'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.8909274193548388
Best parameters of M, d, m are:  14 1 4
Test accuracy with the best parameters is 0.8717948717948718
Test recall with the best parameters is: 0.9032258064516129
Test AUC with the best parameters is: 0.8266129032258065


In [30]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#SelectedDTModel = DecisionTreeClassifier(max_depth=best_parameter).fit(X_trainval_scaled, Y_trainval )
#model = RandomForestClassifier()
#model.fit(X_train, Y_train)
# accuracy score on training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#print('Accuracy score of training data : ', training_data_accuracy)
# accuracy score on training data
#X_test_prediction = model.predict(X_test)
#test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#print('Accuracy score of test data : ', test_data_accuracy)
#m = 'RandomForestClassifier'
#acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

In [31]:
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#model = AdaBoostClassifier()
#model.fit(X_train, Y_train)
# accuracy score on training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#print('Accuracy score of training data : ', training_data_accuracy)
# accuracy score on training data
#X_test_prediction = model.predict(X_test)
#test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#print('Accuracy score of test data : ', test_data_accuracy)
#m = 'AdaBoostClassifier'
#acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

In [32]:
best_score = 0
 

for M in range(2, 15, 2): # combines M trees
    for lr in [0.0001, 0.001, 0.01, 0.1, 1]:
        # train the model
        boostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0)

        # perform cross-validation
        scores = cross_val_score(boostModel, X_train, Y_train, cv=kfolds, scoring='accuracy')

        # compute mean cross-validation accuracy
        score = np.mean(scores)

        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_M = M
            best_lr = lr

# Rebuild a model on the combined training and validation set        
SelectedBoostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0).fit(X_train, Y_train )

PredictedOutput = SelectedBoostModel.predict(X_test)
test_score = SelectedBoostModel.score(X_test, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter of M is: ", best_M)
print("best parameter of LR is: ", best_lr)
print("Test accuracy with the best parameter is", test_score)
print("Test recall with the best parameters is:", test_recall)
print("Test AUC with the best parameters is:", test_auc)

m = 'AdaBoost'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.8655241935483872
Best parameter of M is:  2
best parameter of LR is:  0.0001
Test accuracy with the best parameter is 0.8205128205128205
Test recall with the best parameters is: 0.8709677419354839
Test AUC with the best parameters is: 0.747983870967742


In [33]:
# Performance Metric for each model
result = pd.DataFrame(acc, columns=['Model', 'Accuracy', 'Recall', 'AUC', 'FPR', 'TPR', 'TH'])
result[['Model', 'Accuracy', 'Recall', 'AUC']]

Unnamed: 0,Model,Accuracy,Recall,AUC
0,SVM,0.923077,0.903226,0.951613
1,Logistic Regression (w/ imputation),0.820513,0.903226,0.701613
2,Logistic Regression (w/ dropna),0.820513,0.419355,0.419355
3,Decision Tree,0.794872,0.903226,0.639113
4,Random Forest,0.871795,0.903226,0.826613
5,AdaBoost,0.820513,0.870968,0.747984
