In [44]:
import pandas as pd
import numpy as np

In [45]:
Input_df = pd.read_csv('input/parkinsons')
df = Input_df.copy()

In [46]:
df.columns

Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')

In [47]:
df.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [48]:
df.dtypes

name                 object
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
MDVP:Jitter(%)      float64
MDVP:Jitter(Abs)    float64
MDVP:RAP            float64
MDVP:PPQ            float64
Jitter:DDP          float64
MDVP:Shimmer        float64
MDVP:Shimmer(dB)    float64
Shimmer:APQ3        float64
Shimmer:APQ5        float64
MDVP:APQ            float64
Shimmer:DDA         float64
NHR                 float64
HNR                 float64
status                int64
RPDE                float64
DFA                 float64
spread1             float64
spread2             float64
D2                  float64
PPE                 float64
dtype: object

In [49]:
X = df.drop(['status','name'],axis = 'columns')
y = df['status']

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state = 1)

In [52]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression(penalty = 'l2',solver = 'newton-cg')

lgr.fit(X_train,y_train)
print(lgr.score(X_train,y_train))
print(lgr.score(X_test,y_test))

0.8897058823529411
0.7966101694915254


In [54]:
logModel = LogisticRegression()

In [55]:
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
clf = GridSearchCV(logModel, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [58]:
best_clf = clf.fit(X,y)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 1308 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 3924 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 4793 out of 4800 | elapsed:  1.8min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed:  1.8min finished


In [59]:
best_clf.best_estimator_

LogisticRegression(C=0.615848211066026, max_iter=2500, penalty='l1',
                   solver='liblinear')

In [60]:
print (f'Accuracy - : {best_clf.score(X,y):.3f}')

Accuracy - : 0.851


In [61]:
lg = LogisticRegression(C=0.615848211066026, max_iter=2500, penalty='l1',
                   solver='liblinear')

In [62]:
lg.fit(X_train, y_train)

LogisticRegression(C=0.615848211066026, max_iter=2500, penalty='l1',
                   solver='liblinear')

In [63]:
lg.score(X_train,y_train)

0.8897058823529411

In [66]:
from sklearn.preprocessing import MinMaxScaler

In [67]:
X_norm = MinMaxScaler().fit_transform(X)

In [68]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# feature engineering. Let's see the best 3 features by setting k = 3
kBest_chi = SelectKBest(score_func=chi2, k=3)
fit_test = kBest_chi.fit(X_norm, y)

In [69]:
# print test scores
fit_test.scores_

array([4.31911959, 0.96351509, 6.01407496, 2.46940099, 2.88520414,
       2.22850674, 2.61083648, 2.22759792, 4.21534209, 3.77465835,
       4.18788904, 3.85307328, 3.35236987, 4.18576878, 1.49598689,
       1.50116908, 1.92439718, 0.88372961, 5.83205241, 2.85383644,
       1.52725606, 5.68001265])