In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# load data
data = pd.read_csv('./clean_data.csv')

# separate x and y
y_label = 'CMI:75 F3a) Epilepsy: Ever suffered'
y_df = data[y_label].copy()
mask_no = y_df == 3
mask_yes = (y_df == 1) | (y_df == 2)
y_df[mask_no] = 0
y_df[mask_yes] = 1
x_df = data.drop(y_label, axis = 1).copy()

In [3]:
variable_of_interest = []
for col in x_df.columns:
    if 'epilep' in col or 'Epilep' in col:
        continue
    if 'convuls' in col or 'Convuls' in col:
        continue
    if 'Special Education' in col:
        continue
    if 'Region' in col:
        continue
    if 'neuro' in col:
        continue
    if 'Reason' in col or 'reason' in col:
        continue
    if 'handicap' in col:
        continue
    if 'psychi' in col:
        continue
    if 'patient' in col:
        continue

    variable_of_interest.append(col)

print '{} predictors in file'.format( x_df.shape[1] )

x_df = x_df[variable_of_interest]

print '{} predictors of interest'.format( x_df.shape[1] )

1474 predictors in file
1351 predictors of interest


In [4]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = pd.DataFrame(x_df[col])
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # change column label
        curr_df.columns = [col]
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

data shape before dummification/standardization (11326, 1351)
data shape after dummification/standardization (11326, 6972)


In [5]:
def scorer_epilepsy_accuracy(estimator, x, y):
    y_predict = estimator.predict(x)
    cm = confusion_matrix(y, y_predict)
    accuracy_all = estimator.score(x, y)
    accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
    accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    return accuracy_all * accuracy_healthy * accuracy_epilepsy

In [6]:
weight_list = [{1:value} for value in 10**np.arange(0, 3, 1)]
penalty_list = ['l1', 'l2']
parameters = {'class_weight':weight_list, 'penalty':penalty_list}
gscv = GridSearchCV(LogisticRegressionCV(solver = 'liblinear'),\
                    parameters, scoring = 'roc_auc')

gscv.fit(x_df_extended.values, y_df.values)
best_model = gscv.best_estimator_

In [7]:
# get coefficient
best_model = gscv.best_estimator_
coef = best_model.coef_.flatten()

most_pos_var_index = np.argsort(coef)[-21:][::-1]
most_neg_var_index = np.argsort(coef)[0:20]

print 'most positive variables'
for index in most_pos_var_index:
    print coef[index], x_df_extended.columns[index]

most positive variables
3.02121427625 3P Type hcap for which will require help=7.0
2.51690133266 CMI:75 F3a) Hernia: Ever suffered=2.0
2.28339003785 1D Defects found in NCDS1 sample-MC 1:4=3.0
1.55806177639 CMI:78 F24 Specialist #1 seen about 'emotional problems'=2.0
1.0383073862 CMI:75 F3a) Gall bladder trouble: Eversuffered=2.0
0.62936017814 CMI:81 F30 Disability makes it harder to get/keep a paid job=1.0
0.570530912694 CMI:75 F3c) Migraine: Seen doctor in last 12 months=nan
0.51692816652 1P Meeting other kids outside household=4.0
0.512662605368 CMI:75 F3b) Arthritus/rheumatism/etc: Suffered in last 12 months=nan
0.440278447745 CMI:76 F7a) Usually bring up phlegm during day/night in winter=1.0
0.385388890744 1D Defects found in NCDS1 sample-MC 1:4=4.0
0.384559642296 1P Mother's spare-time reading-newspaper=4.0
0.382284178564 2M Girls breast rating=2.0
0.375840712436 3M Assessment of speech intelligibility=1.0
0.366032049374 1P No. kids undr 21 hhld,inc liv away=4.0
0.360221494182 2P

In [8]:
best_lr_model = gscv.best_estimator_
kf = KFold(n_splits = 3)
adf = pd.DataFrame({}, columns = ['accuracy_all', 'accuracy_healthy', 'accuracy_epilepsy'])
index = 0

for train_index, test_index in kf.split(x_df_extended.values):
    x_train = x_df_extended.values[train_index, :]
    x_test = x_df_extended.values[test_index, :]
    y_train = y_df.values[train_index]
    y_test = y_df.values[test_index]
    
    best_lr_model.fit(x_train, y_train)
    
    y_predict = best_model.predict_proba(x_test)[:, 0]
    mask = y_predict > (1-1e-7)
    y_predict[mask] = 0
    y_predict[~mask] = 1

    cm = confusion_matrix(y_test, y_predict)
    accuracy_all = float(cm[1,1] + cm[0,0])/float(cm[1,0]+cm[1,1]+cm[0,0]+cm[0,1])
    accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
    
    adf.loc[index] = [accuracy_all, accuracy_healthy, accuracy_epilepsy]

print adf.mean()

accuracy_all         0.722649
accuracy_healthy     0.723073
accuracy_epilepsy    0.692308
dtype: float64


In [9]:
print gscv.best_params_
print gscv.best_score_

{'penalty': 'l1', 'class_weight': {1: 1}}
0.795826037502
