In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

In [28]:
# load data
data = pd.read_csv('./clean_data.csv')

# separate x and y
y_label = 'CMI:75 F3a) Epilepsy: Ever suffered'
y_df = data[y_label].copy()
x_df = data.drop(y_label, axis = 1)

In [29]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = x_df[col].copy()
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(preprocessing.scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

# for y_df
# 1: have epilepsy
# 2: have epilepsy only when pregnant
# 3: not have epilepsy

mask_0 = y_df == 3
y_df_masked = y_df.copy()
y_df_masked[mask_0] = 0
y_df_masked[~mask_0] = 1

data shape before dummification/standardization (11326, 1474)
data shape after dummification/standardization (11326, 7651)


In [41]:
# train test split
x_train, x_test, y_train, y_test = model_selection.train_test_split(\
    x_df_extended.values, y_df_masked.values)

# build model
lr_model = linear_model.LogisticRegressionCV()

# train model
lr_model.fit(x_train, y_train)

# predict
y_test_predict_proba = lr_model.predict_proba(x_test)[:, 0]
threshold = 0.999
mask_0 = y_test_predict_proba > threshold
y_test_predict = np.ones(y_test_predict_proba.shape)
y_test_predict[mask_0] = 0

# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_predict)
cm_df = pd.DataFrame(cm, columns = ['predict 0', 'predict 1'], index = ['true 0', 'true 1'])

# score
accuracy_all = float(sum(y_test_predict == y_test))/float(len(y_test))
accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])

# output
print 'accuracy overall: %.3f' % accuracy_all
print 'accuracy on healthy patients: %.3f' % accuracy_healthy
print 'accuracy on epilepsy patients: %.3f' % accuracy_epilepsy
cm_df

accuracy overall: 0.996
accuracy on healthy patients: 0.997
accuracy on epilepsy patients: 0.938


Unnamed: 0,predict 0,predict 1
true 0,2792,8
true 1,2,30


In [40]:
# get coefficient
coef = lr_model.coef_.flatten()

most_pos_var_index = np.argsort(coef)[-4:][:-1]
most_neg_var_index = np.argsort(coef)[0:3]

most_pos_var = x_df_extended.columns[most_pos_var_index]
most_neg_var = x_df_extended.columns[most_neg_var_index]

print 'most positive variables'
print most_pos_var.values
print 'most negative variables'
print most_neg_var.values

most positive variables
['CMI:75 F3a) Hernia: Ever suffered=2.0'
 'CMI:75 F3c) Epilepsy: Seen doctor in last 12 months=1.0'
 'CMI:75 F3b) Epilepsy: Suffered in last12 months=1.0']
most negative variables
['CMI:75 F3b) Epilepsy: Suffered in last12 months=nan'
 'CMI:75 F3c) Epilepsy: Seen doctor in last 12 months=nan'
 'CMI:75 F3a) Heart trouble: Ever suffered=3.0']
