In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [152]:
# load data
data = pd.read_csv('./born_data.csv')

# separate x and y
y_df = data.iloc[:, -1].copy()
y_df[y_df <> 5] = 0
y_df[y_df == 5] = 1
x_df = data.iloc[:, 0:-1].copy()

In [None]:
for col in x_df.columns:
    if '11 years' in col or '16 years' in col:
        x_df.drop(col, axis = 1, inplace = True)

In [153]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = pd.DataFrame(x_df[col])
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # change column label
        curr_df.columns = [col]
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

data shape before dummification/standardization (18558, 66)
data shape after dummification/standardization (18558, 556)


In [155]:
while True:
    x_train, x_test, y_train, y_test = train_test_split(\
        x_df_extended.values, y_df.values, test_size = 0.25)
    if sum(y_train == 1) > 2 and sum(y_test == 1) > 0:
        break
    else:
        print 'try again'

try again


In [156]:
weight_list = [{1:value} for value in 10**np.arange(2, 12, 2)]
parameters = {'class_weight':weight_list}
gscv = GridSearchCV(LogisticRegressionCV(), parameters, scoring = 'roc_auc')
gscv.fit(x_train, y_train)
best_lr_model = gscv.best_estimator_

In [157]:
# get coefficient
coef = best_lr_model.coef_.flatten()

most_pos_var_index = np.argsort(coef)[-21:][::-1]
most_neg_var_index = np.argsort(coef)[0:3]

print 'most positive variables'
for index in most_pos_var_index:
    print coef[index], x_df_extended.columns[index]

most positive variables
5.47401257949 1M Maternal illness while pregnantM:C1-2=2.0
3.14038828966 0  Inhalational analgesia=3.0
2.97976761386 0  Duration of labour-1st stage:hours=6.0
2.8756103458 0 Interval between this birth and last=6.0
2.74117173444 0 Abnormality during pregnancy=9.0
2.57597723771 0  Total number of antenatal vsits=5.0
2.561060463 0  Total number of antenatal vsits=2.0
2.44691796668 0 Mother's age last birthday,in years
2.42684059368 0 Mother's weight in stones,1958=4.0
2.37042254832 Region at NCDS3 (1974) - 16 years=6.0
2.17387208977 0 Smoking during pregnancy=2.0
2.06673005326 Region at NCDS2 (1969) - 11 years=6.0
1.9272961839 0 Smoking prior to pregnancy=7.0
1.90994935109 Region at NCDS3 (1974) - 16 years=3.0
1.90952600216 0 Birthweight-gestational age for sex=5.0
1.86248115601 Region at NCDS1 (1965) - 7 years=3.0
1.81502062676 Region at NCDS1 (1965) - 7 years=6.0
1.80132973435 0  Was mum at sch. after min.leaving age=5.0
1.65390368553 0  Week of mothers 1st ante

In [158]:
best_lr_model = gscv.best_estimator_

y_predict = best_lr_model.predict(x_test)
cm = confusion_matrix(y_test, y_predict)
accuracy_all = best_lr_model.score(x_test, y_test)
accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])

print accuracy_all
print accuracy_healthy
print accuracy_epilepsy

0.999568965517
0.999568872602
1.0


In [159]:
print cm

[[4637    2]
 [   0    1]]


In [160]:
print gscv.best_params_
print gscv.best_score_

{'class_weight': {1: 100000000}}
0.978687620262


In [161]:
#a function to deliver comparation
def compare(name):
    #ratio and count among whole population
    print pd.to_numeric(data[name],errors='coerce').value_counts()/pd.to_numeric(data[name],errors='coerce').value_counts().sum()
    print data[name].value_counts()
    
    #ratio and count among epilepsy patients
    print epi[name].value_counts()/epi[name].value_counts().sum()
    print epi[name].value_counts()

In [162]:
compare("0  Mums min. haemoglobin while pregnant")

10.0    0.568194
1.0     0.330745
2.0     0.082627
3.0     0.018375
4.0     0.000059
Name: 0  Mums min. haemoglobin while pregnant, dtype: float64
10.0    9586
1.0     5580
2.0     1394
3.0      310
4.0        1
Name: 0  Mums min. haemoglobin while pregnant, dtype: int64


NameError: global name 'epi' is not defined