In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import KFold

In [25]:
# load data
data = pd.read_csv('./born_data.csv')

# separate x and y
y_df = data.iloc[:, -1].copy()
y_df[y_df <> 5] = 0
y_df[y_df == 5] = 1
x_df = data.iloc[:, 0:-1].copy()

In [26]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = pd.DataFrame(x_df[col])
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(preprocessing.scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # change column label
        curr_df.columns = [col]
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

data shape before dummification/standardization (18558, 64)
data shape after dummification/standardization (18558, 537)


In [94]:
# cross validataion
kf = KFold(n_splits = 5)
accuracy_df = pd.DataFrame(columns = ['accuracy_all', 'accuracy_healthy', 'accuracy_epilepsy'])
weight_power = np.arange(2, 12, 1)
weight_list = 10**weight_power

for weight in weight_list:
    accuracy_df.loc[weight] = [0, 0, 0]
    for train_index, test_index in kf.split(x_df_extended.values):
        x_train = x_df_extended.values[train_index, :]
        x_test = x_df_extended.values[test_index, :]
        y_train = y_df.values[train_index]
        y_test = y_df.values[test_index]
    
        # build model
        lr_model = linear_model.LogisticRegressionCV(class_weight = {1:weight},scoring="roc_auc")

        # train model
        lr_model.fit(x_train, y_train)

        # predict
        y_test_predict = lr_model.predict(x_test)
        
        #y_test_predict_proba = lr_model.predict_proba(x_test)[:, 0]
        #threshold = 0.5
        #mask_0 = y_test_predict_proba > threshold
        #y_test_predict = np.ones(y_test_predict_proba.shape)
        #y_test_predict[mask_0] = 0
    
        # confusion matrix
        cm = metrics.confusion_matrix(y_test, y_test_predict)
        cm_df = pd.DataFrame(cm, columns = ['predict 0', 'predict 1'], index = ['true 0', 'true 1'])

        # score
        accuracy_all = float(sum(y_test_predict == y_test))/float(len(y_test))
        accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
        accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    
        # append to dataframe
        accuracy_df.loc[weight] += [accuracy_all, accuracy_healthy, accuracy_epilepsy]
        
accuracy_df /= 5

In [95]:
# output
print accuracy_df

              accuracy_all  accuracy_healthy  accuracy_epilepsy
100               0.965944          0.968863           0.048951
1000              0.906241          0.908636           0.148096
10000             0.858068          0.860088           0.216472
100000            0.848642          0.850637           0.216472
1000000           0.824827          0.826742           0.216472
10000000          0.775302          0.776670           0.324165
100000000         0.853003          0.855080           0.201088
1000000000        0.843735          0.845729           0.219270
10000000000       0.834467          0.836377           0.241492
100000000000      0.826708          0.828593           0.241492


In [96]:
# get coefficient
lr_model = linear_model.LogisticRegressionCV(class_weight = {1:10000000},scoring="roc_auc")
lr_model.fit(x_train, y_train)
coef = lr_model.coef_.flatten()

most_pos_var_index = np.argsort(coef)[-21:][:-1]
most_neg_var_index = np.argsort(coef)[0:3]

most_pos_var = x_df_extended.columns[most_pos_var_index]
most_neg_var = x_df_extended.columns[most_neg_var_index]

print 'most positive variables'
print most_pos_var.values
print 'most negative variables'
print most_neg_var.values

most positive variables
['0  Past complications of pregnancy, etc=2.0'
 '0 Smoking prior to pregnancy=1.0'
 '0  SEG maternal GPa as mum left school (GRO 1951)=5.0'
 '0 Birthweight-gestational age for sex=2.0'
 'Region at NCDS1 (1965) - 7 years=3.0' '0  Foetal distress=1.0'
 'Region at PMS (1958) - Birth=8.0'
 '0 Week in which mother stopped work=7.0'
 'Region at NCDS1 (1965) - 7 years=6.0'
 '0  Delivery-Supervision Groups=5.0' 'Region at PMS (1958) - Birth=4.0'
 '0  Mum-No. cooked for when 1st pregnant=9.0'
 '0  Whether labour induced=10.0'
 "0 Socio-economic group mother's husband (GRO 1951)=11.0"
 '0  Duration membranes ruptured-hours=7.0' '0  Whether labour induced=4.0'
 '0  SEG maternal GPa as mum left school (GRO 1951)=1.0'
 '0  Duration of labour-1st stage:hours=2.0'
 '0 Abnormality during pregnancy=8.0' '0  Inhalational analgesia=9.0']
most negative variables
['Region at NCDS1 (1965) - 7 years=nan' 'Region at PMS (1958) - Birth=3.0'
 '0  Duration membranes ruptured-hours=3.0']


In [76]:
#a function to deliver comparation
def compare(name):
    #ratio and count among whole population
    print pd.to_numeric(data[name],errors='coerce').value_counts()/pd.to_numeric(data[name],errors='coerce').value_counts().sum()
    print data[name].value_counts()
    
    #ratio and count among epilepsy patients
    print epi[name].value_counts()/epi[name].value_counts().sum()
    print epi[name].value_counts()

In [77]:
compare("0  Mums min. haemoglobin while pregnant")

10.0    0.568194
1.0     0.330745
2.0     0.082627
3.0     0.018375
4.0     0.000059
Name: 0  Mums min. haemoglobin while pregnant, dtype: float64
10.0    9586
1.0     5580
2.0     1394
3.0      310
4.0        1
Name: 0  Mums min. haemoglobin while pregnant, dtype: int64
10.0    0.584906
1.0     0.358491
2.0     0.056604
Name: 0  Mums min. haemoglobin while pregnant, dtype: float64
10.0    31
1.0     19
2.0      3
Name: 0  Mums min. haemoglobin while pregnant, dtype: int64
