In [220]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [221]:
# load data
data = pd.read_csv('./born_data.csv')

# separate x and y
y_label = '12D Epilepsy identification'
y_df = data[y_label].copy()
y_df[y_df <> 5] = 0
y_df[y_df == 5] = 1
x_df = data.drop(y_label, axis = 1).copy()

In [222]:
for col in x_df.columns:
    if '11 years' in col or '16 years' in col:
        x_df.drop(col, axis = 1, inplace = True)

In [223]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = pd.DataFrame(x_df[col])
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # change column label
        curr_df.columns = [col]
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

data shape before dummification/standardization (18558, 64)
data shape after dummification/standardization (18558, 523)


In [224]:
def scorer(estimator, x, y):
    y_predict = estimator.predict(x)
    cm = confusion_matrix(y, y_predict)
    accuracy_all = estimator.score(x, y)
    accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
    accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    return accuracy_all * accuracy_healthy * accuracy_epilepsy

In [225]:
while True:
    x_train, x_test, y_train, y_test = train_test_split(\
        x_df_extended.values, y_df.values, test_size = 0.25)
    if sum(y_train == 1) > 2 and sum(y_test == 1) > 0:
        break
    else:
        print 'try again'

In [226]:
def scorer(estimator, x, y):
    y_predict = estimator.predict(x)
    cm = confusion_matrix(y, y_predict)
    accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    return accuracy_epilepsy

In [227]:
# cross validataion to find best models
n_estimator_list = np.arange(10, 200, 10)
depth_list = np.arange(1, 10, 10)
class_weight_list = [{1:value} for value in 10**np.arange(-1, 5, 1)]

parameters = {'n_estimators':n_estimator_list,\
              'max_depth':depth_list,\
              'class_weight':class_weight_list}

gscv = GridSearchCV(RandomForestClassifier(), parameters, scoring = 'precision_macro')
gscv.fit(x_train, y_train)

best_rf_model = gscv.best_estimator_

In [228]:
# get importance (coef)
coef = best_rf_model.feature_importances_

most_pos_var_index = np.argsort(coef)[-21:][::-1]
most_neg_var_index = np.argsort(coef)[0:3]

print 'most positive variables'
for index in most_pos_var_index:
    print coef[index], x_df_extended.columns[index]

most positive variables
0.3 0  Siblings alive when mum left school=6.0
0.1 0 Birthweight for gest age for sex=5.0
0.1 0 Socio-economic group mother's husband (GRO 1951)=5.0
0.1 0  Siblings alive when mum left school=11.0
0.1 0  Siblings alive,dead when mum left sch=3.0
0.1 0 Mums paid job when starting this baby (GRO 1951)
0.1 0  Mothers blood group=5.0
0.1 Region at NCDS1 (1965) - 7 years=nan
0.0 0 Hrs of work in pregnancy & wk stopped=1.0
0.0 0 Hrs of work in pregnancy & wk stopped=4.0
0.0 0 Hrs of work in pregnancy & wk stopped=3.0
0.0 0 Hrs of work in pregnancy & wk stopped=2.0
0.0 1M Maternal illness while pregnantM:C2-2=nan
0.0 0 Hrs of work in pregnancy & wk stopped=6.0
0.0 0 Mums paid job during pregnancy (GRO 1951)
0.0 0  Siblings alive when mum left school=nan
0.0 0  Siblings alive when mum left school=10.0
0.0 0  Siblings alive when mum left school=9.0
0.0 0  Siblings alive when mum left school=8.0
0.0 0 Hrs of work in pregnancy & wk stopped=5.0
0.0 0 Hrs of work in pregnanc

In [229]:
best_rf_model = gscv.best_estimator_
kf = KFold(n_splits = 3)
adf = pd.DataFrame({}, columns = ['accuracy_all', 'accuracy_healthy', 'accuracy_epilepsy'])
index = 0

for train_index, test_index in kf.split(x_df_extended.values):
    x_train = x_df_extended.values[train_index, :]
    x_test = x_df_extended.values[test_index, :]
    y_train = y_df.values[train_index]
    y_test = y_df.values[test_index]
    
    best_rf_model.fit(x_train, y_train)
    
    y_predict = best_rf_model.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    accuracy_all = best_rf_model.score(x_test, y_test)
    accuracy_epilepsy = float(cm[1,1])/float(cm[1,0]+cm[1,1])
    accuracy_healthy = float(cm[0,0])/float(cm[0,0]+cm[0,1])
    
    print cm
    
    adf.loc[index] = [accuracy_all, accuracy_healthy, accuracy_epilepsy]

print adf.mean()

[[ 235 5932]
 [   0   19]]
[[   0 6166]
 [   0   20]]
[[ 455 5711]
 [   3   17]]
accuracy_all         0.076301
accuracy_healthy     0.073792
accuracy_epilepsy    0.850000
dtype: float64


In [230]:
print gscv.best_score_
print gscv.best_params_

0.501679494725
{'n_estimators': 10, 'max_depth': 1, 'class_weight': {1: 1000}}
