In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
data = pd.read_csv('./born_data.csv')

# separate x and y
y_df = data.iloc[:, -1].copy()
y_df[y_df <> 5] = 0
y_df[y_df == 5] = 1
x_df = data.iloc[:, 0:-1].copy()

In [3]:
# if more than 15 unique values, then treated as numerical
master_list = []

for col in x_df.columns:
    # get unique count
    unique_count = len(x_df[col].unique())
    
    # numerical variable
    if unique_count > 15:
        # get a copy for processing
        curr_df = pd.DataFrame(x_df[col])
        # handle NaN
        curr_df[curr_df.isnull()] = -1
        # standardize
        curr_df = pd.DataFrame(preprocessing.scale(curr_df.values))
        # reset index
        curr_df.index = x_df.index
        # change column label
        curr_df.columns = [col]
        # append to master list
        master_list.append(curr_df)
    # categorical variable
    else:
        # get a copy
        curr_df = x_df[col].copy()
        # get dummy
        # if contains NaN
        if curr_df.isnull().values.any():
            dummy = pd.get_dummies(curr_df, dummy_na = True)
        # if no NaN
        else:
            dummy = pd.get_dummies(curr_df, dummy_na = False)
        # reset columns
        dummy.columns = [str(col)+'='+str(value) for value in dummy.columns]
        # reset index
        dummy.index = x_df.index
        # append to master list
        master_list.append(dummy)
        
# concatenate master list
x_df_extended = pd.concat(master_list, axis = 1)

print 'data shape before dummification/standardization', x_df.shape
print 'data shape after dummification/standardization', x_df_extended.shape

data shape before dummification/standardization (18558, 64)
data shape after dummification/standardization (18558, 537)


In [4]:
# cross validataion to find best models
n_estimator_list = np.arange(10, 200, 20)
depth_list = np.arange(1, 10, 10)
class_weight_list = [{1:value} for value in 10**np.arange(2, 12, 1)]

parameters = {'n_estimators':n_estimator_list,\
              'max_depth':depth_list,\
              'class_weight':class_weight_list}

rf_model = ensemble.RandomForestClassifier()

gscv = GridSearchCV(rf_model, parameters)
gscv.fit(x_df_extended.values, y_df.values)

best_rf_model = gscv.best_estimator_

In [5]:
# get importance (coef)
coef = best_rf_model.feature_importances_

most_pos_var_index = np.argsort(coef)[-21:][:-1]
most_neg_var_index = np.argsort(coef)[0:3]

most_pos_var = x_df_extended.columns[most_pos_var_index]
most_neg_var = x_df_extended.columns[most_neg_var_index]

print 'most positive variables'
print most_pos_var.values
print 'most negative variables'
print most_neg_var.values

most positive variables
['0  Birth order-all mums siblings=11.0'
 '0  Birth order-all mums siblings=1.0'
 '0  Birth order-all mums siblings=2.0'
 '0  Birth order-all mums siblings=3.0'
 '0  Birth order-all mums siblings=4.0'
 '0  Birth order-all mums siblings=5.0'
 '0  Birth order-all mums siblings=6.0'
 '0  Birth order-all mums siblings=nan'
 '0  Birth order-all mums siblings=7.0'
 '0  Birth order-all mums siblings=8.0'
 '0  Birth order-all mums siblings=9.0'
 '0  Birth order-all mums siblings=10.0'
 '0 Interval between this birth and last=nan'
 '0  Siblings alive when mum left school=6.0' '0  Mothers blood group=7.0'
 "0 Ma's smoking after mth 4 of pregnancy=1.0"
 '0 Illness noted in PMS-MC 1:3=1.0'
 '0  Duration of labour-1st stage:hours=2.0'
 "0 Socio-economic group mother's husband (GRO 1951)=7.0"
 '0 Birthweight (ounces) or estimate']
most negative variables
['0-3D Sex of child=1.0' '0  Whether labour induced=1.0'
 '0  Duration membranes ruptured-hours=nan']


In [None]:
#a function to deliver comparation
def compare(name):
    #ratio and count among whole population
    print pd.to_numeric(data[name],errors='coerce').value_counts()/pd.to_numeric(data[name],errors='coerce').value_counts().sum()
    print data[name].value_counts()
    
    #ratio and count among epilepsy patients
    print epi[name].value_counts()/epi[name].value_counts().sum()
    print epi[name].value_counts()

In [None]:
compare("0  Mums min. haemoglobin while pregnant")