# First data explorations & messy transformations

In [122]:
import matplotlib.pyplot as plt
import seaborn
seaborn.set()  # this makes my plots pretty
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv

from skll import metrics
from sklearn.metrics import make_scorer

import data_utils

In [108]:
# Read the train and test CSV files. The 'Id' column is removed from the
# training frame immediately; the test frame keeps its 'Id' column.
train = pd.read_csv('../../data/train.csv')
train = train.drop('Id', axis=1)
test = pd.read_csv('../../data/test.csv')

In [109]:
# One-hot encode the categorical 'Product_Info_2' column in both frames.
train = pd.get_dummies(train, columns=['Product_Info_2'])
test = pd.get_dummies(test, columns=['Product_Info_2'])

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately can produce different column
# sets. Force test onto train's feature columns (same names, same order):
# categories missing from test become all-zero columns, and categories that
# never occur in train are dropped. 'Id' is kept in front because later cells
# still read it from the test frame.
feature_cols = [c for c in train.columns if c != 'Response']
test = test.reindex(columns=['Id'] + feature_cols, fill_value=0)

In [None]:
# Fill NaN values for Family_Hist_2..5 with the column mean.
# The mean is computed on the training data and applied to both frames, so a
# missing value is encoded by the same number in train and test.
for i in range(2, 6):
    col = 'Family_Hist_%i' % i
    col_mean = train[col].mean()  # Series.mean() skips NaN by default
    train[col] = train[col].fillna(col_mean)
    test[col] = test[col].fillna(col_mean)
# Every other remaining NaN becomes 0.
train = train.fillna(0)
test = test.fillna(0)

## Train a basic Random Forest

In [103]:
# Wrap skll's kappa metric, called with weights='quadratic', as a scorer
# object that can be passed to the `scoring` argument of cross_val_score.
kappa_scorer = make_scorer(
    metrics.kappa,
    weights='quadratic',
)

In [155]:
# Target is the 'Response' column; features are every other column.
y_train = train['Response'].values
X_train = train.drop('Response', axis=1).values

# Random forest configuration (fixed seed, all cores).
rfr_params = dict(n_estimators=300, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**rfr_params)

# 5-fold shuffled cross-validation with a fixed seed, scored with kappa_scorer.
folds = cv.KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=42)
scores = cv.cross_val_score(
    model, X_train, y_train,
    scoring=kappa_scorer, cv=folds, n_jobs=-1,
)

# Per-fold scores, then their mean and standard deviation.
print(scores)
print(scores.mean(), scores.std())

[ 0.49593096  0.50901067  0.5108705   0.49834324  0.49726198]
(0.50228346962076531, 0.0063259403571169865)


### Predict test data

In [158]:
# Build the test feature matrix (everything except 'Id'), refit a forest with
# the same configuration on the full training data, and predict the test rows.
X_test = test.drop('Id', axis=1).values

rfr_params = dict(n_estimators=300, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**rfr_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [133]:
# Pass the predictions and the test frame to the project helper that writes
# the submission (presumably it pairs y_pred with the test frame's 'Id'
# column -- TODO confirm in data_utils).
data_utils.write_submission(y_pred, test)

### Scores

**CV**
[ 0.49593096  0.50901067  0.5108705   0.49834324  0.49726198] (0.50228346962076531, 0.0063259403571169865)

**LB**
0.51081

### Feature importance (RandomForest)
List the 50 most important features (according to the random forest)

In [160]:
# Pair every feature name with the importance reported by the fitted forest,
# then display the 50 rows with the highest importance.
names = pd.Series(train.drop('Response', axis=1).columns, name='Feature')
imp = pd.Series(model.feature_importances_, name='Importance')
(
    pd.concat([names, imp], axis=1)
    .sort_values(by='Importance', ascending=False)
    .head(50)
)

Unnamed: 0,Feature,Importance
9,BMI,0.091645
8,Wt,0.066192
6,Ins_Age,0.040889
2,Product_Info_4,0.0399
10,Employment_Info_1,0.036277
7,Ht,0.031799
37,Medical_History_2,0.030827
34,Family_Hist_4,0.029889
36,Medical_History_1,0.029585
15,Employment_Info_6,0.029077


## histograms and scatterplots
(deactivated)