# First data explorations

In [122]:
import matplotlib.pyplot as plt
import seaborn
seaborn.set()  # this makes my plots pretty
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv

from skll import metrics
from sklearn.metrics import make_scorer

import data_utils

In [103]:
# Scorer object usable as a `scoring=` argument: it wraps skll's kappa metric,
# and the extra keyword (weights='quadratic') is passed through to
# metrics.kappa each time the scorer is invoked.
kappa_scorer = make_scorer(
    score_func=metrics.kappa,
    weights='quadratic',
)

In [108]:
# Training data: read the CSV, then discard the 'Id' column so it is never
# used as a feature.
train = pd.read_csv('../../data/train.csv')
train = train.drop(['Id'], axis=1)

# Test data keeps its 'Id' column for now; it is dropped only when the feature
# matrix is built, and the full frame is handed to the submission writer.
test = pd.read_csv('../../data/test.csv')

In [109]:
# One-hot encode the categorical 'Product_Info_2' column in both frames.
# Remember the pre-encoding columns so the generated dummy columns can be
# identified afterwards without relying on their naming scheme.
train_base_cols = [c for c in train.columns if c != 'Product_Info_2']
test_base_cols = [c for c in test.columns if c != 'Product_Info_2']

train = pd.get_dummies(train, columns=['Product_Info_2'])
test = pd.get_dummies(test, columns=['Product_Info_2'])

# Encoding the two frames independently produces different dummy columns
# whenever a category occurs in only one of them, which would leave the test
# feature matrix with a different width/ordering than the one the model is
# trained on. Force the test dummies to match the training dummies exactly:
# categories unseen in test become all-zero columns, categories unseen in
# train are dropped (the model has no feature for them anyway).
# When both frames already share the same categories this is a no-op.
train_dummy_cols = [c for c in train.columns if c not in train_base_cols]
test = test.reindex(columns=test_base_cols + train_dummy_cols, fill_value=0)

In [None]:
# Fill NaN values for Family_Hist_2-5 with the column means.
# The means are computed on the training set only and reused for the test
# set, so both frames are imputed with the same values and the test features
# are preprocessed exactly like the data the model is fitted on.
for i in range(2, 6):
    col = 'Family_Hist_%i' % i
    train_mean = train[col].mean()  # computed before either frame is filled
    train[col] = train[col].fillna(train_mean)
    test[col] = test[col].fillna(train_mean)

# Every remaining NaN (in any other column) is replaced with 0.
train = train.fillna(0)
test = test.fillna(0)

## Train a basic Random Forest

In [105]:
# Split the training frame into the target vector and the feature matrix.
y_train = train['Response'].values
X_train = train.drop('Response', axis=1).values

# Small forest (15 trees) with a fixed seed, used for cross-validation.
rfr_params = dict(n_estimators=15, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**rfr_params)

# Shuffled 5-fold split over all training rows, seeded for repeatability.
folds = cv.KFold(
    len(y_train),
    n_folds=5,
    shuffle=True,
    random_state=42,
)

# Score each fold with the kappa scorer; one score per fold comes back.
scores = cv.cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=kappa_scorer,
    cv=folds,
    n_jobs=-1,
)

# Per-fold scores, followed by their mean and standard deviation.
print(scores)
print(scores.mean(), scores.std())

[ 0.49724135  0.49356778  0.50104413  0.50045855  0.49156269]
(0.49677490098804872, 0.003727357719121879)


### Predict test data

In [125]:
# Refit on the complete training set with a larger forest (100 trees, same
# seed) than the one used during cross-validation.
rfr_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**rfr_params)
model.fit(X_train, y_train)

# Test feature matrix: every test column except 'Id'.
X_test = test.drop('Id', axis=1).values

# Predicted Response class for each test row.
y_pred = model.predict(X_test)

In [126]:
# Hand the predictions and the test frame to the project's submission helper.
# NOTE(review): presumably the helper pairs each prediction with the matching
# test 'Id' and writes a submission file — confirm in data_utils.
data_utils.write_submission(y_pred, test)

## Histograms and scatterplots