In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold

In [6]:
# Load the training features and the labels. Both files are read the same
# way: the first CSV column becomes the index of the resulting frame.
train, target = (pd.read_csv(csv_path, index_col=0)
                 for csv_path in ('stm_train.csv', 'target.csv'))

In [8]:
# Reduce the label frame to a plain NumPy array taken from its '0' column.
# `target` is rebound from a DataFrame to an ndarray here, so the guard keeps
# this cell safe to re-run: a second execution is a no-op instead of raising
# AttributeError because an ndarray has no `.loc`.
if isinstance(target, pd.DataFrame):
    target = target.loc[:, '0'].values

In [10]:
# Stratified 10-fold splitter built from the label array; it is passed as
# `cv=` to every cross_val_score call in this notebook.
# NOTE(review): this StratifiedKFold API documents `random_state` as being
# used only when shuffle=True, so the seed probably has no effect on the
# folds as written — confirm before relying on it.
cv_settings = {'n_folds': 10, 'random_state': 42}
skf = StratifiedKFold(target, **cv_settings)

In [11]:
# Cross-validated ROC-AUC of a 1500-tree random forest fitted on the full
# `train` frame; `res` holds one score per fold of `skf`.
rf_cv_model = RandomForestClassifier(
    n_estimators=1500,
    max_depth=50,
    max_features=50,
    n_jobs=-1,
    random_state=0,
)
res = cross_val_score(rf_cv_model, train, target, scoring='roc_auc', cv=skf)

In [12]:
# Per-fold scores followed by their mean and standard deviation.
res_mean, res_std = np.mean(res), np.std(res)
print(res, res_mean, res_std)

[ 0.81980316  0.82652889  0.80696928  0.82204208  0.82376237  0.83331324
  0.83998096  0.85333348  0.84990458  0.81490572] 0.829054374932 0.0142084848769


In [13]:
# Cross-validated ROC-AUC of a 500-tree extra-trees model fitted on the full
# `train` frame; `rese` holds one score per fold of `skf`.
et_cv_model = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=50,
    max_features=50,
    n_jobs=-1,
    random_state=0,
)
rese = cross_val_score(et_cv_model, train, target, scoring='roc_auc', cv=skf)

In [14]:
# Per-fold scores followed by their mean and standard deviation.
rese_mean, rese_std = np.mean(rese), np.std(rese)
print(rese, rese_mean, rese_std)

[ 0.81169816  0.80918963  0.79401584  0.80832986  0.81212559  0.81999075
  0.82329254  0.83714605  0.82650436  0.80781605] 0.815010881689 0.0114272932867


In [17]:
# Feature subset: the first 292 columns of `train`, selected by position.
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
# With non-integer column labels, as produced by a CSV header, `.ix` already
# treated the integer slice positionally, so the selected columns are the same.
st = train.iloc[:, :292]

In [18]:
# Same 1500-tree random forest evaluation as for `train`, but restricted to
# the column subset `st`; `rest` holds one ROC-AUC score per fold of `skf`.
rf_subset_model = RandomForestClassifier(
    n_estimators=1500,
    max_depth=50,
    max_features=50,
    n_jobs=-1,
    random_state=0,
)
rest = cross_val_score(rf_subset_model, st, target, scoring='roc_auc', cv=skf)

In [19]:
# Per-fold scores followed by their mean and standard deviation.
rest_mean, rest_std = np.mean(rest), np.std(rest)
print(rest, rest_mean, rest_std)

[ 0.78238588  0.7767728   0.76983743  0.76453665  0.75228351  0.78557231
  0.79920877  0.79396601  0.80247204  0.76939027] 0.779642567504 0.0152907241562


In [20]:
# Same 500-tree extra-trees evaluation as for `train`, but restricted to the
# column subset `st`; `reset` holds one ROC-AUC score per fold of `skf`.
et_subset_model = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=50,
    max_features=50,
    n_jobs=-1,
    random_state=0,
)
reset = cross_val_score(et_subset_model, st, target, scoring='roc_auc', cv=skf)

In [21]:
# Per-fold scores followed by their mean and standard deviation.
reset_mean, reset_std = np.mean(reset), np.std(reset)
print(reset, reset_mean, reset_std)

[ 0.77235473  0.75757199  0.74889618  0.73359859  0.72838177  0.74897627
  0.76357969  0.76439285  0.77726978  0.75705314] 0.75520750053 0.0148642490642


In [23]:
# Read the test-set features; as with `train`, the first CSV column is used
# as the index of the frame.
test_path = 'stm_test.csv.gz'
test = pd.read_csv(test_path, index_col=0)

In [25]:
# Random forest for the submission: same depth / feature settings as the
# cross-validated forest above, but with 5000 trees.
rf_params = dict(n_estimators=5000, max_depth=50, max_features=50,
                 n_jobs=-1, random_state=0)
clf = RandomForestClassifier(**rf_params)

In [26]:
# Fit on every training row and all columns of `train`; the fitted estimator
# is the cell's last expression, so its repr is displayed as the cell output.
clf.fit(X=train, y=target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=50, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
# Keep only column 1 of the predicted class probabilities for each test row.
class_proba = clf.predict_proba(test)
preds = class_proba[:, 1]

In [28]:
# Fill the sample-submission template with the current predictions and write
# it to 'submission/rf.csv'.
# Item assignment is used instead of `sub.TARGET = ...`: attribute assignment
# only updates a column that already exists; if 'TARGET' were missing it would
# merely set a Python attribute and the CSV would be written without predictions.
# NOTE(review): `preds` is assigned positionally, so this assumes the rows of
# the sample submission are in the same order as `test` — confirm.
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub['TARGET'] = preds
sub.to_csv('submission/rf.csv')

In [29]:
# Extra-trees model for the submission: same depth / feature settings as the
# cross-validated extra-trees model above, but with 5000 trees. Rebinds `clf`.
et_params = dict(n_estimators=5000, max_depth=50, max_features=50,
                 n_jobs=-1, random_state=0)
clf = ExtraTreesClassifier(**et_params)

In [30]:
# Fit on every training row and all columns of `train`; the fitted estimator
# is the cell's last expression, so its repr is displayed as the cell output.
clf.fit(X=train, y=target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=50, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
# Keep only column 1 of the predicted class probabilities for each test row.
class_proba = clf.predict_proba(test)
preds = class_proba[:, 1]

In [32]:
# Fill the sample-submission template with the current predictions and write
# it to 'submission/et.csv'.
# Item assignment is used instead of `sub.TARGET = ...`: attribute assignment
# only updates a column that already exists; if 'TARGET' were missing it would
# merely set a Python attribute and the CSV would be written without predictions.
# NOTE(review): `preds` is assigned positionally, so this assumes the rows of
# the sample submission are in the same order as `test` — confirm.
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub['TARGET'] = preds
sub.to_csv('submission/et.csv')