In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from fastFM import mcmc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import xgboost as xgb
from tqdm import *
from libtelepot import sendMessage
import gc

In [2]:
train = pd.read_csv('stm_train.csv.gz', index_col=0)
target = pd.read_csv('target.csv', index_col=0)
test = pd.read_csv('stm_test.csv.gz', index_col=0)

In [3]:
target = target.loc[:, '0'].values

In [4]:
strain = train.ix[:, :292]
stest = test.ix[:, :292]

In [5]:
skf = StratifiedKFold(target, n_folds=10, random_state=42)

In [6]:
train_clfs = np.zeros((strain.shape[0], 7))
test_clfs = np.zeros((stest.shape[0], 7))

In [7]:
params = {'objective': 'binary:logistic', 
          'eval_metric': 'auc',
          'eta': 0.0202048,
          'max_depth': 5,
          'subsample': 0.6815,
          'colsample_bytree': 0.701,
          'silent': 1,
          'seed': 0,
          'nthreads': 12
         }

In [8]:
fold = 1
for train_index, test_index in skf:
    X_train, X_test = strain.iloc[train_index].copy(), strain.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]
    
    sendMessage('Going through fold {:}'.format(fold))
    rfres = np.zeros((X_test.shape[0], 10))
    for st in range(10):
        rf = RandomForestClassifier(n_estimators=5000, max_depth=50, max_features=50,
                                    random_state=st, n_jobs=-1)
        rf.fit(X_train, y_train)
        #train_clfs[test_index, 0] = rf.predict_proba(X_test)[:, 1]
        rfres[:, st] = rf.predict_proba(X_test)[:, 1]
        sendMessage('Finished fitting RandomForest {:}'.format(st))
        del(rf)
        gc.collect()
    train_clfs[test_index, 0] = rfres.mean(axis=0)
    
    et = ExtraTreesClassifier(n_estimators=5000, max_depth=50, max_features=50,
                              random_state=42, n_jobs=-1)
    et.fit(X_train, y_train)
    train_clfs[test_index, 1] = et.predict_proba(X_test)[:, 1]
    sendMessage('Finished fitting ExtraTrees')
    del(et)
    gc.collect()
    
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)
    fxgb = xgb.train(params, dtrain, num_boost_round=500, verbose_eval=False)
    train_clfs[test_index, 2] = fxgb.predict(dtest)
    sendMessage('Finished fitting XGB1')
    del(fxgb)
    gc.collect()
    
    dtrains = xgb.DMatrix(X_train, y_train, missing=0)
    dtests = xgb.DMatrix(X_test, missing=0)
    sxgb = xgb.train(params, dtrains, num_boost_round=500, verbose_eval=False)
    train_clfs[test_index, 3] = sxgb.predict(dtests)
    sendMessage('Finished fitting XGB2')
    del(sxgb)
    gc.collect()
    
    nb = BernoulliNB()
    nb.fit(X_train, y_train)
    train_clfs[test_index, 4] = nb.predict_proba(X_test)[:, 1]
    sendMessage('Finished fitting NaiveBayes')
    
    sc = StandardScaler()
    X_train_sc = sc.fit_transform(X_train)
    X_test_sc = sc.transform(X_test)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    train_clfs[test_index, 5] = lr.predict_proba(X_test)[:, 1]
    sendMessage('Finished fitting LogisticRegression')
    
#     fm = mcmc.FMClassification(n_iter=150, rank=20)
#     cn = []
#     for c in X_train.columns:
#         cn.append((c, len(X_train[c].unique())))
#     mask = X_train.columns[list(map(lambda x: x[1] < 500, cn))]
#     for i in range(len(mask)):
#         fct, ind = pd.factorize(X_train[mask[i]])
#         X_train.loc[:, mask[i]] = fct
#         X_test.loc[:, mask[i]] = ind.get_indexer(X_test[mask[i]])
#         X_test.loc[X_test[mask[i]] == -1, mask[i]] = X_test[mask[i]].max() + 1
#     ohe = OneHotEncoder()
#     ctrain = ohe.fit_transform(X_train[mask])
#     ctest = ohe.transform(X_test[mask])
#     preds = fm.fit_predict_proba(ctrain, y_train, ctest)
#     train_clfs[test_index, 6] = preds
#     sendMessage('Finished fitting FactorizationMachine')
    
    fold += 1

In [9]:
fold = 1
for train_index, test_index in skf:
    X_train, X_test = strain.iloc[train_index].copy(), strain.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]
    
    sendMessage('Going through fold {:}'.format(fold))
    
    sc = StandardScaler()
    X_train_sc = sc.fit_transform(X_train)
    X_test_sc = sc.transform(X_test)
    lr = LogisticRegression()
    lr.fit(X_train_sc, y_train)
    train_clfs[test_index, 5] = lr.predict_proba(X_test_sc)[:, 1]
    sendMessage('Finished fitting LogisticRegression')
    
    fold += 1

In [11]:
indices = list()
for train_index, test_index in skf:
    indices.append((train_index, test_index))

In [17]:
from sklearn.metrics import roc_auc_score

In [37]:
for i in range(7):
    print(roc_auc_score(target, train_clfs[:, i]))

0.779259148489
0.758791670413
0.840758789214
0.840673318893
0.694625395301
0.792495086606
0.751685827057


In [38]:
clfs = pd.DataFrame(data=train_clfs[:, :], columns=['rf', 'et', 'xgb1', 'xgb2', 'nb', 'lr', 'fm'])

In [39]:
clfs.to_csv('clfs.csv.gz', compression='gzip')

In [27]:
from scipy.sparse import csr_matrix, hstack

In [36]:
fold = 1
for train_index, test_index in skf:
    X_train, X_test = strain.iloc[train_index].copy(), strain.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]

    sendMessage('Going through fold {:}'.format(fold))

    fm = mcmc.FMClassification(n_iter=150, rank=20)
    cn = []
    for c in X_train.columns:
        cn.append((c, len(X_train[c].unique())))
    mask = X_train.columns[list(map(lambda x: x[1] < 500, cn))]
    for i in range(len(mask)):
        fct, ind = pd.factorize(X_train[mask[i]])
        X_train.loc[:, mask[i]] = fct
        X_test.loc[:, mask[i]] = ind.get_indexer(X_test[mask[i]])
        X_test.loc[X_test[mask[i]] == -1, mask[i]] = X_test[mask[i]].max() + 1
    ntr = pd.get_dummies(X_train[mask[0]])
    nts = pd.get_dummies(X_test[mask[0]])
    ctrain = csr_matrix(ntr.values)
    ctest = csr_matrix(nts.loc[:, ntr.columns].fillna(0).values)
    for i in tqdm(range(1, len(mask))):
        ntr = pd.get_dummies(X_train[mask[i]])
        nts = pd.get_dummies(X_test[mask[i]])
        ctrain = hstack((ctrain, csr_matrix(ntr.values)))
        ctest = hstack((ctest, csr_matrix(nts.loc[:, ntr.columns].fillna(0).values)))
    sendMessage('Fitting FactorizationMachine')
    preds = fm.fit_predict_proba(ctrain, y_train, ctest)
    train_clfs[test_index, 6] = preds
    print(roc_auc_score(y_test, preds))
    sendMessage('Finished fitting FactorizationMachine')
    
    del(ctrain)
    del(ctest)
    gc.collect()

    fold += 1

100%|██████████| 255/255 [00:36<00:00,  3.40it/s]


0.745111019509


100%|██████████| 255/255 [00:35<00:00,  3.48it/s]


0.76019995432


100%|██████████| 255/255 [00:35<00:00,  3.43it/s]


0.715652659423


100%|██████████| 255/255 [00:35<00:00,  3.52it/s]


0.746281740862


100%|██████████| 255/255 [00:35<00:00,  3.49it/s]


0.758830652152


100%|██████████| 255/255 [00:35<00:00,  3.49it/s]


0.758088706731


100%|██████████| 255/255 [00:35<00:00,  3.51it/s]


0.738303950535


100%|██████████| 255/255 [00:35<00:00,  3.45it/s]


0.782242545394


100%|██████████| 255/255 [00:35<00:00,  3.53it/s]


0.754537277998


100%|██████████| 255/255 [00:35<00:00,  3.45it/s]


0.760701730357


In [41]:
dtrain = xgb.DMatrix(train_clfs, target)

In [46]:
nparams = {'objective': 'binary:logistic', 
          'eval_metric': 'auc',
          'eta': 0.02,
          'max_depth': 3,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'silent': 1,
          'seed': 0,
          'nthreads': 12
         }

In [47]:
xgb.cv(nparams, dtrain, num_boost_round=1000, early_stopping_rounds=200, 
       nfold=10, stratified=True, verbose_eval=True)

Will train until cv error hasn't decreased in 200 rounds.
[0]	cv-test-auc:0.8246032+0.013047990173202934	cv-train-auc:0.8288169+0.0020369232901609113
[1]	cv-test-auc:0.8301375999999999+0.01342682929957776	cv-train-auc:0.834414+0.002889435065890907
[2]	cv-test-auc:0.832822+0.014340203080849318	cv-train-auc:0.8367583999999999+0.0018570525679150799
[3]	cv-test-auc:0.8341303999999999+0.014028769790683713	cv-train-auc:0.8380061999999999+0.0017549056840753465
[4]	cv-test-auc:0.834737+0.013090446340747882	cv-train-auc:0.8390051+0.0012099479699557384
[5]	cv-test-auc:0.8350085+0.013002831970382467	cv-train-auc:0.8392628+0.0011741173535894838
[6]	cv-test-auc:0.8351268+0.012478111698490288	cv-train-auc:0.8394450000000001+0.0010793801925179291
[7]	cv-test-auc:0.8354791+0.012685109076787646	cv-train-auc:0.8395982+0.001060669675252393
[8]	cv-test-auc:0.8363313+0.012915923405238979	cv-train-auc:0.8400584+0.0009999334177833942
[9]	cv-test-auc:0.8364755+0.01285342644783872	cv-train-auc:0.8400974+0.0010

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.824603,0.013048,0.828817,0.002037
1,0.830138,0.013427,0.834414,0.002889
2,0.832822,0.014340,0.836758,0.001857
3,0.834130,0.014029,0.838006,0.001755
4,0.834737,0.013090,0.839005,0.001210
5,0.835009,0.013003,0.839263,0.001174
6,0.835127,0.012478,0.839445,0.001079
7,0.835479,0.012685,0.839598,0.001061
8,0.836331,0.012916,0.840058,0.001000
9,0.836476,0.012853,0.840097,0.001018


In [40]:
for train_index, test_index in skf:
    X_train, X_test = train_clfs[train_index, :], train_clfs[test_index, :]
    y_train, y_test = target[train_index], target[test_index]
    
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)
    
    

array([ 0.82300917,  0.82103342,  0.80044103,  0.83381446,  0.84558662,
        0.82734036,  0.83748346,  0.85424765,  0.84115783,  0.82029631])

In [49]:
rf = RandomForestClassifier(n_estimators=5000, max_depth=50, max_features=50,
                            random_state=42, n_jobs=-1)
rf.fit(strain, target)
test_clfs[:, 0] = rf.predict_proba(stest)[:, 1]
sendMessage('Finished fitting RandomForest')
del(rf)
gc.collect()

et = ExtraTreesClassifier(n_estimators=5000, max_depth=50, max_features=50,
                          random_state=42, n_jobs=-1)
et.fit(strain, target)
test_clfs[:, 1] = et.predict_proba(stest)[:, 1]
sendMessage('Finished fitting ExtraTrees')
del(et)
gc.collect()

dtrain = xgb.DMatrix(strain, target)
dtest = xgb.DMatrix(stest)
fxgb = xgb.train(params, dtrain, num_boost_round=500, verbose_eval=False)
test_clfs[:, 2] = fxgb.predict(dtest)
sendMessage('Finished fitting XGB1')
del(fxgb)
gc.collect()

dtrains = xgb.DMatrix(strain, target, missing=0)
dtests = xgb.DMatrix(stest, missing=0)
sxgb = xgb.train(params, dtrains, num_boost_round=500, verbose_eval=False)
test_clfs[:, 3] = sxgb.predict(dtests)
sendMessage('Finished fitting XGB2')
del(sxgb)
gc.collect()

nb = BernoulliNB()
nb.fit(strain, target)
test_clfs[:, 4] = nb.predict_proba(stest)[:, 1]
sendMessage('Finished fitting NaiveBayes')

sc = StandardScaler()
X_train_sc = sc.fit_transform(strain)
X_test_sc = sc.transform(stest)
lr = LogisticRegression()
lr.fit(X_train_sc, target)
test_clfs[:, 5] = lr.predict_proba(X_test_sc)[:, 1]
sendMessage('Finished fitting LogisticRegression')

In [50]:
X_train = strain
y_train = target
X_test = stest

fm = mcmc.FMClassification(n_iter=150, rank=20)
cn = []
for c in X_train.columns:
    cn.append((c, len(X_train[c].unique())))
mask = X_train.columns[list(map(lambda x: x[1] < 500, cn))]
for i in range(len(mask)):
    fct, ind = pd.factorize(X_train[mask[i]])
    X_train.loc[:, mask[i]] = fct
    X_test.loc[:, mask[i]] = ind.get_indexer(X_test[mask[i]])
    X_test.loc[X_test[mask[i]] == -1, mask[i]] = X_test[mask[i]].max() + 1
ntr = pd.get_dummies(X_train[mask[0]])
nts = pd.get_dummies(X_test[mask[0]])
ctrain = csr_matrix(ntr.values)
ctest = csr_matrix(nts.loc[:, ntr.columns].fillna(0).values)
for i in tqdm(range(1, len(mask))):
    ntr = pd.get_dummies(X_train[mask[i]])
    nts = pd.get_dummies(X_test[mask[i]])
    ctrain = hstack((ctrain, csr_matrix(ntr.values)))
    ctest = hstack((ctest, csr_matrix(nts.loc[:, ntr.columns].fillna(0).values)))
sendMessage('Fitting FactorizationMachine')
preds = fm.fit_predict_proba(ctrain, y_train, ctest)
test_clfs[:, 6] = preds
print(roc_auc_score(y_test, preds))
sendMessage('Finished fitting FactorizationMachine')

100%|██████████| 254/254 [01:16<00:00,  1.58it/s]


ValueError: Found arrays with inconsistent numbers of samples: [ 7601 75818]

In [51]:
clfs_test = pd.DataFrame(data=test_clfs[:, :], columns=['rf', 'et', 'xgb1', 'xgb2', 'nb', 'lr', 'fm'])
clfs_test.to_csv('clfs_test.csv.gz', compression='gzip')

In [64]:
dtest = xgb.DMatrix(test_clfs)
dtrain = xgb.DMatrix(train_clfs, target)
gbm = xgb.train(nparams, dtrain, num_boost_round=200, verbose_eval=True)
predstack = gbm.predict(dtest)

In [65]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.TARGET = predstack
sub.to_csv('submission/stack.csv')

In [58]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.TARGET = test_clfs[:, -1]
sub.to_csv('submission/fm.csv')

In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from fastFM import mcmc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import xgboost as xgb
from tqdm import *
from libtelepot import sendMessage
import gc
from sklearn.metrics import roc_auc_score

In [15]:
fold = 1
for train_index, test_index in skf:
    X_train, X_test = strain.iloc[train_index].copy(), strain.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]
    
    sendMessage('Going through fold {:}'.format(fold))
    rfres = np.zeros((X_test.shape[0], 10))
    for st in range(10):
        rf = RandomForestClassifier(n_estimators=5000, max_depth=50, max_features=50,
                                    random_state=st, n_jobs=-1)
        rf.fit(X_train, y_train)
        #train_clfs[test_index, 0] = rf.predict_proba(X_test)[:, 1]
        rfres[:, st] = rf.predict_proba(X_test)[:, 1]
        sendMessage('Finished fitting RandomForest {:}'.format(st))
        del(rf)
        gc.collect()
    train_clfs[test_index, 0] = rfres.mean(axis=1)
    sendMessage('ROC = {:.7f}'.format(roc_auc_score(y_test, rfres.mean(axis=1))))
    fold += 1

KeyboardInterrupt: 

In [None]:
fold = 1
for train_index, test_index in skf:
    X_train, X_test = strain.iloc[train_index].copy(), strain.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]
    
    sendMessage('Going through fold {:}'.format(fold))
    etres = np.zeros((X_test.shape[0], 10))
    for st in range(10):
        et = ExtraTreesClassifier(n_estimators=5000, max_depth=50, max_features=50,
                                  random_state=st, n_jobs=-1)
        et.fit(X_train, y_train)
        #train_clfs[test_index, 0] = rf.predict_proba(X_test)[:, 1]
        etres[:, st] = et.predict_proba(X_test)[:, 1]
        sendMessage('Finished fitting ExtraTrees {:}'.format(st))
        del(et)
        gc.collect()
    train_clfs[test_index, 1] = etres.mean(axis=1)
    sendMessage('ROC = {:.7f}'.format(roc_auc_score(y_test, etres.mean(axis=1))))
    fold += 1

In [24]:
cn = []
for c in strain.columns:
    cn.append((c, len(strain[c].unique())))

In [26]:
mask = strain.columns[list(map(lambda x: x[1] < 500, cn))]

In [28]:
fm = mcmc.FMClassification(n_iter=500, rank=30)

In [30]:
le = LabelEncoder()

In [31]:
for i in range(len(mask)):
    strain.loc[:, mask[i]] = le.fit_transform(strain[mask[i]])

In [32]:
ohe = OneHotEncoder()

In [33]:
ctr = ohe.fit_transform(strain[mask])

In [34]:
from sklearn.cross_validation import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(ctr, target, test_size=.2, stratify=target)

In [None]:
preds = fm.fit_predict_proba(X_train, y_train, X_test)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_test, preds)

In [26]:
def smooth_target_mean_cv(train, target, C, cv=None):
    if cv == None:
        cv = [([x for x in range(len(train))], [x for x in range(len(train))])]
    if type(cv) == int:
        from sklearn.cross_validation import StratifiedKFold
        cv = StratifiedKFold(target, cv)
    #(среднее значение * размер категории + глобальное среднее значение * C) / (размер категории + С)
    res = np.zeros(train.shape)
    for trx, tsx in cv:
        Xtrain, Ytrain = train.iloc[trx], target[trx]
        Xtest, Ytest = train.iloc[tsx], target[tsx]
        #print(Xtrain.shape, type(Xtrain), Ytrain.shape, type(Ytrain))
        
        mean_target = Ytrain.mean()
        cv_res = res[tsx]
        #print(Xtest)
        for val in Xtrain.unique():
            cat_size = Xtrain.value_counts()[val]
            cat_mean = Ytrain[np.where(Xtrain == val)].mean()
            #print(val, cat_size, cat_mean)
            cv_res[np.where(Xtest == val)] = float(cat_mean * cat_size + mean_target * C) / float(cat_size + C)
            #print(cv_res)
        res[tsx] = cv_res
    return res

In [27]:
skf = StratifiedKFold(target, n_folds=10, random_state=42)

In [37]:
kektrain = train.loc[:, strain.columns.union([i+'_stm' for i in mask])]

In [38]:
kektest = test.loc[:, strain.columns.union([i+'_stm' for i in mask])]

In [39]:
import xgboost as xgb

In [40]:
params = {'objective': 'binary:logistic', 
          'eval_metric': 'auc',
          'eta': 0.0202048,
          'max_depth': 5,
          'subsample': 0.6815,
          'colsample_bytree': 0.701,
          'silent': 1,
          'seed': 0
}

In [41]:
dtrain = xgb.DMatrix(kektrain, target)
dktrain = xgb.DMatrix(kektrain, target, missing=0)

In [42]:
res = xgb.cv(params, dtrain, num_boost_round=1000, nfold=10, stratified=True, early_stopping_rounds=300,
             verbose_eval=True)

Will train until cv error hasn't decreased in 300 rounds.
[0]	cv-test-auc:0.7990869+0.012084441066511936	cv-train-auc:0.8058143999999998+0.005722398783726992
[1]	cv-test-auc:0.8078372999999999+0.012803365432963334	cv-train-auc:0.8172885000000001+0.005586635539392206
[2]	cv-test-auc:0.8108215+0.011622471778842923	cv-train-auc:0.8218501+0.006032536356293262
[3]	cv-test-auc:0.8138019+0.010183201691511374	cv-train-auc:0.8251742999999999+0.006084132888916897
[4]	cv-test-auc:0.8155060000000001+0.01257017604490883	cv-train-auc:0.8283891000000001+0.004733374197124071
[5]	cv-test-auc:0.8171422999999999+0.01274978385738361	cv-train-auc:0.8307667000000001+0.003963999849899093
[6]	cv-test-auc:0.8187840000000002+0.011781132042380307	cv-train-auc:0.8326300999999999+0.002538578714556633
[7]	cv-test-auc:0.8194072+0.011782686720778075	cv-train-auc:0.8339165999999999+0.0023872785426087167
[8]	cv-test-auc:0.8203502+0.012128260615603537	cv-train-auc:0.8347163+0.0023922024182748344
[9]	cv-test-auc:0.821069

KeyboardInterrupt: 

In [43]:
res2 = xgb.cv(params, dktrain, num_boost_round=1000, nfold=10, stratified=True, early_stopping_rounds=300,
              verbose_eval=True)

Will train until cv error hasn't decreased in 300 rounds.
[0]	cv-test-auc:0.7977873999999999+0.012805475072795999	cv-train-auc:0.804816+0.003742168943273412
[1]	cv-test-auc:0.8058588999999999+0.012669438112639408	cv-train-auc:0.8147399+0.007954593647069587
[2]	cv-test-auc:0.8120993999999999+0.014021454954461747	cv-train-auc:0.8230249000000001+0.0047190470319758366
[3]	cv-test-auc:0.8141594+0.014000972424799646	cv-train-auc:0.826634+0.0027881818807244025
[4]	cv-test-auc:0.8151575+0.014008076857656085	cv-train-auc:0.8282733999999999+0.0024758512556290555
[5]	cv-test-auc:0.8167145+0.014267816870495647	cv-train-auc:0.8306479000000001+0.0024007262422025507
[6]	cv-test-auc:0.8167595000000001+0.015046153682918428	cv-train-auc:0.8317422999999999+0.0022131822812411933
[7]	cv-test-auc:0.8182509000000001+0.014384741564936094	cv-train-auc:0.832456+0.002538041370821204
[8]	cv-test-auc:0.81924+0.014299045520593318	cv-train-auc:0.8337230999999999+0.002688838055740788
[9]	cv-test-auc:0.8196821+0.01368

KeyboardInterrupt: 

In [10]:
skf = StratifiedKFold(target, n_folds=10, random_state=42)

In [11]:
res = cross_val_score(RandomForestClassifier(n_estimators=1500, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0),
                      train, target, scoring='roc_auc', cv=skf)

In [12]:
print(res, np.mean(res), np.std(res))

[ 0.81980316  0.82652889  0.80696928  0.82204208  0.82376237  0.83331324
  0.83998096  0.85333348  0.84990458  0.81490572] 0.829054374932 0.0142084848769


In [13]:
rese = cross_val_score(ExtraTreesClassifier(n_estimators=500, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0),
                      train, target, scoring='roc_auc', cv=skf)

In [14]:
print(rese, np.mean(rese), np.std(rese))

[ 0.81169816  0.80918963  0.79401584  0.80832986  0.81212559  0.81999075
  0.82329254  0.83714605  0.82650436  0.80781605] 0.815010881689 0.0114272932867


In [17]:
st = train.ix[:, :292]

In [18]:
rest = cross_val_score(RandomForestClassifier(n_estimators=1500, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0),
                      st, target, scoring='roc_auc', cv=skf)

In [19]:
print(rest, np.mean(rest), np.std(rest))

[ 0.78238588  0.7767728   0.76983743  0.76453665  0.75228351  0.78557231
  0.79920877  0.79396601  0.80247204  0.76939027] 0.779642567504 0.0152907241562


In [20]:
reset = cross_val_score(ExtraTreesClassifier(n_estimators=500, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0),
                      st, target, scoring='roc_auc', cv=skf)

In [21]:
print(reset, np.mean(reset), np.std(reset))

[ 0.77235473  0.75757199  0.74889618  0.73359859  0.72838177  0.74897627
  0.76357969  0.76439285  0.77726978  0.75705314] 0.75520750053 0.0148642490642


In [23]:
test = pd.read_csv('stm_test.csv.gz', index_col=0)

In [25]:
clf = RandomForestClassifier(n_estimators=5000, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0)

In [26]:
clf.fit(train, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=50, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
preds = clf.predict_proba(test)[:, 1]

In [28]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.TARGET = preds
sub.to_csv('submission/rf.csv')

In [29]:
clf = ExtraTreesClassifier(n_estimators=5000, max_depth=50, 
                                             max_features=50, n_jobs=-1,
                                             random_state=0)

In [30]:
clf.fit(train, target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=50, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
preds = clf.predict_proba(test)[:, 1]

In [32]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.TARGET = preds
sub.to_csv('submission/et.csv')