In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from tqdm import *

In [2]:
# Load the training features, the target table and the test features.
# The first CSV column of every file is used as the row index.
train = pd.read_csv('stm_train.csv', index_col=0)
target = pd.read_csv('target.csv', index_col=0)
test = pd.read_csv('stm_test.csv.gz', index_col=0)

In [3]:
# Keep only the column labelled '0' as a NumPy array.
# This rebinds `target`, so re-running the cell without reloading the CSV fails
# (an ndarray has no `.loc`).
target = target.loc[:, '0'].values

In [4]:
# First 292 columns of each frame, selected by position.
# `.ix` has been removed from pandas. The column labels are strings (they are
# concatenated with '_stm' further down), so the integer slice that `.ix`
# resolved positionally is exactly what `.iloc` does.
# NOTE(review): presumably these are the original features without the
# '_stm' encodings - confirm against the CSV layout.
strain = train.iloc[:, :292]
stest = test.iloc[:, :292]

In [24]:
# (column name, number of distinct values) for every column of `strain`.
cn = [(col, len(strain[col].unique())) for col in strain.columns]

In [26]:
# Names of the columns that have fewer than 500 distinct values.
mask = strain.columns[[n_distinct < 500 for _, n_distinct in cn]]

In [27]:
from fastFM import mcmc

In [28]:
# Factorization-machine classifier from fastFM's MCMC module
# (500 iterations, rank 30).
fm = mcmc.FMClassification(n_iter=500, rank=30)

In [29]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [30]:
# One encoder instance; it is refit for each column in the loop that follows.
le = LabelEncoder()

In [31]:
# Replace every low-cardinality column of `strain` with integer codes.
# `le` is refit per column, so afterwards it only remembers the last column.
for col in mask:
    strain.loc[:, col] = le.fit_transform(strain[col])

In [32]:
# One-hot encoder with default settings, used on the label-encoded columns.
ohe = OneHotEncoder()

In [33]:
# One-hot encode the label-encoded low-cardinality columns.
# NOTE(review): `ctr` is not referenced again in the visible notebook.
ctr = ohe.fit_transform(strain[mask])

In [5]:
from sklearn.cross_validation import train_test_split

In [6]:
# Stratified 80/20 hold-out split of the positional feature subset.
# A fixed seed keeps the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    strain, target, test_size=.2, stratify=target, random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Feature scaler for the logistic-regression baseline.
sc = StandardScaler()

In [11]:
# Fit the scaler on the training split only, then apply the same
# transformation to the hold-out split.
sctr = sc.fit_transform(X_train)
scts = sc.transform(X_test)

In [15]:
# Logistic-regression baseline with C=100.
lr = LogisticRegression(C=100)

In [16]:
# Train on the scaled training split; the cell displays the fitted estimator.
lr.fit(sctr, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# `roc_auc_score` is imported further down the notebook; importing it here
# keeps this cell runnable when the notebook is executed top to bottom.
from sklearn.metrics import roc_auc_score

# Hold-out ROC AUC from the second column of predict_proba.
roc_auc_score(y_test, lr.predict_proba(scts)[:, 1])

0.77213277114453449

In [None]:
# Fit the factorization machine on the training split and predict
# probabilities for the hold-out split in a single call.
# NOTE(review): fastFM documents scipy.sparse CSC input and labels in {-1, 1};
# X_train here is a DataFrame slice and y_train comes straight from the target
# file - confirm the inputs before running (the cell has no recorded execution).
preds = fm.fit_predict_proba(X_train, y_train, X_test)

In [7]:
from sklearn.metrics import roc_auc_score

In [None]:
# Hold-out ROC AUC of the factorization-machine predictions.
roc_auc_score(y_test, preds)

In [26]:
def smooth_target_mean_cv(train, target, C, cv=None):
    """Encode one categorical column with out-of-fold smoothed target means.

    For every ``(fit_idx, apply_idx)`` pair in ``cv`` the per-category
    statistics are computed on the fit rows only and written to the apply
    rows using

        (category_mean * category_size + global_mean * C) / (category_size + C)

    where ``global_mean`` is the target mean over the fit rows.  Categories
    that do not occur in the fit rows (including missing values) receive
    ``global_mean``, which is what the formula yields for ``category_size == 0``.

    Parameters
    ----------
    train : pandas.Series
        Categorical column to encode; rows are addressed by position.
    target : array-like
        Numeric target aligned positionally with ``train``.
    C : float
        Smoothing strength: pseudo-count of observations at the global mean.
    cv : None, int or iterable of (fit_idx, apply_idx), optional
        ``None`` fits and applies on all rows at once, an int builds that many
        stratified folds, anything else is iterated as given.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``train`` with the encoded values.  Rows that are
        not part of any ``apply_idx`` stay 0.
    """
    target = np.asarray(target)
    n_rows = len(train)

    if cv is None:
        everything = np.arange(n_rows)
        cv = [(everything, everything)]
    elif isinstance(cv, (int, np.integer)):
        try:
            from sklearn.model_selection import StratifiedKFold
        except ImportError:  # older scikit-learn without model_selection
            from sklearn.cross_validation import StratifiedKFold
            cv = StratifiedKFold(target, cv)
        else:
            cv = list(StratifiedKFold(n_splits=cv).split(np.zeros(n_rows), target))

    res = np.zeros(train.shape)
    for trx, tsx in cv:
        fit_keys = np.asarray(train.iloc[trx])
        fit_target = target[trx]
        prior = fit_target.mean()

        # One pass over the fit rows gives the sum and size of every category;
        # sum == category_mean * category_size in the formula above.
        stats = (pd.DataFrame({'key': fit_keys, 'y': fit_target})
                 .groupby('key', sort=False)['y']
                 .agg(['sum', 'count']))
        smoothed = (stats['sum'] + prior * C) / (stats['count'] + C)

        # Unseen or missing categories are absent from `smoothed`; they map to
        # NaN and fall back to the prior.
        apply_keys = pd.Series(np.asarray(train.iloc[tsx]))
        res[tsx] = apply_keys.map(smoothed).fillna(prior).values
    return res

In [27]:
# 10 stratified folds over the target.
# NOTE(review): no shuffle=True is passed; in scikit-learn the seed only
# matters for shuffled splits, so random_state likely has no effect - confirm.
skf = StratifiedKFold(target, n_folds=10, random_state=42)

In [37]:
# From `train`, select the columns of `strain` together with the
# '<name>_stm' column for every low-cardinality column name in `mask`.
kektrain = train.loc[:, strain.columns.union([i+'_stm' for i in mask])]

In [38]:
# Same column selection as for `kektrain`, applied to the test frame.
kektest = test.loc[:, strain.columns.union([i+'_stm' for i in mask])]

In [39]:
import xgboost as xgb

In [40]:
# xgboost booster settings: binary logistic objective evaluated by AUC.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    eta=0.0202048,
    max_depth=5,
    subsample=0.6815,
    colsample_bytree=0.701,
    silent=1,
    seed=0,
)

In [41]:
# Two views of the same training data: default missing-value handling,
# and one built with missing=0.
dtrain = xgb.DMatrix(kektrain, label=target)
dktrain = xgb.DMatrix(kektrain, label=target, missing=0)

In [42]:
# Stratified 10-fold CV on the default-missing matrix: up to 1000 rounds,
# stopping after 300 rounds without improvement.
res = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=300,
    nfold=10,
    stratified=True,
    verbose_eval=True,
)

Will train until cv error hasn't decreased in 300 rounds.
[0]	cv-test-auc:0.7990869+0.012084441066511936	cv-train-auc:0.8058143999999998+0.005722398783726992
[1]	cv-test-auc:0.8078372999999999+0.012803365432963334	cv-train-auc:0.8172885000000001+0.005586635539392206
[2]	cv-test-auc:0.8108215+0.011622471778842923	cv-train-auc:0.8218501+0.006032536356293262
[3]	cv-test-auc:0.8138019+0.010183201691511374	cv-train-auc:0.8251742999999999+0.006084132888916897
[4]	cv-test-auc:0.8155060000000001+0.01257017604490883	cv-train-auc:0.8283891000000001+0.004733374197124071
[5]	cv-test-auc:0.8171422999999999+0.01274978385738361	cv-train-auc:0.8307667000000001+0.003963999849899093
[6]	cv-test-auc:0.8187840000000002+0.011781132042380307	cv-train-auc:0.8326300999999999+0.002538578714556633
[7]	cv-test-auc:0.8194072+0.011782686720778075	cv-train-auc:0.8339165999999999+0.0023872785426087167
[8]	cv-test-auc:0.8203502+0.012128260615603537	cv-train-auc:0.8347163+0.0023922024182748344
[9]	cv-test-auc:0.821069

KeyboardInterrupt: 

In [43]:
# Same CV protocol as above, on the matrix built with missing=0.
res2 = xgb.cv(
    params,
    dktrain,
    num_boost_round=1000,
    early_stopping_rounds=300,
    nfold=10,
    stratified=True,
    verbose_eval=True,
)

Will train until cv error hasn't decreased in 300 rounds.
[0]	cv-test-auc:0.7977873999999999+0.012805475072795999	cv-train-auc:0.804816+0.003742168943273412
[1]	cv-test-auc:0.8058588999999999+0.012669438112639408	cv-train-auc:0.8147399+0.007954593647069587
[2]	cv-test-auc:0.8120993999999999+0.014021454954461747	cv-train-auc:0.8230249000000001+0.0047190470319758366
[3]	cv-test-auc:0.8141594+0.014000972424799646	cv-train-auc:0.826634+0.0027881818807244025
[4]	cv-test-auc:0.8151575+0.014008076857656085	cv-train-auc:0.8282733999999999+0.0024758512556290555
[5]	cv-test-auc:0.8167145+0.014267816870495647	cv-train-auc:0.8306479000000001+0.0024007262422025507
[6]	cv-test-auc:0.8167595000000001+0.015046153682918428	cv-train-auc:0.8317422999999999+0.0022131822812411933
[7]	cv-test-auc:0.8182509000000001+0.014384741564936094	cv-train-auc:0.832456+0.002538041370821204
[8]	cv-test-auc:0.81924+0.014299045520593318	cv-train-auc:0.8337230999999999+0.002688838055740788
[9]	cv-test-auc:0.8196821+0.01368

KeyboardInterrupt: 

In [10]:
# 10 stratified folds over the target, shared by the cross_val_score cells below.
# NOTE(review): no shuffle=True is passed; in scikit-learn the seed only
# matters for shuffled splits, so random_state likely has no effect - confirm.
skf = StratifiedKFold(target, n_folds=10, random_state=42)

In [11]:
# Per-fold ROC AUC of a random forest on the full `train` frame.
res = cross_val_score(
    RandomForestClassifier(
        n_estimators=1500,
        max_depth=50,
        max_features=50,
        random_state=0,
        n_jobs=-1,
    ),
    train,
    target,
    cv=skf,
    scoring='roc_auc',
)

In [12]:
# Fold scores followed by their mean and standard deviation.
print(res, res.mean(), res.std())

[ 0.81980316  0.82652889  0.80696928  0.82204208  0.82376237  0.83331324
  0.83998096  0.85333348  0.84990458  0.81490572] 0.829054374932 0.0142084848769


In [13]:
# Per-fold ROC AUC of extra-trees on the full `train` frame.
rese = cross_val_score(
    ExtraTreesClassifier(
        n_estimators=500,
        max_depth=50,
        max_features=50,
        random_state=0,
        n_jobs=-1,
    ),
    train,
    target,
    cv=skf,
    scoring='roc_auc',
)

In [14]:
# Fold scores followed by their mean and standard deviation.
print(rese, rese.mean(), rese.std())

[ 0.81169816  0.80918963  0.79401584  0.80832986  0.81212559  0.81999075
  0.82329254  0.83714605  0.82650436  0.80781605] 0.815010881689 0.0114272932867


In [17]:
# First 292 columns of `train`, selected by position.
# `.ix` has been removed from pandas; with string column labels its integer
# slice was positional, which is what `.iloc` does explicitly.
st = train.iloc[:, :292]

In [18]:
# Per-fold ROC AUC of a random forest on the 292-column subset `st`.
rest = cross_val_score(
    RandomForestClassifier(
        n_estimators=1500,
        max_depth=50,
        max_features=50,
        random_state=0,
        n_jobs=-1,
    ),
    st,
    target,
    cv=skf,
    scoring='roc_auc',
)

In [19]:
# Fold scores followed by their mean and standard deviation.
print(rest, rest.mean(), rest.std())

[ 0.78238588  0.7767728   0.76983743  0.76453665  0.75228351  0.78557231
  0.79920877  0.79396601  0.80247204  0.76939027] 0.779642567504 0.0152907241562


In [20]:
# Per-fold ROC AUC of extra-trees on the 292-column subset `st`.
reset = cross_val_score(
    ExtraTreesClassifier(
        n_estimators=500,
        max_depth=50,
        max_features=50,
        random_state=0,
        n_jobs=-1,
    ),
    st,
    target,
    cv=skf,
    scoring='roc_auc',
)

In [21]:
# Fold scores followed by their mean and standard deviation.
print(reset, reset.mean(), reset.std())

[ 0.77235473  0.75757199  0.74889618  0.73359859  0.72838177  0.74897627
  0.76357969  0.76439285  0.77726978  0.75705314] 0.75520750053 0.0148642490642


In [23]:
# Reload the test features (same file and index column as the initial load).
test = pd.read_csv('stm_test.csv.gz', index_col=0)

In [25]:
# Final random forest: same settings as in cross-validation, with 5000 trees.
clf = RandomForestClassifier(
    n_estimators=5000,
    max_depth=50,
    max_features=50,
    random_state=0,
    n_jobs=-1,
)

In [26]:
# Fit on the complete training frame; the cell displays the fitted estimator.
clf.fit(train, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=50, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
# Second column of predict_proba for every test row.
preds = clf.predict_proba(test)[:, 1]

In [28]:
# Write the random-forest probabilities in the sample-submission layout.
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
# Item assignment instead of `sub.TARGET = ...`: attribute assignment only
# updates a column that already exists; otherwise it silently attaches a plain
# attribute and the CSV would be written without the predictions.
sub['TARGET'] = preds
sub.to_csv('submission/rf.csv')

In [29]:
# Final extra-trees model with 5000 trees; rebinds `clf`.
clf = ExtraTreesClassifier(
    n_estimators=5000,
    max_depth=50,
    max_features=50,
    random_state=0,
    n_jobs=-1,
)

In [30]:
# Fit on the complete training frame; the cell displays the fitted estimator.
clf.fit(train, target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=50, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
# Second column of predict_proba for every test row; overwrites the earlier `preds`.
preds = clf.predict_proba(test)[:, 1]

In [32]:
# Write the extra-trees probabilities in the sample-submission layout.
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
# Item assignment instead of `sub.TARGET = ...`: attribute assignment only
# updates a column that already exists; otherwise it silently attaches a plain
# attribute and the CSV would be written without the predictions.
sub['TARGET'] = preds
sub.to_csv('submission/et.csv')