# Import libraries

In [18]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

# Download data from competition's page

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/data

# Load data using pandas

In [19]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data

In [20]:
# print first row
train[:1]

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True


In [21]:
test[:1]

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161


In [22]:
# Target variable is "target" and this means we will be predicting it
sample_submission[:1]

Unnamed: 0,Id,target
0,0,0.5


## Quick look at the unique values in data...

In [23]:
for c in train.columns:
    print c, train[c].unique()[:5]

year [2998 2999 3000 3001 3002]
day [19 28 30 31 33]
team1 [317  61 110 352 229]
team2 [131  29 141 146  91]
score1 [336 301 359 309 332]
score2 [278 259 267 410 220]
target [True False]


# Cross-validation

### Lets split data randomly to train and validatation. We will train our algorithms on selected train set and validate them on validation set. Easy as it can be!

In [24]:
# train size
train.shape 

(101609, 7)

train is quite big, so for example purposes we'll sample only part of it

In [25]:
from sklearn.cross_validation import ShuffleSplit

for itr, ite in ShuffleSplit(len(train), n_iter=1, train_size=0.4, test_size=0.1, random_state=0):
    pass

information about all functions can be found on the internet, for example

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [26]:
# or you can open it in you Jupyter notebook executing function in this manner
?ShuffleSplit()

In [27]:
len(itr), len(ite)

(40643, 10161)

In [28]:
itr[:5], ite[:5]

(array([22710, 41665, 91975, 57348, 39931]),
 array([ 37078, 101474,  29858,  61674,   1049]))

now we have validation set "ite" to check the quality of our solution

# features and target

In [29]:
sample_submission[:2]

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5


we need to change 'target' column in "sample_submission" to our predictions.

For now we will select only features that are present in both train and test:

In [30]:
features = []
for c in train.columns:
    if c in test.columns and c!='target':
        features += [c]
        print '"{}" is present in test and train'.format(c)
    else:
        print '"{}" is NOT present in test'.format(c)
        
features

"year" is present in test and train
"day" is NOT present in test
"team1" is present in test and train
"team2" is present in test and train
"score1" is NOT present in test
"score2" is NOT present in test
"target" is NOT present in test


['year', 'team1', 'team2']

here we split train on "train" and "validation" parts

In [31]:
xtrain = train.loc[itr, features]    
ytrain = train.loc[itr, 'target']

xval = train.loc[ite, features]
yval = train.loc[ite, 'target']

# Baseline solution

lets make baseline first by predicting the mean value

In [15]:
train.target.mean()

0.50096940231672393

In [16]:
constant_prediction = yval * 0 + train.target.mean()
constant_prediction = constant_prediction.values
constant_prediction

array([ 0.5009694,  0.5009694,  0.5009694, ...,  0.5009694,  0.5009694,
        0.5009694])

In [17]:
log_loss(yval, constant_prediction)

0.6931565015839517

In [18]:
submission = sample_submission.copy()
submission.target = train['target'].mean() # notice here that we can refer to a column 'target' in two ways
submission.to_csv('constant_submission.csv', index=False)

Now this should score like "Baseline - Constant" on Leaderboard!
You can submit this by going to 

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/submissions/attach

## Data exploration

In [450]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [436]:
pd.unique(train.year)

array([2998, 2999, 3000, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008,
       3009, 3010, 3011, 3012, 3013, 3014, 3015, 3016, 3017, 3018, 3019])

In [453]:
pd.unique(test.year) # нужно предсказать результаты последующих матчей по предыдущим

array([3021, 3020])

In [459]:
features

['year', 'team1', 'team2']

In [468]:
tr = train[train.year < 3018]
val = train[train.year >= 3018]

xtrain = tr.loc[:, features]    
ytrain = tr.loc[:, 'target']

xval = val.loc[:, features]
yval = val.loc[:, 'target']

## New features adding

In [498]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [499]:
tr = train.loc[itr]
val_set = train.loc[ite]

In [477]:
freq_counter = (train.groupby("team2").count().target + train.groupby("team1").count().target).to_dict()
# сколько раз играла каждая команда

In [478]:
team_names = sorted(pd.unique(train["team1"]))

In [479]:
av_scores={}
for team_name in team_names:
    av_scores[team_name] = (train[train.team1 == team_name].score1.mean() + train[train.team2 == team_name].score2.mean())/2

In [480]:
%%time
d = {}
for team_name in team_names:
    prob_t1 = float(train[train["team1"]==team_name].target.sum())/train[train["team1"]==team_name].target.shape[0]
    prob_t2 = 1 - float(train[train["team2"]==team_name].target.sum())/train[train["team2"]==team_name].target.shape[0]

    all_prob_to_win = 0.5*(prob_t1 + prob_t2)    ## probability that team wins
    d[team_name] = all_prob_to_win

CPU times: user 1.85 s, sys: 21.1 ms, total: 1.88 s
Wall time: 1.89 s


In [500]:
train["prob_t1_win"] = train["team1"].apply(lambda x: d.get(x, 0.5))
train["prob_t2_win"] = train["team2"].apply(lambda x: d.get(x, 0.5))
train["freq_cnt"] = train["team2"].apply(lambda x: freq_counter.get(x, 0))
train["av_score"] = train["team2"].apply(lambda x: av_scores.get(x, 0))

tr = train[train.year < 3018]
val = train[train.year >= 3018]

In [501]:
tr.drop(["day", "year", "score1", "score2", "team1", "team2"], axis=1, inplace=True)
val.drop(["day", "year", "score1", "score2", "team1", "team2"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [504]:
feat = list(tr.columns)
feat.remove('target')

In [507]:
feat

['prob_t1_win', 'prob_t2_win', 'freq_cnt', 'av_score']

In [505]:
xtrain = train.loc[itr, feat]    
ytrain = train.loc[itr, 'target']

xval = train.loc[ite, feat]
yval = train.loc[ite, 'target']

In [508]:
xtrain = tr.loc[:, feat]    
ytrain = tr.loc[:, 'target']

xval = val.loc[:, feat]
yval = val.loc[:, 'target']

In [527]:
param = {}
param['max_depth'] = 2
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.15
#param['n_estimators'] = 10
param['subsample'] = 0.6
param['colsample_bytree'] = 0.6

numround = 150

In [528]:
Xdatatrain = xgb.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgb.DMatrix(data = xval, label = yval)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgb.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
# ypredxgb_tr = bst.predict(Xdatatrain)

[0]	train-logloss:0.68698	eval-logloss:0.687772
[10]	train-logloss:0.643444	eval-logloss:0.645914
[20]	train-logloss:0.629259	eval-logloss:0.633108
[30]	train-logloss:0.622403	eval-logloss:0.62701
[40]	train-logloss:0.618663	eval-logloss:0.623989
[50]	train-logloss:0.617567	eval-logloss:0.623701
[60]	train-logloss:0.617051	eval-logloss:0.623422
[70]	train-logloss:0.616074	eval-logloss:0.62296
[80]	train-logloss:0.615465	eval-logloss:0.622746
[90]	train-logloss:0.614939	eval-logloss:0.622579
[100]	train-logloss:0.614613	eval-logloss:0.622833
[110]	train-logloss:0.614308	eval-logloss:0.622814
[120]	train-logloss:0.613925	eval-logloss:0.622806
[130]	train-logloss:0.61364	eval-logloss:0.623003
[140]	train-logloss:0.613294	eval-logloss:0.622542


In [424]:
tr = xgb.DMatrix(data = train[feat], label = train["target"])

In [425]:
bst = xgb.train(plst, tr, numround, evals = [(tr, 'train')], verbose_eval = 10)

[0]	train-logloss:0.678579
[10]	train-logloss:0.624711
[20]	train-logloss:0.615956
[30]	train-logloss:0.613713
[40]	train-logloss:0.612369
[50]	train-logloss:0.611221
[60]	train-logloss:0.610241
[70]	train-logloss:0.609353
[80]	train-logloss:0.608193
[90]	train-logloss:0.607353
[100]	train-logloss:0.606773
[110]	train-logloss:0.605908
[120]	train-logloss:0.605437
[130]	train-logloss:0.60468
[140]	train-logloss:0.604049
[150]	train-logloss:0.603518
[160]	train-logloss:0.60278
[170]	train-logloss:0.602382
[180]	train-logloss:0.601766
[190]	train-logloss:0.601343
[200]	train-logloss:0.600959
[210]	train-logloss:0.60042
[220]	train-logloss:0.599958
[230]	train-logloss:0.599566
[240]	train-logloss:0.599087


In [426]:
test["prob_t1_win"] = test["team1"].apply(lambda x: d.get(x, 0.5))
test["prob_t2_win"] = test["team2"].apply(lambda x: d.get(x, 0.5))
test["freq_cnt"] = test["team2"].apply(lambda x: freq_counter.get(x, 0))
test["av_score"] = test["team2"].apply(lambda x: av_scores.get(x, 0))

In [427]:
test.drop(["year", "team1", "team2"], axis=1, inplace=True)

In [428]:
ss = sample_submission.copy()
ss.target = bst.predict(xgb.DMatrix(test[feat]))

In [429]:
ss.to_csv('tuned_xgboost2.csv', index=False)

1) Смешать несколько разных моделей: бустинг, линейную модель, случайный лес. Например взять взвешенную сумму предсказаний: coeff1 * ypred1 + (1 - coeff1) * ypred2.

2) Сделать несложный трюк с данными перед тем как отдавать их в бустинг) Требует понимания структуры данных.

3) Добавить новые фичи :) Опять же, нужно использовать понимание того, с какими данными мы имеем дело.

Любой из этих идей может быть достаточно)

## Simple prob solution

In [85]:
sorted(pd.unique(train["team1"])) == sorted(pd.unique(train["team2"])) # the same commands!!

True

In [86]:
team_names = sorted(pd.unique(train["team1"]))

In [87]:
team_name = 317
prob_t1 = float(train[train["team1"]==team_name].target.sum())/train[train["team1"]==team_name].target.shape[0]
prob_t2 = 1 - float(train[train["team2"]==team_name].target.sum())/train[train["team2"]==team_name].target.shape[0]

all_prob_to_win = 0.5*(prob_t1 + prob_t2)    ## probability that team wins
all_prob_to_win

0.6967152686762779

In [89]:
%%time
d = {}
for team_name in team_names:
    prob_t1 = float(train[train["team1"]==team_name].target.sum())/train[train["team1"]==team_name].target.shape[0]
    prob_t2 = 1 - float(train[train["team2"]==team_name].target.sum())/train[train["team2"]==team_name].target.shape[0]

    all_prob_to_win = 0.5*(prob_t1 + prob_t2)    ## probability that team wins
    d[team_name] = all_prob_to_win

CPU times: user 1.95 s, sys: 22.1 ms, total: 1.97 s
Wall time: 1.99 s


In [90]:
d[146]

0.7885674931129476

In [26]:
val_set["prob_t1_win"] = val_set["team1"].apply(lambda x: d.get(x, 0.5))
val_set["prob_t2_win"] = val_set["team2"].apply(lambda x: d.get(x, 0.5))

In [27]:
val_set["prob_t2_lost"] = 1 - val_set["prob_t2_win"]

In [28]:
val_set.head()

Unnamed: 0,year,day,team1,team2,score1,score2,target,prob_t1_win,prob_t2_win,prob_t2_lost
37078,3006,214,305,196,263,247,True,0.472638,0.362673,0.637327
101474,3019,226,361,207,178,228,False,0.509714,0.673565,0.326435
29858,3005,95,178,204,212,197,True,0.578424,0.540334,0.459666
61674,3012,90,290,243,224,131,True,0.666226,0.62823,0.37177
1049,2998,93,354,186,270,259,True,0.486889,0.466581,0.533419


In [29]:
val_set["pred"] = (val_set["prob_t1_win"] + val_set["prob_t2_lost"])/2

In [30]:
log_loss(list(val_set["target"]), list(val_set["pred"]))

0.65163204966445343

In [69]:
test = pd.read_csv('test2.csv')
for_subm = test

In [70]:
for_subm["prob_t1_win"] = for_subm["team1"].apply(lambda x: d.get(x, 0.5))
for_subm["prob_t2_win"] = for_subm["team2"].apply(lambda x: d.get(x, 0.5))
for_subm["prob_t2_lost"] = 1 - for_subm["prob_t2_win"]

for_subm["target"] = (for_subm["prob_t1_win"] + for_subm["prob_t2_lost"])/2

In [71]:
for_subm.head()

Unnamed: 0,Id,year,team1,team2,prob_t1_win,prob_t2_win,prob_t2_lost,target
0,0,3021,363,161,0.42954,0.543362,0.456638,0.443089
1,1,3021,286,2,0.578631,0.387953,0.612047,0.595339
2,2,3020,232,52,0.546586,0.21181,0.78819,0.667388
3,3,3020,84,11,0.429677,0.487874,0.512126,0.470902
4,4,3021,305,39,0.472638,0.688156,0.311844,0.392241


In [72]:
for_subm.drop(["prob_t1_win", "prob_t2_win", "prob_t2_lost", "year", "team1", "team2"], axis=1, inplace=True)

In [77]:
for_subm.head()

Unnamed: 0,Id,target
0,0,0
1,1,1
2,2,1
3,3,0
4,4,0


In [78]:
for_subm.to_csv("prob_solution3.csv", index=False)

# Machine learning

Finally, lets try machine learning!

In [17]:
alg = linear_model.LogisticRegression()
alg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
prediction = alg.predict_proba(xval)[:,1]

In [19]:
log_loss(yval, prediction)

0.69278081748011511

### Well, not so far from the constant solution... Let's try to understand why.

What's a linear model such as LogisticRegression is trying to do is multiply each variable on some coefficient and add add it up, in our case:

y_predicted = column1 \* coef1 + column2 \* coef2 + column3 \* coef3 + bias

We can print coefficients and bias:

In [20]:
alg.coef_, alg.intercept_

(array([[  1.78001477e-07,   3.28971478e-04,  -2.98418706e-04]]),
 array([  3.70539525e-09]))

But clearly, "team1" and "team2" are _categorical_ columns, just like names of the teams. 

So we need to turn "team" columns to something linear algorithm can work with. For example first few rows from here

In [21]:
train.loc[:2, 'team1']

0    317
1     61
2    110
Name: team1, dtype: int64

To this:

In [22]:
pd.get_dummies(train.loc[:2, 'team1'])

Unnamed: 0,61,110,317
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0


So each team name now has it's own column. Read about "pd.get_dummies" here:

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### But let's come back to more interesting stuff for now
### We are competition's solvers, remember? Lets dive into the space of more complicated models!

In [23]:
alg = ensemble.RandomForestClassifier(15, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [24]:
log_loss(yval, prediction)

1.1153333082796977

Surprisingly, this doesn't work very well. Now, like competition pro, let's make our models bigger!

In [25]:
alg = ensemble.RandomForestClassifier(150, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [26]:
log_loss(yval, prediction)

0.7388226194069718

### Almost there! But for now let's skip this model too and go to _real_ competitions stuff

In [32]:
import xgboost as xgb

In [18]:
param = {}
param['max_depth'] = 8
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.2

param['subsample'] = 0.7
param['colsample_bytree'] = 0.7

numround = 200

In [19]:
param.items()

[('colsample_bytree', 0.7),
 ('eval_metric', 'logloss'),
 ('subsample', 0.7),
 ('eta', 0.2),
 ('objective', 'binary:logistic'),
 ('max_depth', 8),
 ('booster', 'gbtree')]

Xgboost parameters

https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [16]:
Xdatatrain = xgb.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgb.DMatrix(data = xval, label = yval)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgb.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
# ypredxgb_tr = bst.predict(Xdatatrain)

NameError: name 'xtrain' is not defined

Wow! Finally our model better than constant predictions! Congratulations! Don't hesitate, submit!

In [19]:
tr = xgb.DMatrix(data = train[features], label = train["target"])

In [20]:
bst = xgb.train(plst, tr, numround, evals = [(tr, 'train')], verbose_eval = 10)

[0]	train-logloss:0.689124
[10]	train-logloss:0.658489
[20]	train-logloss:0.64207
[30]	train-logloss:0.628206
[40]	train-logloss:0.615129
[50]	train-logloss:0.605808
[60]	train-logloss:0.597754
[70]	train-logloss:0.591092
[80]	train-logloss:0.583741
[90]	train-logloss:0.577176
[100]	train-logloss:0.572124
[110]	train-logloss:0.566043
[120]	train-logloss:0.560684
[130]	train-logloss:0.555131
[140]	train-logloss:0.550893
[150]	train-logloss:0.547827
[160]	train-logloss:0.543338
[170]	train-logloss:0.540397
[180]	train-logloss:0.537018
[190]	train-logloss:0.533727


In [151]:
ss = sample_submission.copy()

ss.target = bst.predict(xgb.DMatrix(test[features]))
ss.to_csv('tuned_xgboost.csv', index=False)

## xgboost with hyperopt

In [53]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import sys
# The path to XGBoost wrappers goes here
import xgboost as xgb

In [54]:
def score(params):
    print "Training with params : "
    print params
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(data = xtrain, label = ytrain)
    dvalid = xgb.DMatrix(data = xval, label = yval)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round, evals=watchlist, verbose_eval = 10)
    predictions = model.predict(dvalid)
    score = log_loss(yval, predictions)
    print "\tScore {0}\n\n".format(score)
    return {'loss': score, 'status': STATUS_OK}

In [55]:
def optimize(trials):
    space = {
             'n_estimators' : 200, #hp.choice('n_estimators', np.arange(100, 200, 50, dtype=int)),
             #'eta' : 0.1, #hp.choice('eta', np.arange(0.1, 0.2, 0.1, dtype=float)),
             'max_depth' : hp.choice('max_depth', np.arange(8, 12)),
             #'subsample' : 0.7,#hp.choice('subsample', np.arange(0.7, 0.9, 0.1, dtype=float)),
             #'colsample_bytree' : 0.7,#hp.choice('colsample_bytree', np.arange(0.7, 0.9, 0.1, dtype=float)),
             #'booster': 'gbtree',
             'eval_metric': 'logloss',
             'objective': 'binary:logistic',
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)

    print best

In [56]:
trials = Trials()

optimize(trials)

Training with params : 
{'n_estimators': 200, 'objective': 'binary:logistic', 'max_depth': 10, 'eval_metric': 'logloss', 'silent': 1}
[0]	eval-logloss:0.687281	train-logloss:0.678226
[10]	eval-logloss:0.662086	train-logloss:0.61144
[20]	eval-logloss:0.6538	train-logloss:0.576151
[30]	eval-logloss:0.6497	train-logloss:0.550356
[40]	eval-logloss:0.643803	train-logloss:0.519673
[50]	eval-logloss:0.644238	train-logloss:0.505495
[60]	eval-logloss:0.643028	train-logloss:0.483072
[70]	eval-logloss:0.64279	train-logloss:0.463482
[80]	eval-logloss:0.64387	train-logloss:0.444722
[90]	eval-logloss:0.645035	train-logloss:0.433404
[100]	eval-logloss:0.646799	train-logloss:0.42103
[110]	eval-logloss:0.648315	train-logloss:0.409799
[120]	eval-logloss:0.65037	train-logloss:0.400508
[130]	eval-logloss:0.651744	train-logloss:0.389124
[140]	eval-logloss:0.653998	train-logloss:0.380241
[150]	eval-logloss:0.655873	train-logloss:0.370247
[160]	eval-logloss:0.658325	train-logloss:0.36262
[170]	eval-logloss:0

In [None]:
Xdatatrain = xgb.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgb.DMatrix(data = xval, label = yval)

In [44]:
space ={
        'max_depth': hp.choice('max_depth', np.arange(8, 12, 1, dtype=int)),
        'subsample': hp.choice('subsample', np.arange(0.8, 1, 0.1, dtype=int)),
    }

In [45]:
space['max_depth']

<hyperopt.pyll.base.Apply at 0x11a1c79d0>

In [50]:
(9,)[0]

9

Вот несколько идей, которые помогут воспроизвести новый baseline:

1) Чтобы тюнить алгоритмы, нужно иметь "правильную" кросс-валидацию. В сореваниях "правильной" кросс-валидацией является та, которая имеет ту же структуру, как и разбиение данных на трейн и тест. Если кросс-валидация "правильная", то улучшение качества на ней будет соответствовать улучшению качества на лидерборде. Если "неправильная", то можно переобучиться - улучшить качество на кросс-валидации, но ухудшить его на лидерборде (то есть на тестовых данных). 

2) Xgboost - это алгоритм бустинга деревьев. На каждом шаге к уже имеющемуся набору деревьев добавляется новое, таким образом, чтобы уменьшить ошибку всей композиции. Старые деревья не меняются. Таким образом, если на шаге k текущее предсказание бустинга это Yk, а предсказание от нового дерева это pk, то Y{k+1} = Yk + coeff * pk.

Основные параметры хгбуста:

max_depth - глубина деревьев в бустинге,

subsample - каждое дерево обучается на случайной подвыборке данных, пропорциональной значению subsample. Subsample == 1 значит, что каждой дерево получает все строки данных, Subsample == 0.5 — случайно выбранную половину.

colsample_bytree - выбор доли признаков, которые будут использованы одним деревом в композиции. Аналогично subsample.

eta - коэффициент с которым новое дерево в композиции влияет на уже имеющееся предсказание. eta это максимальное по модулю значение, которое может принимать coeff из формулы выше.

https://github.com/dmlc/xgboost/blob/master/doc/param..

Инвайт в контест https://kaggle.com/join/dmia_sport0_join


### Strange, but it seems like we got 0.658 instead of 0.649! 

### What could it be? Perhabs we need to train on all data instead of just 40% of it? Or may be should think over our cross-validation process?

### Let's overview now what we just did here:
1) made cross-validation

2) tried linear models, they didn't work, but we figured out how to tackle this problem

3) tried random forest and almost beat constant benchmark

4) tried xgboost and finally beat constant prediction!

### But there is the last thing you must know before you'll start this challenge by trying to make the most thorough parameter tuning: the data has it's secrets and those who will find them will be generously rewarded...

### now, good luck with it!