# Import libraries

In [123]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

# Download data from competition's page

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/data

# Load data using pandas

In [236]:
# Read the three competition CSV files from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train2.csv', 'test2.csv', 'sample_submission.csv')
)

# Data

In [125]:
# show only the first row of the training data
train.head(1)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True


In [126]:
# first row of the test data, for comparison with the training columns
test.head(1)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161


In [127]:
# The column to predict is "target"; the sample submission shows the expected layout
sample_submission.head(1)

Unnamed: 0,Id,target
0,0,0.5


## Quick look at the unique values in data...

In [128]:
# Show the first five distinct values of every training column.
# The line is formatted first and passed to print as a single argument, so the
# output is the same whether `print` is the Python 2 statement or the Python 3
# function.
for c in train.columns:
    print('{} {}'.format(c, train[c].unique()[:5]))

year [2998 2999 3000 3001 3002]
day [19 28 30 31 33]
team1 [317  61 110 352 229]
team2 [131  29 141 146  91]
score1 [336 301 359 309 332]
score2 [278 259 267 410 220]
target [True False]


# Cross-validation

### Let's split the data randomly into train and validation sets. We will train our algorithms on the train set and validate them on the validation set. As easy as it can be!

In [129]:
# train size: (number of rows, number of columns)
train.shape 

(101609, 7)

train is quite big, so for example purposes we'll sample only part of it

In [130]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` provides the same splitter.
from sklearn.model_selection import ShuffleSplit

# A single random split: 40% of the rows for training, 10% for validation.
# random_state pins the shuffle so the split is reproducible.
splitter = ShuffleSplit(n_splits=1, train_size=0.4, test_size=0.1, random_state=0)
# split() yields (train_indices, validation_indices) pairs; only one split was
# requested, so take the first pair.
itr, ite = next(splitter.split(train))

information about all functions can be found on the internet, for example

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [131]:
# or you can open it in you Jupyter notebook executing function in this manner
?ShuffleSplit()

In [132]:
# number of rows in the train part and in the validation part
tuple(len(indices) for indices in (itr, ite))

(40643, 10161)

In [133]:
# first five row indices of the train part and of the validation part
itr[:5], ite[:5]

(array([22710, 41665, 91975, 57348, 39931]),
 array([ 37078, 101474,  29858,  61674,   1049]))

now we have validation set "ite" to check the quality of our solution

# features and target

In [134]:
# first two rows of the submission template
sample_submission.head(2)

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5


we need to change 'target' column in "sample_submission" to our predictions.

For now we will select only features that are present in both train and test:

In [135]:
# Keep only the columns a model can use at prediction time: they must exist in
# test as well as in train, and must not be the label itself.
# print is called with a single pre-formatted argument, which behaves the same
# under Python 2 and Python 3.
features = []
for c in train.columns:
    if c in test.columns and c != 'target':
        features.append(c)
        print('"{}" is present in test and train'.format(c))
    else:
        print('"{}" is NOT present in test'.format(c))

features

"year" is present in test and train
"day" is NOT present in test
"team1" is present in test and train
"team2" is present in test and train
"score1" is NOT present in test
"score2" is NOT present in test
"target" is NOT present in test


['year', 'team1', 'team2']

here we split train into "train" and "validation" parts

In [136]:
# training part: feature columns and label for the rows selected by itr
xtrain, ytrain = train.loc[itr, features], train.loc[itr, 'target']

# validation part: the same columns for the rows selected by ite
xval, yval = train.loc[ite, features], train.loc[ite, 'target']

# Baseline solution

let's make a baseline first by predicting the mean value

In [137]:
# mean of the boolean target = share of rows where target is True
train['target'].mean()

0.50096940231672393

In [138]:
# One prediction per validation row, every one equal to the mean of the
# target over the whole training file.
constant_prediction = np.full(len(yval), train.target.mean())
constant_prediction

array([ 0.5009694,  0.5009694,  0.5009694, ...,  0.5009694,  0.5009694,
        0.5009694])

In [139]:
# log loss of the constant prediction on the validation part
log_loss(yval, constant_prediction)

0.6931565015839517

In [140]:
# work on a copy so the sample_submission template itself stays untouched
submission = sample_submission.copy()
submission.target = train['target'].mean() # a column can be accessed as an attribute (.target) or by key (['target'])
# index=False: do not write the DataFrame index as an extra CSV column
submission.to_csv('constant_submission.csv', index=False)

Now this should score like "Baseline - Constant" on Leaderboard!
You can submit this by going to 

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/submissions/attach

## Simple prob solution

In [141]:
# do the team1 and team2 columns contain exactly the same team ids?
set(train["team1"]) == set(train["team2"])

True

In [142]:
# sorted list of all distinct team ids
team_names = sorted(train["team1"].unique())

In [143]:
team_name = 317
# outcomes of the games where this team is listed first / second
outcomes_as_team1 = train.loc[train["team1"] == team_name, "target"]
outcomes_as_team2 = train.loc[train["team2"] == team_name, "target"]

# share of target == True when listed first, share of target == False when
# listed second (the notebook treats target == True as a win for team1)
prob_t1 = float(outcomes_as_team1.sum()) / len(outcomes_as_team1)
prob_t2 = 1 - float(outcomes_as_team2.sum()) / len(outcomes_as_team2)

# probability that the team wins: plain average of the two estimates
all_prob_to_win = 0.5 * (prob_t1 + prob_t2)
all_prob_to_win

0.6967152686762779

In [202]:
# Explicit copies: later cells add columns to val_set, and assigning into a
# frame obtained by slicing `train` raises pandas' SettingWithCopyWarning.
train_set = train.loc[itr].copy()
val_set = train.loc[ite].copy()

In [203]:
%%time
d = {}
for team_name in team_names:
    prob_t1 = float(train_set[train_set["team1"]==team_name].target.sum())/train_set[train_set["team1"]==team_name].target.shape[0]
    prob_t2 = 1 - float(train_set[train_set["team2"]==team_name].target.sum())/train_set[train_set["team2"]==team_name].target.shape[0]

    all_prob_to_win = 0.5*(prob_t1 + prob_t2)    ## probability that team wins
    d[team_name] = all_prob_to_win

CPU times: user 1.68 s, sys: 17.6 ms, total: 1.69 s
Wall time: 1.7 s


In [204]:
# estimated win probability stored for team 146
d[146]

0.8175906183368871

In [220]:
# look up each side's estimated win probability; teams without an entry in d get 0.5
val_set["prob_t1_win"] = val_set["team1"].map(d).fillna(0.5)
val_set["prob_t2_win"] = val_set["team2"].map(d).fillna(0.5)

In [221]:
# complement of team2's win probability
val_set["prob_t2_lost"] = 1 - val_set["prob_t2_win"]

In [222]:
# first rows of the validation part, including the probability columns
# NOTE(review): the saved output of this cell also lists a `pred` column, which
# is only assigned in a later cell; the cells were evidently run out of order.
val_set.head()

Unnamed: 0,year,day,team1,team2,score1,score2,target,prob_t1_win,prob_t2_win,prob_t2_lost,pred
37078,3006,214,305,196,263,247,True,0.457215,0.360509,0.639491,0.554983
101474,3019,226,361,207,178,228,False,0.53947,0.670008,0.329992,0.418075
29858,3005,95,178,204,212,197,True,0.56,0.592766,0.407234,0.519045
61674,3012,90,290,243,224,131,True,0.670447,0.628573,0.371427,0.518998
1049,2998,93,354,186,270,259,True,0.538218,0.480051,0.519949,0.510154


In [227]:
# final prediction: average of "team1 wins" and "team2 loses"
val_set["pred"] = 0.5 * (val_set["prob_t1_win"] + val_set["prob_t2_lost"])

In [228]:
# validation log loss of the per-team probability model
log_loss(val_set["target"], val_set["pred"])

0.65446123409511436

In [246]:
# Copy rather than alias: the following cells add columns to for_subm and then
# drop year/team1/team2 in place. With a plain `for_subm = test` those changes
# hit `test` itself, and the later `test[features]` lookup fails when the
# notebook is run top to bottom.
for_subm = test.copy()

In [247]:
# same lookup as for the validation part; teams without an entry in d get 0.5
for_subm["prob_t1_win"] = for_subm["team1"].map(d).fillna(0.5)
for_subm["prob_t2_win"] = for_subm["team2"].map(d).fillna(0.5)
for_subm["prob_t2_lost"] = 1 - for_subm["prob_t2_win"]
# submitted value: average of "team1 wins" and "team2 loses"
for_subm["target"] = 0.5 * (for_subm["prob_t1_win"] + for_subm["prob_t2_lost"])

In [248]:
# check the looked-up probabilities and the resulting target column
for_subm.head()

Unnamed: 0,Id,year,team1,team2,prob_t1_win,prob_t2_win,prob_t2_lost,target
0,0,3021,363,161,0.437586,0.490602,0.509398,0.473492
1,1,3021,286,2,0.59038,0.386957,0.613043,0.601712
2,2,3020,232,52,0.546654,0.218915,0.781085,0.663869
3,3,3020,84,11,0.40452,0.478844,0.521156,0.462838
4,4,3021,305,39,0.457215,0.689543,0.310457,0.383836


In [254]:
# Keep only the two columns the submission file needs.
# Selecting them creates a new frame, so the cell can be re-run safely and the
# frame for_subm was built from keeps its year/team1/team2 columns; an in-place
# drop fails on a second run and removes those columns for good.
for_subm = for_subm[["Id", "target"]]

In [257]:
# the frame should now hold just Id and target
for_subm.head()

Unnamed: 0,Id,target
0,0,0.473492
1,1,0.601712
2,2,0.663869
3,3,0.462838
4,4,0.383836


In [258]:
# index=False: do not write the DataFrame index as an extra CSV column
for_subm.to_csv("prob_solution.csv", index=False)

# Machine learning

Finally, lets try machine learning!

In [23]:
# logistic regression with default settings, fitted on the training part
alg = linear_model.LogisticRegression()
# fit() is the last expression of the cell, so the fitted estimator is displayed
alg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# predict_proba returns one column per class; column 1 is the second class
# (presumably True for this boolean target; check alg.classes_ to confirm)
prediction = alg.predict_proba(xval)[:,1]

In [25]:
# validation log loss of the logistic regression
log_loss(yval, prediction)

0.69278091662349195

### Well, not so far from the constant solution... Let's try to understand why.

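What a linear model such as LogisticRegression does is multiply each variable by some coefficient, add the results up and, for logistic regression, squash the sum into a probability with the sigmoid function. In our case:

y_predicted = sigmoid(column1 \* coef1 + column2 \* coef2 + column3 \* coef3 + bias), where sigmoid(z) = 1 / (1 + exp(-z))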

We can print coefficients and bias:

In [26]:
# fitted coefficients (one per feature column of xtrain) and the bias term
alg.coef_, alg.intercept_

(array([[  2.31737378e-07,   3.29139376e-04,  -2.98254396e-04]]),
 array([  5.62890115e-09]))

But clearly, "team1" and "team2" are _categorical_ columns, just like names of the teams. 

So we need to turn "team" columns to something linear algorithm can work with. For example first few rows from here

In [27]:
# .loc slices by label and includes the end point, so :2 gives rows 0, 1 and 2
train.loc[:2, 'team1']

0    317
1     61
2    110
Name: team1, dtype: int64

To this:

In [28]:
# one indicator column per distinct team id found in the selected rows
pd.get_dummies(train.loc[:2, 'team1'])

Unnamed: 0,61,110,317
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0


So each team name now has its own column. Read about "pd.get_dummies" here:

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### But let's come back to more interesting stuff for now
### We are competition's solvers, remember? Lets dive into the space of more complicated models!

In [29]:
# Random forest with 15 trees, trained on 4 parallel jobs.
# random_state fixes the bootstrap samples and feature choices, so the
# validation score of this model is the same on every run.
alg = ensemble.RandomForestClassifier(n_estimators=15, n_jobs=4, random_state=0)
alg.fit(xtrain, ytrain)
# column 1 of predict_proba: probability of the second class
prediction = alg.predict_proba(xval)[:,1]

In [30]:
# validation log loss of the 15-tree random forest
log_loss(yval, prediction)

1.1744940647416549

Surprisingly, this doesn't work very well. Now, like a competition pro, let's make our models bigger!

In [31]:
# Same model with ten times more trees (150).
# random_state fixes the bootstrap samples and feature choices, so the
# validation score of this model is the same on every run.
alg = ensemble.RandomForestClassifier(n_estimators=150, n_jobs=4, random_state=0)
alg.fit(xtrain, ytrain)
# column 1 of predict_proba: probability of the second class
prediction = alg.predict_proba(xval)[:,1]

In [32]:
# validation log loss of the 150-tree random forest
log_loss(yval, prediction)

0.74131637899967551

### Almost there! But for now let's skip this model too and go to _real_ competitions stuff

In [33]:
import xgboost

In [34]:
# xgboost settings, kept in one literal so they can be read at a glance
param = {
    'max_depth': 8,                  # maximum depth of each tree
    'booster': 'gbtree',             # tree-based booster
    'objective': 'binary:logistic',  # binary classification, outputs a probability
    'eval_metric': 'logloss',        # metric reported during training
    'eta': 0.1,                      # learning rate
}

# number of boosting rounds
numround = 100

Xgboost parameters

https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [35]:
# wrap the training and validation parts in xgboost's DMatrix containers
Xdatatrain = xgboost.DMatrix(xtrain, label=ytrain)
Xdatatest = xgboost.DMatrix(xval, label=yval)  # holds the validation part despite the name

# data sets whose metric is reported while training, with their display names
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]
# parameters as a list of (name, value) pairs
plst = list(param.items())

# train for numround rounds, reporting the metric every 10th round
bst = xgboost.train(
    plst,
    Xdatatrain,
    numround,
    evals=watchlist,
    verbose_eval=10,
)

[0]	train-logloss:0.689826	eval-logloss:0.691345
[10]	train-logloss:0.670759	eval-logloss:0.681402
[20]	train-logloss:0.658005	eval-logloss:0.67538
[30]	train-logloss:0.647507	eval-logloss:0.669858
[40]	train-logloss:0.638459	eval-logloss:0.66616
[50]	train-logloss:0.628218	eval-logloss:0.661736
[60]	train-logloss:0.619701	eval-logloss:0.658285
[70]	train-logloss:0.6126	eval-logloss:0.655751
[80]	train-logloss:0.604423	eval-logloss:0.652431
[90]	train-logloss:0.597331	eval-logloss:0.649311


Wow! Finally our model is better than the constant prediction! Congratulations! Don't hesitate, submit!

In [95]:
# predict on the feature columns of the test file
test_matrix = xgboost.DMatrix(test[features])
test_prediction = bst.predict(test_matrix)

# put the predictions into a copy of the submission template and save it
ss = sample_submission.copy()
ss.target = test_prediction
ss.to_csv('mighty_xgboost.csv', index=False)

### Strange, but it seems like we got 0.658 instead of 0.649! 

### What could it be? Perhaps we need to train on all the data instead of just 40% of it? Or maybe we should rethink our cross-validation process?

### Let's overview now what we just did here:
1) made cross-validation

2) tried linear models, they didn't work, but we figured out how to tackle this problem

3) tried random forest and almost beat constant benchmark

4) tried xgboost and finally beat constant prediction!

### But there is one last thing you must know before you start this challenge with the most thorough parameter tuning: the data has its secrets, and those who find them will be generously rewarded...

### now, good luck with it!