In [1]:
import sqlite3
import zipfile
import pandas as pd
import numpy as np
from pandas.io import sql
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# In order to calculate Log Loss the classifier must assign a probability to each class rather than 
# simply yielding the most likely class. 

def evaluation(label, pred_label, label_base=0):
    """Return the mean log loss of predicted class probabilities.

    Args:
        label: True integer class labels, one per sample (list, NumPy array
            or pandas Series). Values are read positionally, so the index of
            a Series is ignored.
        pred_label: Per-sample class probabilities with shape
            (n_samples, n_classes). Column ``j`` must hold the probability of
            the class whose label is ``label_base + j``.
        label_base: Label value that corresponds to column 0 of
            ``pred_label``. The default 0 matches 0/1 labels; pass 1 for
            labels that are numbered starting from 1.

    Returns:
        The negative mean of ``log(p)``, where ``p`` is the probability
        assigned to each sample's true class, clipped to
        ``[1e-15, 1 - 1e-15]``.

    Raises:
        ValueError: If the inputs are empty, have mismatched lengths, or a
            label does not map to a column of ``pred_label``.
    """
    eps = 1e-15  # clipping bound keeps log() finite for probabilities of 0 or 1

    labels = np.asarray(label, dtype=np.intp)
    if labels.size == 0:
        raise ValueError("cannot compute log loss of an empty label set")

    probs = np.asarray(pred_label, dtype=float)
    if probs.ndim != 2:
        raise ValueError("pred_label must be 2-D: (n_samples, n_classes)")
    if labels.ndim != 1 or labels.shape[0] != probs.shape[0]:
        raise ValueError("label and pred_label must have the same length")

    # Map label values to column positions. Validate explicitly so that a
    # negative position cannot silently wrap around to the last column.
    columns = labels - label_base
    if columns.min() < 0 or columns.max() >= probs.shape[1]:
        raise ValueError("label value has no matching column in pred_label")

    # Pick, for every row, the probability predicted for its true class.
    true_class_prob = probs[np.arange(labels.shape[0]), columns]
    clipped = np.clip(true_class_prob, eps, 1 - eps)
    return float(-np.mean(np.log(clipped)))

# Seed shared by the row shuffle and the forest so that runs are reproducible.
RANDOM_SEED = 0
# Number of shuffled rows held out for validation; the rest is used to train.
VALIDATION_SIZE = 20000

# Get train data
# Positives (IsClick=1) and negatives (IsClick=0) are fetched by separate
# queries, each with its own row limit and offset.
query1 = """
select Position, HistCTR, IsClick from trainSearchStream where IsClick=1 limit 6000 offset 12345;
"""
query0 = """
select Position, HistCTR, IsClick from trainSearchStream where IsClick=0 limit 1000000 offset 12345678;
"""

# Get test data
# The explicit alias pins the result column name: SQLite only guarantees the
# name of a result column when an AS clause is given, and the submission step
# below reads the column as `TestId`.
query_test = """
select TestID as TestId, Position, HistCTR from testSearchStream where ObjectType = 3
"""

conn = sqlite3.connect('database.sqlite')
try:
    # ignore_index avoids duplicate index labels from the two result sets.
    df = pd.concat(
        [sql.read_sql(query0, conn), sql.read_sql(query1, conn)],
        ignore_index=True,
    )
    df_test = sql.read_sql(query_test, conn)
finally:
    # All queries are done; release the database handle even on failure.
    conn.close()
print (df)

# The concatenated frame holds every negative row before any positive row.
# Shuffle before splitting so that both the validation and the training
# slices contain a mix of the two classes.
shuffled = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
X = shuffled[['Position', 'HistCTR']]
y = shuffled.IsClick

val_data = X[0:VALIDATION_SIZE]
val_label = y[0:VALIDATION_SIZE]
train_data = X[VALIDATION_SIZE:]
train_label = y[VALIDATION_SIZE:]

X_test = df_test[['Position', 'HistCTR']]

# Learn
model = RandomForestClassifier(random_state=RANDOM_SEED)
model.fit(train_data, train_label)

val_pred_label = model.predict_proba(val_data)
logloss = evaluation(val_label, val_pred_label)
print ("logloss of validation set:",logloss)


pred = model.predict_proba(X_test)

# Output to csv
# predict_proba columns follow model.classes_ (sorted labels), so column 1 is
# the predicted probability of IsClick == 1.
filename = 'submission.csv'
submission = pd.DataFrame({'ID': df_test['TestId'], 'IsClick': pred[:, 1]})
submission.to_csv(filename, index=False)


      Position   HistCTR  IsClick
0            1  0.005386        0
1            7  0.002415        0
2            1  0.000786        0
3            1  0.024390        0
4            1  0.022997        0
5            7  0.005568        0
6            7  0.016668        0
7            1  0.009364        0
8            7  0.035114        0
9            1  0.004108        0
10           1  0.009109        0
11           7  0.009434        0
12           1  0.003874        0
13           1  0.003339        0
14           7  0.001186        0
15           1  0.008010        0
16           7  0.000744        0
17           1  0.007241        0
18           7  0.005038        0
19           1  0.009445        0
20           1  0.001402        0
21           7  0.003703        0
22           7  0.004094        0
23           1  0.005846        0
24           1  0.046817        0
25           7  0.010852        0
26           1  0.004338        0
27           7  0.002739        0
28           7