In [1]:
import pickle
import gzip

import pandas as pd
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import roc_auc_score

import competition_utils as u

In [2]:
import numba as nb

In [26]:
X_train = sp.load_npz('tmp/X_train_sparse.npz')
X_val = sp.load_npz('tmp/X_val_sparse.npz')

y_train = np.load('tmp/y_train.npy', )
y_val = np.load('tmp/y_val.npy', )

y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)

In [27]:

X_num_train = np.load('tmp/X_num_train.npy', )
X_num_val = np.load('tmp/X_num_val.npy', )

In [4]:
@nb.vectorize([nb.int32(nb.int32)], nopython=True, target='parallel')
def hashed(x):
    x = ((x >> 16) ^ x) * 0x45d9f3b
    x = ((x >> 16) ^ x) * 0x45d9f3b
    x = (x >> 16) ^ x
    return x

@nb.vectorize([nb.int32(nb.int32, nb.int32)], nopython=True, target='parallel')
def nonneg_mod(i, mod):
    m = i % mod
    if m < 0:
        m = m + mod
    return m

def rehash_matrix(X):
    n, _ = X.shape
    D = 2 ** 22
    indices_hashed = nonneg_mod(hashed(X.indices) + 13 * hashed(X.data), D)
    data = np.ones_like(X.data, dtype=np.uint8)
    X_res = sp.csr_matrix((data, indices_hashed, X.indptr), shape=(n, D))
    return X_res

%%time
X_train = rehash_matrix(X_train)
X_val = rehash_matrix(X_val)

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [32]:
X_num_train = X_num_train.astype('uint32')
X_num_train = np.hstack([X_num_train, (X_num_train[:, 0] * X_num_train[:, 1]).reshape(-1, 1)], )

In [40]:
X_num_val = X_num_val.astype('uint32')
X_num_val = np.hstack([X_num_val, (X_num_val[:, 0] * X_num_val[:, 1]).reshape(-1, 1)], )

In [41]:
ohe1 = OneHotEncoder(dtype='uint8')
X_num_ohe_train = ohe1.fit_transform(X_num_train)
X_num_ohe_val = ohe1.transform(X_num_val)

In [42]:
cnt_num = X_num_ohe_train.sum(axis=0)
cnt_num = np.asarray(cnt_num)[0]
mask_num = cnt_num >= 50

In [45]:
X_num_ohe_train = X_num_ohe_train[:, mask_num]
X_num_ohe_val = X_num_ohe_val[:, mask_num]

In [50]:
model_num = ftrl.FtrlProximal(alpha=0.5, beta=1, l1=1, l2=0)

for i in tqdm(range(30)):
    model_num.fit(X_num_ohe_train, y_train)
    y_pred = model_num.predict(X_num_ohe_val)
    auc = roc_auc_score(y_val, y_pred)
    print(i + 1, auc)

1 0.642506682693
2 0.642492827052
3 0.641683926036



KeyboardInterrupt: 

In [16]:
cnt = (X_train > 0).sum(axis=0)
cnt = np.asarray(cnt)[0]
mask = cnt >= 50

In [17]:
X_train = X_train[:, mask]
X_val = X_val[:, mask]

In [19]:
X_train = sp.hstack([X_num_ohe_train, X_train, ], format='csr')
X_val = sp.hstack([X_num_ohe_val, X_val, ], format='csr')

In [23]:
import ftrl

1 0.639973905494
2 0.640146381476
3 0.640282567694
4 0.640534610387
5 0.640292584883
6 0.640291063364
7 0.640606071911
8 0.640566775348
9 0.640558171119
10 0.640580402796
11 0.640499300701


KeyboardInterrupt: 

In [25]:
for i in tqdm(range(30)):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    print(i + 1, auc)

1 0.724593215199
2 0.728597578866
3 0.730419357061
4 0.731608066791
5 0.732235884187
6 0.732710338279
7 0.73307411617
8 0.733273428634
9 0.733616817398
10 0.733743675737
11 0.733890267963
12 0.734023468792
13 0.73405509889
14 0.733992976886
15 0.734183652388
16 0.734156286282
17 0.734195739924
18 0.734247615128
19 0.734240461971
20 0.734340156671
21 0.73429368179
22 0.734404413477
23 0.73433469345
24 0.734379999698
25 0.734343744742
26 0.734282003323
27 0.734317453957
28 0.734360634774
29 0.734303076527
30 0.734318455458



In [7]:
for i in tqdm(range(30)):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    print(i + 1, auc)

0.724328867473
0.728590625559
0.730495705164
0.731532878902
0.732231844931
0.73270145889
0.733032186926
0.733351507954
0.733541617152
0.733776918727
0.733888759805
0.733943977512
0.733985556024
0.734170677925
0.734128688444
0.734216945388
0.734240800076
0.734249430881
0.734360581606
0.734338162489
0.734446227374
0.734390811491
0.734452169826
0.734406550549
0.734359516496
0.73438159352
0.73444946328
0.734416310455
0.734439298542
0.734445272918



In [8]:
X = sp.vstack([X_train, X_val])
y = np.concatenate([y_train, y_val])

Full model

In [9]:
%%time

model_full = ftrl.FtrlProximal(alpha=0.1, beta=1, l1=75, l2=25)
model_full.fit(X, y, num_passes=22)

CPU times: user 17min 58s, sys: 1.32 s, total: 18min
Wall time: 2min 31s


In [10]:
shift = 1.1875
scale = 850100

def shifted_scaled_sigmoid(x, shift=0, scale=1):
    s = 1 / (1 + np.exp(-x + shift))
    return (s * scale).round(2)

In [13]:
it_test = u.read_grouped('data/criteo_test_release.txt.gz')

In [12]:
f_out = open('pred_ftrl2.txt', 'w')

for gid, group in tqdm(it_test, total=7087738):
    cols = []
    vals = []

    for line in group:
        cols.append(line.idx)
        vals.append(line.val)

    X_val = u.to_csr(cols, vals)

    pred = model_full.predict(X_val)
    pred = shifted_scaled_sigmoid(pred, shift, scale)

    pred_str = u.to_prediction_str(gid, pred)
    
    f_out.write(pred_str)
    f_out.write('\n')

f_out.flush()
f_out.close()




In [14]:
!gzip pred_ftrl2.txt

In [15]:
import crowdai
challenge = crowdai.Challenge("CriteoAdPlacementNIPS2017", 'd671d30799fa215f63a5cb5049983c79')

[1m[36mCrowdAI.Event.Authentication[0m : Authenticating for challenge = [4m[1m[34mCriteoAdPlacementNIPS2017[0m
[1m[32mCrowdAI.Event.Connection.CONNECTED[0m[1m[32m[0m
[1m[32mCrowdAI.Event.Authentication.SUCCESS[0m : [1m[32mAuthentication Successful[0m


In [16]:
scores = challenge.submit('pred_ftrl2.txt.gz')

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Preparing for file upload[0m


  0% |          |[00:00<00:40]   2.45% /s] 

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Uploading file[0m


100% |██████████|[00:08<00:00]  12.23% /s] 
  0% |          |[00:00<?]  ?% /s] 

[1m[33mCrowdAI.Event.Job.ENQUEUED[0m : 7760cc63-c4fe-49dd-b994-e0ee3485de3b


                      0% |          |[00:00<?]  ?% /s]                     [1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m:   0% |          |[00:00<?]  ?% /s] 

[1m[34mCrowdAI.Event.Job.RUNNING[0m : 7760cc63-c4fe-49dd-b994-e0ee3485de3b
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) Beginning grading of the submission


[1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m: 100% |█████████▉|[1:26:38<00:00]  52.22s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) Scores Computed Successfully !!
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) ImpWt_std: 0.0134043441367
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) Uploading scores to the leaderboard....
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) SNIPS_std: 0.000993291621552
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) SNIPS : 53.0539863777
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) ImpWt : 0.990946687511
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) IPS_std: 2.53459631776
[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) IPS : 52.5736720603


                    [1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m: 100% |█████████▉|[1:26:39<00:00]  52.22s/% ] [1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m: 100% |██████████|[1:26:39<00:00]  257.11s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (7760cc63-c4fe-49dd-b994-e0ee3485de3b) Scores Submited Successfully !!! 
[1m[32mCrowdAI.Event.Job.COMPLETE[0m : [1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m	   🍺 


                    [1m[32m7760cc63-c4fe-49dd-b994-e0ee3485de3b[0m: 100% |██████████|[1:26:39<00:00]  257.11s/% ] 


In [17]:
scores

{'impwt': 0.9909466875114865,
 'impwt_std': 0.013404344136707162,
 'ips': 52.57367206026822,
 'ips_std': 2.5345963177595756,
 'max_instances': 7087738,
 'message': '',
 'snips': 53.05398637770695,
 'snips_std': 0.0009932916215519697}

In [154]:
scores

{'impwt': 0.983055672490548,
 'impwt_std': 0.015840261443422165,
 'ips': 46.11893916159966,
 'ips_std': 1.885006865357641,
 'max_instances': 7087738,
 'message': '',
 'snips': 46.91386302136728,
 'snips_std': 0.0007714217927742332}