In [None]:
!ls data/

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
!zcat data/criteo_train.txt.gz | head

In [20]:
import gzip

import pandas as pd

import numpy as np
import scipy.sparse as sp

from tqdm import tqdm_notebook as tqdm

In [22]:
def parse_features(s):
    split = s.split(' ')
    f0 = split[0]
    assert f0.startswith('0:')
    f0 = int(f0[2:])

    f1 = split[1]
    assert f1.startswith('1:')
    f1 = int(f1[2:])

    idx = []
    values = []
    
    for fv in split[2:]:
        f, v = fv.split(':')
        idx.append(int(f))
        values.append(int(v))

    return f0, f1, idx, values

In [None]:
results = []

with gzip.open('data/criteo_train.txt.gz', 'r') as f:
#with gzip.open('data/criteo_train_small.txt.gz', 'r') as f:
    for line in tqdm(f):    
        line = line.decode()
        split = line.split('|')
        id = int(split[0].strip())

        label = None
        propensity = None
        features = None

        if len(split) == 4:
            l = split[1]
            assert l.startswith('l')

            l = l.lstrip('l ').strip()
            if l == '0.999':
                label = 0
            elif l == '0.001':
                label = 1
            else:
                raise Exception('ololo')

            p = split[2]
            assert p.startswith('p')
            p = p.lstrip('p ').strip()
            propensity = float(p)

            features = split[3].lstrip('f ').strip()

            f0, f1, idx, val = parse_features(features)
        elif len(split) == 2:
            continue
#             features = split[1].lstrip('f ').strip()
#         else:
#             raise Exception('ololo')

        results.append((id, f0, f1, idx, val, propensity, label))

In [None]:
len(results)

In [None]:
df_train = pd.DataFrame(results, columns=['id', 'f0', 'f1', 'col_idx', 'values', 'propensity', 'click'])

In [None]:
import pickle 
with open('tmp/df_train_clicks.bin', 'wb') as f:
    pickle.dump(df_train, f)

In [None]:
df_train.click.mean()

In [None]:
!free -h

In [2]:
import pickle 
with open('tmp/df_train_clicks.bin', 'rb') as f:
    df_train = pickle.load(f)

In [3]:
!free -h

             total       used       free     shared    buffers     cached
Mem:           31G        30G       663M        60K        66M       6.0G
-/+ buffers/cache:        24G       6.8G
Swap:          15G       2.5G        13G


In [5]:
y = df_train.click.values.astype('uint8')

In [6]:
X_num = df_train[['f0', 'f1']].values.astype('float32')

In [10]:
with open('tmp/y.bin', 'wb') as f:
    pickle.dump(y, f)

with open('tmp/X_num.bin', 'wb') as f:
    pickle.dump(X_num, f)

In [5]:
columns = list(df_train.col_idx)
values = list(df_train['values'])

In [6]:
del df_train
import gc
gc.collect()

53

In [None]:
def to_csr(cols, vals, shape=74000):
    lens = [len(c) for c in cols]
    intptr = np.zeros((len(cols) + 1), dtype='uint32')
    intptr[1:] = lens
    intptr = intptr.cumsum()

    columns = np.concatenate(cols).astype('uint32')
    values = np.concatenate(vals).astype('uint8')

    return sp.csr_matrix((values, columns, intptr), shape=(len(cols), shape))

In [None]:
sp.save_npz('matrix.npz', X, compressed=False)

In [14]:
!ls -lh

total 1.7G
-rw-rw-r-- 1 agrigorev agrigorev  11K Nov  7 06:20 01-eda.ipynb
-rw-rw-r-- 1 agrigorev agrigorev 2.9K Nov  3 21:00 criteo_dataset.py
drwxrwxr-x 5 agrigorev agrigorev 4.0K Nov  3 20:47 criteo_starter_kit
drwxrwxr-x 2 agrigorev agrigorev 4.0K Nov  3 20:39 data
-rw-rw-r-- 1 agrigorev agrigorev 1.7G Nov  6 21:06 matrix.npz
drwxrwxr-x 2 agrigorev agrigorev 4.0K Nov  3 21:00 __pycache__
drwxrwxr-x 2 agrigorev agrigorev 4.0K Nov  5 12:22 tmp
-rw-rw-r-- 1 agrigorev agrigorev 2.4K Nov  3 21:00 utils.py


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from time import time

In [18]:
#for C in [0.01, 0.1, 0.5, 1, 5]:
for C in [0.1]:
    t0 = time()

    lr = LogisticRegression(penalty='l1', C=C, random_state=1)
    lr.fit(X_train, y_train)

    y_pred = lr.decision_function(X_val)
    auc = roc_auc_score(y_val, y_pred)

    print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc))

C=0.1, took 4454.416s, auc=0.734


In [19]:
with open('model_c01.bin', 'wb') as f:
    pickle.dump(lr, f)

In [11]:
import pickle

In [12]:
with open('model_c01.bin', 'rb') as f:
    lr = pickle.load(f)

In [13]:
(lr.coef_[0] == 0).mean()

0.92592142075173334

In [22]:
X.shape

(14175476, 73989)

In [23]:
!zcat data/criteo_test_release.txt.gz | head -n 10

896678244 |f 0:320 1:50 2:1 12:1 14:1 21:1 49:1 51:1 106:1 143:1 192:1 229:1 249:1 280:1 292:1 295:1 297:1 764:1 873:1 2675:1 4180:1 4686:1 
896678244 |f 0:320 1:50 2:1 21:1 22:1 23:1 24:1 49:1 50:1 51:3 52:1 75:1 115:1 143:1 147:1 192:1 229:1 249:3 253:3 280:1 305:1 310:1 408:1 528:1 588:1 591:1 637:1 730:1 837:1 848:1 1753:1 2511:1 2568:1 2675:1 4180:1 4686:1 
896678244 |f 0:320 1:50 2:1 21:1 22:1 23:1 24:1 49:1 51:2 52:1 91:1 115:1 136:1 143:1 147:1 182:1 192:1 229:1 249:2 280:1 305:1 310:1 312:1 320:1 588:1 591:1 614:1 615:1 637:1 798:1 1572:1 2233:2 2675:1 4180:1 4686:1 
896678244 |f 0:320 1:50 2:1 21:1 22:1 23:1 24:1 49:1 50:1 51:3 52:1 81:1 115:1 143:1 147:1 176:1 192:1 229:1 249:3 253:3 280:1 303:1 305:1 310:1 528:1 572:1 588:1 591:1 637:1 772:1 1753:1 2511:1 2568:1 2675:1 4180:1 4686:1 
896678244 |f 0:320 1:50 2:1 21:1 22:1 23:1 24:1 49:1 51:3 143:1 165:1 192:1 229:1 249:3 255:1 280:1 303:1 304:1 305:1 310:1 408:1 588:1 589:1 637:1 1760:3 2675:1 4180:1 4686:1 
896678244 |

In [14]:
def read_test_data():
    f = gzip.open('data/criteo_test_release.txt.gz', 'r')

    for line in f:    
        line = line.decode()
        split = line.split(' |f ')
        if len(split) != 2:
            continue

        id = int(split[0].strip())
        features = split[1].lstrip('f ').strip()
        yield (id, parse_features(features))

In [25]:
!zcat data/criteo_test_release.txt.gz | wc -l

87022251


In [25]:
from itertools import groupby


In [26]:


def sigmoid_percent(x):
    s = 1 / (1 + np.exp(-x))
    return (s * 100).round(2)

In [27]:
def to_prediction_str(id, preds):
    res = ['%d:%0.2f' % (i, p) for (i, p) in enumerate(preds)]
    return '%d;%s' % (id, ','.join(res))

In [31]:
test_data = read_test_data()
test_groups = groupby(test_data, key=lambda x: x[0])

7087738it groups

In [32]:
f_out = open('pred.txt', 'w')

for gid, group in tqdm(test_groups):
    cols = []
    vals = []

    for _, (f1, f2, col, val) in group:
        cols.append(col)
        vals.append(val)

    X_test = to_csr(cols, vals)
    pred = lr.decision_function(X_test)
    pred = sigmoid_percent(pred)
    
    pred_str = to_prediction_str(gid, pred)
    
    f_out.write(pred_str)
    f_out.write('\n')

f_out.flush()
f_out.close()




In [2]:
import crowdai
challenge = crowdai.Challenge("CriteoAdPlacementNIPS2017", 'd671d30799fa215f63a5cb5049983c79')

[1m[36mCrowdAI.Event.Authentication[0m : Authenticating for challenge = [4m[1m[34mCriteoAdPlacementNIPS2017[0m
[1m[32mCrowdAI.Event.Connection.CONNECTED[0m[1m[32m[0m
[1m[32mCrowdAI.Event.Authentication.SUCCESS[0m : [1m[32mAuthentication Successful[0m


In [3]:
scores = challenge.submit('pred.txt.gz')

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Preparing for file upload[0m


  0% |          |[00:00<00:25]   3.85% /s] 

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Uploading file[0m


100% |██████████|[00:03<00:00]  30.28% /s] 
  0% |          |[00:00<?]  ?% /s] 

[1m[33mCrowdAI.Event.Job.ENQUEUED[0m : 54dbf1eb-2c0f-4138-87b3-594e7decffa7


                      0% |          |[00:00<?]  ?% /s]                     [1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m:   0% |          |[00:00<?]  ?% /s] 

[1m[34mCrowdAI.Event.Job.RUNNING[0m : 54dbf1eb-2c0f-4138-87b3-594e7decffa7
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) Beginning grading of the submission


[1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m: 100% |█████████▉|[1:27:09<00:00]  51.89s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) Scores Computed Successfully !!
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) ImpWt : 0.986030867203
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) SNIPS_std: 0.00066494377503
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) Uploading scores to the leaderboard....
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) SNIPS : 50.2777411261
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) ImpWt_std: 0.0114416243194
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) IPS_std: 1.65497059166
[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) IPS : 49.5754046836


                    [1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m: 100% |█████████▉|[1:27:10<00:00]  51.89s/% ] [1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m: 100% |██████████|[1:27:10<00:00]  273.23s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (54dbf1eb-2c0f-4138-87b3-594e7decffa7) Scores Submited Successfully !!! 
[1m[32mCrowdAI.Event.Job.COMPLETE[0m : [1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m	   🍺 


                    [1m[32m54dbf1eb-2c0f-4138-87b3-594e7decffa7[0m: 100% |██████████|[1:27:10<00:00]  273.23s/% ] 


In [4]:
scores

{'impwt': 0.986030867203039,
 'impwt_std': 0.011441624319414723,
 'ips': 49.575404683599835,
 'ips_std': 1.654970591660044,
 'max_instances': 7087738,
 'message': '',
 'snips': 50.27774112612186,
 'snips_std': 0.000664943775029717}

In [4]:
scores

{'impwt': 0.9775760846224011,
 'impwt_std': 0.021581243720401128,
 'ips': 45.21622789247131,
 'ips_std': 3.2114821679263055,
 'max_instances': 7087738,
 'message': '',
 'snips': 46.25341045442673,
 'snips_std': 0.0012869509224777156}