In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook as tqdm

In [2]:
import competition_utils as u

In [3]:
# Feature matrices are stored on disk as SciPy sparse .npz archives.
X_train = sp.load_npz('tmp/X_train_sparse.npz')
X_val = sp.load_npz('tmp/X_val_sparse.npz')

# Labels are converted to float32 as soon as they are read.
y_train = np.load('tmp/y_train.npy').astype(np.float32)
y_val = np.load('tmp/y_val.npy').astype(np.float32)

In [4]:
# Array loaded from tmp/prospensity_val.npy (the "prospensity" spelling is
# kept because it is part of the file name). It is not referenced again in
# the visible cells.
prospensity_val = np.load('tmp/prospensity_val.npy')

In [5]:
import ftrl

In [6]:
# Model trained and evaluated in the cells below.
# NOTE(review): alpha/beta/l1/l2 are presumably the FTRL-Proximal learning-rate
# and regularisation hyper-parameters -- confirm against the `ftrl` package.
model = ftrl.FtrlProximal(alpha=0.1, beta=1, l1=75, l2=25)

In [7]:
# Call fit() 20 times on the training split and print the validation ROC AUC
# after every call.
# NOTE(review): the rising AUC in the recorded output suggests each fit() call
# continues from the previous state -- confirm against the `ftrl` package.
for epoch in tqdm(range(20)):
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_val, model.predict(X_val))
    print(auc)

0.724557835545
0.728503792556
0.730382334198
0.731554873506
0.732187807822
0.73266944371
0.733034817245
0.733250238956
0.733580563871
0.733726686471
0.733892882517
0.734008730496
0.734051071205
0.734059912074
0.734242511593
0.73419372952
0.734230433598
0.734277064593
0.734289764739
0.734390659298



# Build a DataFrame from the rows of the first `limit` groups of
# data/train_3.txt.
it_val = u.read_grouped('data/train_3.txt')

limit = 500000

# Rows are accumulated in a plain list and converted to a DataFrame once.
rows_val = []

# `cnt` is the 1-based number of groups consumed so far. Stopping at
# `cnt >= limit` collects exactly `limit` groups; the previous `cnt > limit`
# check collected one group too many. The group key is named `group_id` so the
# builtin `id` is no longer shadowed.
for cnt, (group_id, group) in enumerate(tqdm(it_val, total=limit), start=1):
    rows_val.extend(group)

    if cnt >= limit:
        break

df_full_val = pd.DataFrame(rows_val)

In [8]:
import pickle

In [9]:
# Restore `df_full_val` from a pickle on disk.
# NOTE(review): presumably this is the cached result of the read_grouped cell
# above -- confirm. pickle.load can execute arbitrary code embedded in the
# file, so only load files you produced yourself.
with open('./tmp/df_full_val_500k.bin', 'rb') as f:
    df_full_val = pickle.load(f)

In [11]:
# Turn the per-row `idx` / `val` columns into plain Python lists and let the
# project helper assemble the matrix that is passed to model.predict below.
# NOTE(review): presumably each `idx` / `val` entry holds the feature indices
# and values of one row -- verify against competition_utils.to_csr.
cols = list(df_full_val.idx)
vals = list(df_full_val.val)

X_val_full = u.to_csr(cols, vals)

In [12]:
%%time

# Score X_val_full and store the result as a new 'predict' column
# (this mutates df_full_val in place).
df_full_val['predict'] = model.predict(X_val_full)

CPU times: user 9.25 s, sys: 40 ms, total: 9.29 s
Wall time: 1.18 s


In [10]:
from numba import njit

In [13]:
# Rows with a non-null propensity are treated as the first row of a group.
prop_series = df_full_val.propensity.dropna()
num_groups = len(prop_series)

# CSR-style offsets: group i spans rows group_indptr[i]:group_indptr[i + 1].
# NOTE(review): the index labels are used as row positions below, which
# assumes df_full_val has a default 0..n-1 index -- confirm.
group_indptr = np.zeros(num_groups + 1, dtype='uint32')
group_indptr[:num_groups] = prop_series.index.values
group_indptr[-1] = len(df_full_val)

groups = df_full_val.id.values[group_indptr[:-1]]

prospensities = prop_series.values.astype('float32')
clicks = df_full_val.label.dropna().values

# `clicks` and `prospensities` come from two independent dropna() calls but
# are indexed in lock-step (one entry per group) by the njit-compiled
# compute_IPS. Numba does not bounds-check array indexing by default, so a
# mismatch would silently read wrong or out-of-range memory. Fail loudly here.
assert len(clicks) == num_groups, (
    "label and propensity columns mark a different number of groups"
)
# Offsets must be strictly increasing so that no group is empty.
assert np.all(np.diff(group_indptr.astype('int64')) > 0), (
    "group offsets are not strictly increasing"
)

In [14]:
# Raw model scores as a NumPy array, one entry per row of df_full_val.
model_predictions = df_full_val['predict'].values

In [19]:
@njit
def compute_IPS(model_predictions, group_indptr, prospensities, clicks):
    """Compute the scaled IPS score of `model_predictions` over all groups.

    Parameters
    ----------
    model_predictions : 1-D array of per-row scores.
    group_indptr : offsets; group g covers rows
        ``group_indptr[g]:group_indptr[g + 1]``.
    prospensities : one value per group. It is *multiplied* into the weight,
        so it presumably holds inverse propensities -- confirm against the
        data format.
    clicks : one label per group; a label equal to 0 counts as a negative,
        anything else as a positive.

    Returns
    -------
    ``10000 * sum(weights of positive groups) / (num_pos + 10 * num_neg)``,
    where a group's weight is its `prospensities` entry times the softmax
    probability that the scores assign to the group's first row.
    """
    weighted_sum = 0.0

    n_clicked = 0
    n_not_clicked = 0

    for g in range(len(clicks)):
        if clicks[g] == 0:
            # Negative groups only contribute to the denominator.
            n_not_clicked += 1
        else:
            n_clicked += 1

            lo = group_indptr[g]
            hi = group_indptr[g + 1]
            group_scores = model_predictions[lo:hi]

            # Softmax over the group, shifted by the max score so that the
            # exponentials cannot overflow.
            exp_scores = np.exp(group_scores - group_scores.max())
            first_row_prob = exp_scores[0] / exp_scores.sum()

            weighted_sum += prospensities[g] * first_row_prob

    return 10000 * weighted_sum / (n_clicked + 10 * n_not_clicked)

In [61]:
@njit
def argmax_increase(model_predictions, group_indptr, inc):
    """Return a copy of the scores with each group's top score raised by `inc`.

    Group g covers rows ``group_indptr[g]:group_indptr[g + 1]``. The arg-max
    is taken on the original scores; the input array is not modified.
    """
    boosted = model_predictions.copy()

    for g in range(len(group_indptr) - 1):
        lo = group_indptr[g]
        hi = group_indptr[g + 1]

        # Position of the group's best-scored row within the full array.
        top = lo + np.argmax(model_predictions[lo:hi])
        boosted[top] += inc

    return boosted

In [34]:
%%time
# IPS score of the raw model outputs (no sigmoid rescaling, no arg-max boost).
compute_IPS(model_predictions, group_indptr, prospensities, clicks)

CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 32.5 ms


44.32622507690207

In [79]:
# NOTE(review): `pred_inc` is only assigned further down, in the cell that
# calls argmax_increase. On a fresh Restart & Run All this cell raises
# NameError, and the recorded output came from an earlier kernel state that
# the notebook no longer reproduces.
compute_IPS(pred_inc, group_indptr, prospensities, clicks)

100.01527644958387

In [82]:
def shifted_scaled_sigmoid(x, shift=1, scale=1000000):
    """Apply ``scale * sigmoid(x - shift)`` and round to two decimals.

    `x` is expected to be a NumPy array (or NumPy scalar), since the result's
    ``round`` method is used.
    """
    logits = shift - x
    squashed = 1 / (1 + np.exp(logits))
    return np.round(squashed * scale, 2)

In [91]:
# Hand-picked sigmoid parameters.
shift = 1.1875
scale = 850100
pred_scaled = shifted_scaled_sigmoid(model_predictions, shift=shift, scale=scale)

# Raise the best-scored row of every group by 1 on the rescaled predictions.
pred_inc = argmax_increase(pred_scaled, group_indptr, 1)

# IPS of the rescaled and boosted predictions.
compute_IPS(pred_inc, group_indptr, prospensities, clicks)

58.99182104226634

In [89]:
# Import from the public namespace: `scipy.optimize.optimize` is a private
# module path that SciPy has deprecated in favour of `scipy.optimize`.
from scipy.optimize import fmin

In [95]:
def f_optimize(x):
    """Objective for the optimiser: negative IPS for sigmoid parameters `x`.

    `x[0]` is used as the shift and `x[1]` as the scale of
    shifted_scaled_sigmoid. The score is negated so that a minimiser
    maximises IPS. Reads the notebook globals `model_predictions`,
    `group_indptr`, `prospensities` and `clicks`.
    """
    shift_candidate, scale_candidate = x[0], x[1]
    rescaled = shifted_scaled_sigmoid(
        model_predictions, shift=shift_candidate, scale=scale_candidate
    )
    return -compute_IPS(rescaled, group_indptr, prospensities, clicks)

In [104]:
# Starting point (shift, scale) for the optimiser; these are the defaults of
# shifted_scaled_sigmoid.
x0 = np.array([1, 1000000])
# Alternative starting point tried earlier: np.array([0, 100])

In [105]:
# Minimise f_optimize (i.e. maximise IPS) over (shift, scale), starting at x0.
# The returned array is this cell's displayed output.
fmin(f_optimize, x0)

Optimization terminated successfully.
         Current function value: -50.664431
         Iterations: 74
         Function evaluations: 179


array([  8.60489009e-01,   9.27951029e+04])

In [106]:
# Re-run the optimisation and unpack its result directly instead of reading
# IPython's `_` (last output). `_` only holds the optimum if this cell is
# executed immediately after the fmin cell, which breaks on re-runs and on
# out-of-order execution. `disp=False` suppresses the duplicate convergence
# report.
shift, scale = fmin(f_optimize, x0, disp=False)
shift, scale

(0.86048900889449698, 92795.102890663606)

In [107]:
# Overwrite the optimiser's result with the same hand-picked values that were
# used earlier for the arg-max-boosted predictions.
shift = 1.1875
scale = 850100


In [108]:
# IPS of the rescaled predictions using the current global shift / scale
# (no arg-max boost applied here).
pred = shifted_scaled_sigmoid(model_predictions, shift=shift, scale=scale)
ips = compute_IPS(pred, group_indptr, prospensities, clicks)
ips

50.825171940625154

In [156]:
# `groupby` is not imported in any visible cell; the call signature and the
# `for gid, group in ...` consumption match itertools.groupby, so import it
# here so this cell does not depend on leftover kernel state.
from itertools import groupby

# NOTE(review): `read_train` is not defined or imported in any visible cell
# either -- presumably it comes from competition_utils; confirm and import it
# explicitly.
it_test = read_train('data/criteo_test_release.txt.gz')

# Lazily group consecutive test rows that share the same id.
test_groups = groupby(it_test, key=lambda x: x.id)

In [157]:
# Write one prediction line per test group to pred_ftrl.txt.
#
# NOTE(review): `model_full`, `to_csr`, `sigmoid_percent` and
# `to_prediction_str` are not defined in any visible cell -- presumably they
# come from deleted cells or competition_utils; confirm and import or define
# them explicitly.
#
# The file is managed by `with`, so it is flushed and closed even if the loop
# fails part-way through.
with open('pred_ftrl.txt', 'w') as f_out:
    # total = number of test groups, used only for the progress bar; it
    # matches `max_instances` in the grader's score report.
    for gid, group in tqdm(test_groups, total=7087738):
        # Materialise the group once so both columns can be extracted.
        rows = list(group)
        cols = [row.idx for row in rows]
        vals = [row.val for row in rows]

        # Use a dedicated name: the previous code assigned to `X_val`, which
        # overwrote the validation matrix loaded at the top of the notebook.
        X_group = to_csr(cols, vals)
        pred = sigmoid_percent(model_full.predict(X_group))

        f_out.write(to_prediction_str(gid, pred))
        f_out.write('\n')




In [158]:
# -f overwrites an existing pred_ftrl.txt.gz. Without it, gzip refuses to
# replace the archive on a re-run and the submit cell would upload stale
# predictions.
!gzip -f pred_ftrl.txt

In [152]:
import os

import crowdai

# The API key is read from the CROWDAI_API_KEY environment variable so that it
# never ends up in the saved notebook. A key that was previously embedded in
# this cell should be treated as leaked and rotated.
challenge = crowdai.Challenge("CriteoAdPlacementNIPS2017", os.environ['CROWDAI_API_KEY'])

[1m[36mCrowdAI.Event.Authentication[0m : Authenticating for challenge = [4m[1m[34mCriteoAdPlacementNIPS2017[0m
[1m[32mCrowdAI.Event.Connection.CONNECTED[0m[1m[32m[0m
[1m[32mCrowdAI.Event.Authentication.SUCCESS[0m : [1m[32mAuthentication Successful[0m


In [159]:
# Upload the gzipped predictions for grading; the return value is kept in
# `scores`.
scores = challenge.submit('pred_ftrl.txt.gz')

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Preparing for file upload[0m


  0% |          |[00:00<00:37]   2.64% /s] 

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Uploading file[0m


100% |██████████|[00:05<00:00]  18.83% /s] 
  0% |          |[00:00<?]  ?% /s] 

[1m[33mCrowdAI.Event.Job.ENQUEUED[0m : f9ad9084-6bba-4db6-a043-20e7f32bbd9e


                      0% |          |[00:00<?]  ?% /s]                     [1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m:   0% |          |[00:00<?]  ?% /s] 

[1m[34mCrowdAI.Event.Job.RUNNING[0m : f9ad9084-6bba-4db6-a043-20e7f32bbd9e
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) Beginning grading of the submission


[1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m: 100% |█████████▉|[1:27:31<00:00]  52.09s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) Scores Computed Successfully !!
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) ImpWt_std: 0.010838412606
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) SNIPS_std: 0.000738664556485
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) Uploading scores to the leaderboard....
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) SNIPS : 51.0486727528
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) ImpWt : 0.984911096015
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) IPS_std: 1.86442068073
[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) IPS : 50.278404231


                    [1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m: 100% |█████████▉|[1:27:32<00:00]  52.09s/% ] [1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m: 100% |██████████|[1:27:32<00:00]  224.92s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (f9ad9084-6bba-4db6-a043-20e7f32bbd9e) Scores Submited Successfully !!! 
[1m[32mCrowdAI.Event.Job.COMPLETE[0m : [1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m	   🍺 


                    [1m[32mf9ad9084-6bba-4db6-a043-20e7f32bbd9e[0m: 100% |██████████|[1:27:32<00:00]  224.92s/% ] 


In [154]:
# NOTE(review): the recorded output of this cell (ips ~46.12) matches the log
# of job ea739313, not the f9ad9084 submission logged directly above
# (ips ~50.28). The execution counts are out of order, so this display is
# stale.
scores

{'impwt': 0.983055672490548,
 'impwt_std': 0.015840261443422165,
 'ips': 46.11893916159966,
 'ips_std': 1.885006865357641,
 'max_instances': 7087738,
 'message': '',
 'snips': 46.91386302136728,
 'snips_std': 0.0007714217927742332}

In [153]:
# NOTE(review): duplicate of the submit cell above, with a lower execution
# count (it ran earlier). Consider keeping only one submit cell.
scores = challenge.submit('pred_ftrl.txt.gz')

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Preparing for file upload[0m


  0% |          |[00:00<00:25]   3.96% /s] 

[1m[34mCrowdAI.Event.Misc.FILE_UPLOAD : Uploading file[0m


100% |██████████|[00:05<00:00]  19.78% /s] 
  0% |          |[00:00<?]  ?% /s] 

[1m[33mCrowdAI.Event.Job.ENQUEUED[0m : ea739313-4bd4-411d-8639-c0da25882f43


                      0% |          |[00:00<?]  ?% /s]                     [1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m:   0% |          |[00:00<?]  ?% /s] 

[1m[34mCrowdAI.Event.Job.RUNNING[0m : ea739313-4bd4-411d-8639-c0da25882f43
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) Beginning grading of the submission


[1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m: 100% |█████████▉|[1:26:21<00:00]  52.52s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) Scores Computed Successfully !!
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) ImpWt_std: 0.0158402614434
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) SNIPS_std: 0.000771421792774
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) Uploading scores to the leaderboard....
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) SNIPS : 46.9138630214
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) ImpWt : 0.983055672491
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) IPS_std: 1.88500686536
[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) IPS : 46.1189391616


                    [1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m: 100% |█████████▉|[1:26:22<00:00]  52.52s/% ] [1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m: 100% |██████████|[1:26:22<00:00]  226.24s/% ] 

[1m[36mCrowdAI.Event.Job.INFO[0m : (ea739313-4bd4-411d-8639-c0da25882f43) Scores Submited Successfully !!! 
[1m[32mCrowdAI.Event.Job.COMPLETE[0m : [1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m	   🍺 


                    [1m[32mea739313-4bd4-411d-8639-c0da25882f43[0m: 100% |██████████|[1:26:22<00:00]  226.24s/% ] 


In [154]:
# NOTE(review): duplicate of the earlier `scores` display cell (same execution
# count and same recorded output).
scores

{'impwt': 0.983055672490548,
 'impwt_std': 0.015840261443422165,
 'ips': 46.11893916159966,
 'ips_std': 1.885006865357641,
 'max_instances': 7087738,
 'message': '',
 'snips': 46.91386302136728,
 'snips_std': 0.0007714217927742332}