In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
# Peek at the first rows of the tab-separated training table.
pd.read_csv('train.tsv', sep='\t').head()

Unnamed: 0,docid,rel,quid,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45
0,1,0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0
1,2,1,10,0.03131,0.666667,0.5,0.166667,0.033206,0.0,0.0,...,0.64689,0.686107,0.823908,0.750092,0.385426,0.923077,0.086207,0.333333,0.448276,0.0
2,3,1,10,0.078682,0.166667,0.5,0.333333,0.080022,0.0,0.0,...,0.649824,0.578581,0.868557,0.641385,0.010462,0.076923,0.074713,0.833333,0.678161,0.0
3,4,1,10,0.019058,1.0,1.0,0.5,0.022591,0.0,0.0,...,0.918403,0.868457,1.0,0.86346,0.016642,0.153846,0.04023,0.833333,0.896552,0.0
4,5,0,10,0.039477,0.0,0.75,0.166667,0.040555,0.0,0.0,...,0.565875,0.56944,0.769845,0.646567,0.073711,0.076923,0.034483,0.333333,0.218391,0.0


In [3]:
def read_batches(filename):
    """Read a whitespace-separated table and group its rows by the third column.

    The first line is treated as a header and skipped. Every remaining
    non-blank line is parsed as floats: field 1 is the label, field 2 is the
    grouping key and fields 3 onward are the feature vector (field 0 is
    ignored). Per the notebook's preview of ``train.tsv`` these appear to be
    ``rel``, ``quid`` and ``feature_*`` respectively -- confirm for other files.

    Args:
        filename: Path of the file to read.

    Returns:
        A pair ``(X, Y)`` of equally long lists with one entry per group, in
        order of first appearance: ``X[i]`` is a 2-D array of feature rows and
        ``Y[i]`` the matching 1-D array of labels.

    Raises:
        ValueError: If a field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than three fields.
    """
    x_by_q = defaultdict(list)
    y_by_q = defaultdict(list)
    with open(filename, 'r') as handler:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(handler, None)
        for line in handler:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. trailing newlines at EOF) carry no data.
                continue
            values = [float(field) for field in fields]
            q = values[2]
            y_by_q[q].append(values[1])
            x_by_q[q].append(values[3:])
    X = [np.array(rows) for rows in x_by_q.values()]
    Y = [np.array(y_by_q[q]) for q in x_by_q]
    return X, Y

def train_test_split(X, Y, test_size=0.2):
    """Randomly partition paired sequences into train and test parts.

    The split is drawn from NumPy's global random state, so seed it
    beforehand if a reproducible split is needed.

    Args:
        X: Sequence of per-group feature arrays.
        Y: Sequence of per-group label arrays, aligned with ``X``.
        test_size: Fraction of the groups placed in the test part.

    Returns:
        A tuple ``(train_X, train_Y, test_X, test_Y)`` of lists; entries of
        ``X`` and ``Y`` that were paired stay paired.
    """
    assert len(X) == len(Y)
    permutation = np.random.permutation(len(X))
    split = int(len(permutation) * (1 - test_size))
    train_indices = permutation[:split]
    test_indices = permutation[split:]

    def take(items, indices):
        # Index the list directly: wrapping groups of different sizes in
        # np.array() raises ValueError on recent NumPy (ragged input).
        return [items[i] for i in indices]

    return (
        take(X, train_indices),
        take(Y, train_indices),
        take(X, test_indices),
        take(Y, test_indices),
    )

def generate_data(X, Y):
    """Endlessly yield ``(X[i], Y[i])`` pairs in one fixed random order.

    A single permutation of the indices is drawn (from NumPy's global random
    state) when the generator is first advanced; the same order is then
    repeated on every pass over the data.
    """
    order = np.random.permutation(len(X))
    cursor = 0
    while True:
        chosen = order[cursor]
        yield X[chosen], Y[chosen]
        # Wrap around to the start once the last item has been emitted.
        cursor = (cursor + 1) % len(X)

In [4]:
""" Reference from https://gist.github.com/bwhite/3726239
"""

import numpy as np

def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain (0.0 when no results are considered)
    Raises:
        ValueError: If method is neither 0 nor 1 and at least one result
            is considered.
    """
    # np.asfarray was removed in NumPy 2.0; this is the equivalent conversion.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            # The first two positions are undiscounted (log2(2) == 1).
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            # Position i (1-based) is discounted by log2(i + 1).
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Normalized discounted cumulative gain (ndcg) of a ranked list.

    The DCG of ``r`` is divided by the DCG of the same scores sorted in
    descending order, i.e. the best ordering of the given items.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain; 0.0 when the ideal DCG is zero
    """
    ideal_order = sorted(r, reverse=True)
    best_possible = dcg_at_k(ideal_order, k, method)
    if best_possible:
        return dcg_at_k(r, k, method) / best_possible
    # A zero ideal DCG would make the ratio undefined.
    return 0.

def get_ranked_relevances(model, X, Y):
    """Order each group's true relevances by the model's predicted score.

    Args:
        model: Object exposing ``predict(x)`` that returns one score per row.
        X: Sequence of per-group feature batches.
        Y: Sequence of per-group relevance labels, aligned with ``X``.

    Returns:
        One list per group holding that group's labels sorted from the
        highest to the lowest predicted score (ties keep their input order).
    """
    results = []
    for features, relevances in zip(X, Y):
        scores = model.predict(features)
        assert len(scores) == len(relevances)
        # Negating the score turns the ascending stable sort into a
        # descending one.
        by_score = sorted(zip(scores, relevances), key=lambda item: -item[0])
        results.append([relevance for _, relevance in by_score])
    return results

In [6]:
# Load the per-group feature/label arrays from the training file.
X, Y = read_batches('train.tsv')
# Hold out a share of the groups (train_test_split's default test_size).
# NOTE(review): the split uses NumPy's global RNG and no seed is set in the
# visible cells, so the held-out groups differ between runs -- consider seeding.
train_X, train_Y, test_X, test_Y = train_test_split(X, Y)
# Width of the network's input layer.
# NOTE(review): presumably the number of feature columns in train.tsv -- confirm.
features_num = 46

In [7]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Activation, Lambda
from keras.optimizers import Adam
import keras.backend as K


def list_net_loss(y_true, y_pred):
    """Cross-entropy between two softmax-style distributions over one batch.

    The labels are exponentiated and normalised over the batch to form the
    target distribution; the predictions are doubled, exponentiated and
    normalised the same way to form the predicted distribution.

    input:
        y_true: tensor with shape [bs, 1]
        y_pred: tensor with shape [bs, 1]
    """
    labels = K.squeeze(y_true, axis=1)  # shape: [bs]
    scores = K.squeeze(y_pred, axis=1)  # shape: [bs]

    label_weights = K.exp(labels)  # shape: [bs]
    score_weights = K.exp(2 * scores)  # shape: [bs]

    target_dist = label_weights / K.sum(label_weights)  # shape: [bs]
    predicted_dist = score_weights / K.sum(score_weights)  # shape: [bs]

    cross_entropy_terms = target_dist * K.log(predicted_dist)
    return -K.sum(cross_entropy_terms)

# Scoring network: three 64-unit tanh layers followed by a single tanh output,
# so every input row is mapped to one score in (-1, 1).
x = Input(shape=(features_num,))

ranking_nn = Sequential([
    Dense(64),
    Activation('tanh'),
    Dense(64),
    Activation('tanh'),
    Dense(64),
    Activation('tanh'),
    Dense(1),
    Activation('tanh'),
])

rank = ranking_nn(x)

model = Model(inputs=x, outputs=rank)
optimizer = Adam(lr=5e-6)
model.compile(loss=list_net_loss, optimizer=optimizer)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


In [8]:
# Baseline: mean NDCG@5 over the held-out groups before any training.
ranked_relevances = get_ranked_relevances(model, test_X, test_Y)
per_query_ndcg = [ndcg_at_k(r, k=5) for r in ranked_relevances]
ndcg = np.array(per_query_ndcg).mean()
print(ndcg)

0.21239309095448444


In [9]:
# Each generator step yields one (features, labels) pair, so an epoch of
# len(train_X) steps draws every training group once.
train_batches = generate_data(train_X, train_Y)
validation_batches = generate_data(test_X, test_Y)
model.fit_generator(
    train_batches,
    steps_per_epoch=len(train_X),
    epochs=100,
    validation_data=validation_batches,
    validation_steps=len(test_X),
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x104e3d358>

In [10]:
# Rank the held-out groups once and reuse that ranking for both cut-offs:
# the predictions do not depend on k, so a second model pass is unnecessary.
ranked_relevances = get_ranked_relevances(model, test_X, test_Y)
for k in (5, 10):
    # Mean NDCG@k over the held-out groups.
    ndcg = np.array([ndcg_at_k(r, k=k) for r in ranked_relevances]).mean()
    print(ndcg)

0.4503403843902221
0.4839925868421581


In [11]:
# Continue training the same model for another 100 epochs; each step yields
# one (features, labels) pair from a freshly shuffled generator.
train_batches = generate_data(train_X, train_Y)
validation_batches = generate_data(test_X, test_Y)
model.fit_generator(
    train_batches,
    steps_per_epoch=len(train_X),
    epochs=100,
    validation_data=validation_batches,
    validation_steps=len(test_X),
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x12b850f98>

In [12]:
# Rank the held-out groups once and reuse that ranking for both cut-offs:
# the predictions do not depend on k, so a second model pass is unnecessary.
ranked_relevances = get_ranked_relevances(model, test_X, test_Y)
for k in (5, 10):
    # Mean NDCG@k over the held-out groups.
    ndcg = np.array([ndcg_at_k(r, k=k) for r in ranked_relevances]).mean()
    print(ndcg)

0.45060973896374334
0.4803527221402283


In [13]:
# Continue training the same model for another 100 epochs; each step yields
# one (features, labels) pair from a freshly shuffled generator.
train_batches = generate_data(train_X, train_Y)
validation_batches = generate_data(test_X, test_Y)
model.fit_generator(
    train_batches,
    steps_per_epoch=len(train_X),
    epochs=100,
    validation_data=validation_batches,
    validation_steps=len(test_X),
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x11e4a3198>

In [14]:
# Rank the held-out groups once and reuse that ranking for both cut-offs:
# the predictions do not depend on k, so a second model pass is unnecessary.
ranked_relevances = get_ranked_relevances(model, test_X, test_Y)
for k in (5, 10):
    # Mean NDCG@k over the held-out groups.
    ndcg = np.array([ndcg_at_k(r, k=k) for r in ranked_relevances]).mean()
    print(ndcg)

0.45781311836107896
0.48482830677696565


In [30]:
# Score the test file and export its (docid, quid) pairs ordered by quid and
# predicted score, both descending.
test = pd.read_csv('test.tsv', sep='\t')
# Everything except the first two columns is fed to the model.
# NOTE(review): presumably those two columns are the docid/quid identifiers
# and the remainder are the feature columns -- confirm.
feature_columns = test.columns.delete((0, 1))
res = test[['docid', 'quid', 'feature_0']].copy()
# The network ends in Dense(1), so predict() returns an (n_rows, 1) array;
# flatten it so a plain 1-D score vector is stored. The 'feature_0' column is
# reused as the slot for the predicted score.
res['feature_0'] = np.ravel(model.predict(test[feature_columns]))
csv = res.sort_values(['quid', 'feature_0'], ascending=False)[['docid', 'quid']]
csv.to_csv('rank_net_tanh.csv', index=False)