In [1]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle
import sys

sys.path.append("../code/python/utils")
from pool_iterator import pool_iterator
from metric import calculate_metric
from json_tools import get_from_pool
sys.path.append("../code/python/bandits")
from tomson_bandit import TomsonBandit
from cascade_bandit import CascadeBandit, get_nearest_queries
from damerau_levenshtien import damerau_levenshtein_distance

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
n_objects = 569105

In [3]:
queries_counter = pickle.load(open("queries_counter", "rb"))
queries_by_word = pickle.load(open("queries_by_word", "rb"))

In [26]:
queries_counter

Counter({(0, 1, 2, 3): 165,
         (4, 5, 6, 7): 1,
         (8, 9, 10): 1,
         (11, 12): 4,
         (13, 14, 15): 71,
         (16, 17, 18, 19, 20): 1,
         (21, 22, 23): 1,
         (24, 25, 26, 27): 1,
         (28, 29, 19, 30): 1,
         (31, 32, 14, 33): 1,
         (34, 35, 36, 37, 38, 39): 1,
         (40,): 1,
         (41, 42): 1,
         (43, 44): 14,
         (45, 46): 1,
         (47, 48, 49, 50, 51, 52, 53, 54, 55, 6, 56, 57): 1,
         (58, 59): 1,
         (60,): 1,
         (61, 62, 63, 48, 64, 65, 66, 67, 68, 48, 69, 70): 1,
         (71, 72): 2,
         (73,): 1,
         (74, 75, 76, 77, 48, 78, 79): 1,
         (80, 81, 82): 4,
         (83, 84, 85, 86, 87, 6, 88): 1,
         (89, 90): 4,
         (91, 92, 93, 94, 95): 1,
         (96, 97, 98, 99, 100, 101, 102): 1,
         (103, 104, 105): 10,
         (106, 107): 1,
         (108, 109, 110, 111): 1,
         (112, 113): 6,
         (114, 115, 6, 116, 33, 117): 1,
         (118, 119): 2,
       

In [4]:
%%time
position_variants = list(range(10)) + [100]
bandits_by_queries = {
    query: TomsonBandit(position_variants) for query in queries_counter
}
big_bandit = CascadeBandit(bandits_by_queries, queries_by_word)

CPU times: user 7.87 s, sys: 180 ms, total: 8.05 s
Wall time: 8.06 s


In [5]:
np_queries = np.array([query for query in queries_counter])

In [6]:
for i, query in enumerate(np_queries):
    if i % 10 == 0:
        print(" {}/{} - {} %".format(i, 459862, round(i * 100 / 459862)))
    for other_query in np_queries:
        continue

 0/459862 - 0 %
 10/459862 - 0 %
 20/459862 - 0 %
 30/459862 - 0 %
 40/459862 - 0 %
 50/459862 - 0 %
 60/459862 - 0 %
 70/459862 - 0 %
 80/459862 - 0 %
 90/459862 - 0 %
 100/459862 - 0 %
 110/459862 - 0 %
 120/459862 - 0 %
 130/459862 - 0 %
 140/459862 - 0 %
 150/459862 - 0 %
 160/459862 - 0 %
 170/459862 - 0 %
 180/459862 - 0 %
 190/459862 - 0 %
 200/459862 - 0 %
 210/459862 - 0 %
 220/459862 - 0 %
 230/459862 - 0 %
 240/459862 - 0 %
 250/459862 - 0 %
 260/459862 - 0 %
 270/459862 - 0 %
 280/459862 - 0 %
 290/459862 - 0 %
 300/459862 - 0 %
 310/459862 - 0 %
 320/459862 - 0 %
 330/459862 - 0 %
 340/459862 - 0 %
 350/459862 - 0 %
 360/459862 - 0 %
 370/459862 - 0 %
 380/459862 - 0 %
 390/459862 - 0 %
 400/459862 - 0 %


KeyboardInterrupt: 

In [7]:
np_queries

array([(0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10), ...,
       (466, 1051, 125, 50335, 19, 370751), (55164, 2262, 471, 7273),
       (324798, 10732, 304946, 304947)], dtype=object)

In [8]:
len(np_queries)

459862

In [23]:
with open("../code/python/bandits/queries.hpp", 'w') as handler:
    print("#include <vector>", file=handler)
    print("", file=handler)
    print("const std::vector<std::vector<int>> QUERIES = {", file=handler)
    for i, query in enumerate(np_queries):
        if i != len(np_queries) - 1:
            print("    {" + ", ".join(map(str, query)) + "},", file=handler)
        else:
            print("    {" + ", ".join(map(str, query)) + "}", file=handler)
    print("};", file=handler)

In [25]:
with open("../code/python/bandits/queries.txt", 'w') as handler:
    for i, query in enumerate(np_queries):
        print(str(len(query)) + " " + " ".join(map(str, query)), file=handler)
    print("};", file=handler)

In [27]:
def get_similarity(query_1, query_2):
    return len(set(query_1) & set(query_2)) / len(set(query_1) | set(query_2))

def get_nearests(query):
    similaries = np.array([
        get_similarity(query, other_query)
        for other_query in np_queries
    ])
    return np_queries[similaries >= 0.5], similaries[similaries >= 0.5]

In [32]:
words = sorted(list({
    word
    for query in np_queries
    for word in query
}))

In [37]:
words[-1]

370751

In [38]:
len(words)

370752

In [28]:
q = np_queries[0]
get_nearests(q)

(array([(0, 1, 2, 3), (0, 1, 2, 3, 1637, 463), (87, 0, 1, 2, 3, 183, 361),
        (0, 1, 542, 3, 542, 463), (14, 1079, 0, 1, 2, 3, 1637, 463),
        (0, 1, 90, 2), (0, 1, 2, 3, 4925), (2, 3, 1624, 463, 0, 1),
        (0, 1, 542, 3), (0, 1, 2, 3, 682, 463, 10895, 93934),
        (0, 1, 2, 3, 2262, 463, 10895, 93934), (0, 1, 2, 207667),
        (0, 1, 2, 3, 2262, 463, 220070), (16668, 0, 1, 2),
        (0, 1, 2, 3, 261767, 95733, 261768), (0, 1, 1637, 463, 2, 3),
        (0, 1, 3345, 359, 3), (0, 1, 2, 3, 2262, 463, 2270, 4829),
        (0, 1, 2, 3, 1637, 463, 11493), (14, 1079, 0, 1, 2),
        (0, 1, 2, 3, 472, 463), (0, 1, 2, 3, 2262, 463, 4748, 11493),
        (0, 1, 2, 3, 1637, 50557), (14, 0, 1, 2, 3, 340, 7559),
        (0, 1, 4925, 2, 3), (0, 338840, 2, 3),
        (0, 1, 2, 3, 8059, 4829, 2946), (1, 2, 3),
        (1079, 290525, 0, 1, 2, 3), (345758, 1, 2, 3), (14, 0, 1, 3, 359),
        (27838, 1, 2, 3), (351177, 0, 1, 2, 3)], dtype=object),
 array([1.        , 0.66666667, 

In [39]:
nearest_queries = {}

In [None]:
len(np_queries)

In [40]:
for i, query in enumerate(np_queries):
    if i % 10 == 0:
        print(" {}/{} - {} %".format(i, 459862, round(i * 100 / 459862)))
    nearest_queries[query] = get_nearests(query)

 0/459862 - 0 %
 10/459862 - 0 %


KeyboardInterrupt: 

In [None]:
pool_filenames = [
    "pool_with_queries/train_test_split/day_{}.json".format(i)
    for i in range(4)
]
train_pool_filenames = pool_filenames[:3]
test_pool = pool_filenames[3]

In [None]:
for pool_filename in train_pool_filenames:
    print(" fitting filename \"{}\"".format(pool_filename))
    for i, item in enumerate(pool_iterator(pool_filename)):
        if i % 5000 == 0:
            print(" {} %".format(round(400 * i / n_objects)))
        query = tuple(map(int, item["query"].split()))
        reward = 1 if item["target"] > 0 else 0
        big_bandit.take_reward(query, item["pos"], reward)

In [None]:
%%time
targets_test = get_from_pool(pool_iterator(test_pool), "target", int)
probas_test = get_from_pool(pool_iterator(test_pool), "p", float)
positions_test = get_from_pool(pool_iterator(test_pool), "pos", int)

In [None]:
all_positions = []
good_positions = []
target_and_statistic = []
for i, item in enumerate(pool_iterator(test_pool)):
    query = tuple(map(int, item["query"].split()))
    statistics = big_bandit.get_statistic_size(query)
    position = big_bandit.get_action(query)
    print(' positions: {} {}'.format(position, item["pos"]))
    all_positions.append(position)
    if position == item["pos"]:
        print(' target: {} static: {}'.format(item["target"], statistics))
        target_and_statistic.append((item["target"], statistics))

In [None]:
test_probas = []
test_targets = []
test_target_positions = []
test_statistics = []
for i, item in enumerate(pool_iterator(test_pool)):
    print('', i)
    if i > 1297:
        break
    query = tuple(map(int, item["query"].split()))
    statistics = big_bandit.get_statistic_size(query)
    test_probas.append(item["p"])
    test_target_positions.append(item["pos"])
    test_targets.append(item["target"])
    test_statistics.append(statistics)

In [None]:
len(all_positions), len(test_probas), len(test_targets), len(test_target_positions), len(test_statistics)

In [None]:
answer = np.array(all_positions)
test_target_positions = np.array(positions_test)[:len(answer)]
test_targets = np.array(targets_test)[:len(answer)]
test_probas = np.array(probas_test)[:len(answer)]
test_statistics = np.array(test_statistics)[:len(answer)]

In [None]:
pickle.dump(all_positions, open("all_positions", "wb"))

In [None]:
print('', calculate_metric(
    answer,
    test_target_positions,
    test_targets,
    test_probas
))

In [None]:
len(answer[test_statistics < 100000])

In [None]:
answer = np.array(all_positions)
answer[test_statistics < 1] = 2

In [None]:
print('', calculate_metric(
    answer,
    np.array(test_target_positions),
    np.array(test_targets),
    np.array(test_probas)
))

In [None]:
len(answer[answer == test_target_positions])

In [None]:
for i in list(range(10)) + [100]:
    print('', i, calculate_metric(
        np.array([i] * len(test_probas)),
        np.array(test_target_positions),
        np.array(test_targets),
        np.array(test_probas)
    ))

In [None]:
len(all_positions), len(good_positions)

In [None]:
targets = np.array(target_and_statistic)[:,0]
statistics = np.array(target_and_statistic)[:,1]

In [None]:
plt.scatter(targets, statistics)

In [None]:
np.mean(targets[statistics > 0])

In [None]:
np.mean(targets)

In [None]:
len(targets)

In [None]:
def get_positions(bandit, pool, verbose=False):
    positions = []
    for i, item in enumerate(pool_iterator(pool)):
        if verbose:
            if i % 5000 == 0:
                print(" {} %".format(round(400 * i / n_objects)))
        query = tuple(map(int, item["query"].split()))
        positions.append(bandit.get_action(query))
    return np.array(positions)

In [None]:
answer = get_positions(big_bandit, test_pool, verbose=True)
metric = calculate_metric(
    answer, positions_test, targets_test, probas_test
)

In [None]:
print('', metric, Counter(answer))