In [1]:
import xgboost as xgb
from json import loads as json_from_string
import numpy as np
from collections import Counter

In [2]:
# Load the pool: every non-blank line of the file is parsed as one JSON object.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and whitespace-only lines are skipped instead of crashing the parser.
with open('../data/pool.json/data', encoding='utf-8') as handler:
    data = [json_from_string(line) for line in handler if line.strip()]

Имеем:
    - Все хедеры (заголовки) одинаковые.
    - В данных нас интересуют:
        - factors (какие-то фичи; у одних записей их 1052, у других — 1053. Леша сказал убрать 1053-ю, поэтому ниже берём только первые 1052.)
        - images_metric (позиция, которую видел пользователь, win, lose; None, если пользователь этого не видел)
        - p (вероятность, с которой рисовался такой SERP)
        - uid и query (можно сделать из них факторы или покластеризовать)

In [3]:
# The data could be split into records the user actually saw and records
# the user did not see, but we do not do that for now.
#displayed_data = [line for line in data if line['images_metric'] is not None]
#undisplayed_data = [line for line in data if line['images_metric'] is None]

In [4]:
# Keep only the first 1052 factors of every record (the notebook notes say
# records carry either 1052 or 1053 of them).
features = np.array([record['factors'][:1052] for record in data])

# images_metric[0] is used as the position; records whose images_metric is
# None get the sentinel position 100.
positions = np.array([
    100 if record['images_metric'] is None else record['images_metric'][0]
    for record in data
])

# The `p` value stored with each record.
probas = np.array([record['p'] for record in data])

# Target: images_metric[2] - images_metric[1], or 0 when images_metric is None.
# NOTE(review): the notebook notes list images_metric as (position, win, lose),
# which would make this "lose - win" -- confirm the sign is intended.
labels = np.array([
    0 if record['images_metric'] is None
    else record['images_metric'][2] - record['images_metric'][1]
    for record in data
])

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [6]:
def make_features(main_features, positions):
    """Append `positions` to `main_features` as one extra trailing column.

    `positions` is reshaped into a single column of shape (-1, 1) and joined
    to `main_features` along axis 1; the combined array is returned.
    """
    position_column = np.reshape(positions, (-1, 1))
    return np.concatenate((main_features, position_column), axis=1)

In [7]:
def get_positions(features_to_answer, model, position_variants=None):
    """Pick, for every feature row, the candidate position with the highest predicted score.

    Each row is repeated once per candidate position, the candidate position is
    appended as the last column via `make_features`, and `model.predict` scores
    all candidates; the best-scoring candidate wins.

    Parameters:
        features_to_answer: iterable of 1-D feature rows (without the position column).
        model: object exposing `predict(matrix)` that returns one score per matrix row.
        position_variants: optional sequence of candidate positions; defaults to
            positions 0..9 plus the sentinel 100.

    Returns:
        A list with one chosen position per input row.
    """
    if position_variants is None:
        position_variants = list(range(10)) + [100]
    position_variants = np.asarray(position_variants)

    answers = []
    for features in features_to_answer:
        candidates = np.repeat(np.reshape(features, (1, -1)), len(position_variants), axis=0)
        scores = model.predict(make_features(candidates, position_variants))
        # np.argmax yields an index into position_variants; map it back to the
        # actual position so the sentinel is reported as 100 rather than 10.
        answers.append(position_variants[np.argmax(scores)])
    return answers

In [8]:
def metric(answers_positions, target_positions, target, target_probs):
    """Mean of `target / target_probs` over the samples, counting only matching positions.

    A sample contributes `target / target_probs` when its answered position
    equals its target position and contributes 0 otherwise; the mean is taken
    over all samples.

    Parameters:
        answers_positions: positions chosen for each sample.
        target_positions: positions recorded for each sample.
        target: per-sample target values.
        target_probs: per-sample probabilities used as divisors.

    Returns:
        The mean as a numpy floating-point scalar.
    """
    # Coerce everything to arrays so that plain Python sequences are compared
    # and divided element-wise instead of raising or collapsing to one bool.
    answers_positions = np.asarray(answers_positions)
    target_positions = np.asarray(target_positions)
    target = np.asarray(target)
    target_probs = np.asarray(target_probs)

    matches = answers_positions == target_positions
    return np.mean(target / target_probs * matches)

In [9]:
def validate_count(answers_positions, target_positions, target, target_probs):
    """Mean of `1 / target_probs` over the samples, counting only matching positions.

    A sample contributes `1 / target_probs` when its answered position equals
    its target position and contributes 0 otherwise; the mean is taken over
    all samples.

    Parameters:
        answers_positions: positions chosen for each sample.
        target_positions: positions recorded for each sample.
        target: unused; kept so the signature matches `metric`.
        target_probs: per-sample probabilities used as divisors.

    Returns:
        The mean as a numpy floating-point scalar.
    """
    # Coerce to arrays so plain Python sequences are handled element-wise.
    answers_positions = np.asarray(answers_positions)
    target_positions = np.asarray(target_positions)
    target_probs = np.asarray(target_probs)

    matches = answers_positions == target_positions
    return np.mean(1 / target_probs * matches)

In [10]:
def get_metric(features, positions, labels, probas, model, validate=False,
               test_size=0.3, random_state=None):
    """Fit `model` on a random train split and return `metric` on the held-out split.

    The four arrays are split together with `train_test_split`. The model is
    fitted in place on the training features with the position appended as the
    last column. `get_positions` then chooses a position for every test row,
    and those choices are scored with `metric`.

    Parameters:
        features: feature matrix without the position column.
        positions: position of each sample.
        labels: regression target of each sample.
        probas: probability of each sample, used as the divisor inside `metric`.
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is mutated by fitting.
        validate: when True, also print `validate_count` and `metric` for the
            constant answers 0..9 on the test split.
        test_size: share of samples held out for testing (0.3, as before).
        random_state: seed forwarded to `train_test_split`; None (the default)
            keeps the previous unseeded behaviour, an int makes the split
            reproducible.

    Returns:
        The value of `metric` for the model's answers on the test split.
    """
    (features_train, features_test,
     positions_train, positions_test,
     labels_train, labels_test,
     proba_train, proba_test) = train_test_split(
        features, positions, labels, probas,
        test_size=test_size, shuffle=True, random_state=random_state,
    )
    model.fit(make_features(features_train, positions_train), labels_train)
    answers = get_positions(features_test, model)

    if validate:
        # Baselines: always answer the same position j for every test sample.
        for j in range(10):
            constant_answers = [j] * len(features_test)
            print("j={}, validate: {}, metric: {}".format(
                j,
                validate_count(constant_answers, positions_test, labels_test, proba_test),
                metric(constant_answers, positions_test, labels_test, proba_test)
            ))

    return metric(answers, positions_test, labels_test, proba_test)

In [14]:
# Repeat the split / fit / evaluate cycle 100 times, each time with a fresh
# XGBRegressor, and keep the metric of every run.
scores = [
    get_metric(
        features, positions, labels, probas,
        model=xgb.XGBRegressor(),
        validate=False,
    )
    for _ in range(100)
]

In [15]:
# Mean of the per-run metric values collected in `scores`.
np.mean(scores)

0.006136082010828766

In [16]:
# Standard deviation of the per-run metric values in `scores`.
np.std(scores)

0.007959242025337562

In [17]:
# Echo the raw per-run metric values for inspection.
scores

[0.014788083681822951,
 0.016201567248922841,
 0.010480125104949995,
 0.0063166802251438852,
 -0.0051390002697873161,
 0.011303946291590771,
 0.0078216150447935444,
 0.010850569192056535,
 0.0038672451055651764,
 0.016943855040312816,
 0.017611838089727085,
 -0.0013489784772204351,
 0.0052992189545277568,
 0.0066088557971241413,
 0.0034857193746751663,
 -0.016277735358743283,
 0.017297976483561423,
 0.0022729726953382122,
 0.016678460982519346,
 0.0099319319851349872,
 0.012590878075039292,
 0.001636284188548359,
 0.010731544257155671,
 0.0048954365237883586,
 0.007343137506012779,
 0.0081156005298615944,
 0.0015229696079521234,
 0.0103981457472231,
 0.016207596414874764,
 0.0059163678864391119,
 0.0091979704238708687,
 0.0089262219550145638,
 0.0068767871661590627,
 0.014395460901123572,
 0.014396706363226772,
 -0.010283303289474091,
 0.01579767922440448,
 -0.00084297384834941297,
 0.0069330084824158564,
 0.0018560211695860494,
 0.0029605061081419042,
 0.012392635731751271,
 -0.010561