In [138]:
import gc
import math
import pickle
import warnings
from dataclasses import dataclass
from typing import Any, List

import numpy as np
from scipy.sparse import coo_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import kendalltau, spearmanr

warnings.filterwarnings('ignore')

In [45]:
@dataclass
class CommandResult:
    players: List[int]
    mask: str
    position: int


@dataclass
class TourData:
    tour_id: int
    num_questions: int
    results: List[CommandResult]

In [None]:
def get_id_by_years(y, t_dict):
    ids = []
    for id_val in t_dict:
        if t_dict[id_val]['dateStart'].startswith(y):
            ids.append(id_val)
    return ids


def get_player_ids(index, result_df):
    player_idx = []
    for idx in index:
        results = result_df[idx]
        for result in results:
            for player in result['teamMembers']:
                player_idx.append(player['player']['id'])
    return player_idx


def get_data(indexes, result_data, tournament):
    data = []
    for idx in indexes:
        result_query = []
        for result in result_data[idx]:
            pl_id = [player['player']['id']
                     for player in result['teamMembers']]
            if 'mask' in result:
                mask = result['mask']
                result_query.append(CommandResult(pl_id, mask, result['position']))
        total_cuestion = sum(tournament[idx]['questionQty'].values())
        data.append(TourData(idx, total_cuestion, result_query))
    return data


def get_train_data_tour(tour_data: TourData, total_question, quest_offset):
    X = [list() for _ in range(2)]
    y = []
    for result in tour_data.results:
        if result.mask is None:
            continue
        questions = [int(s) for s in result.mask if s in ('0', '1')]
        num_question = len(questions)
        if num_question != total_question:
            continue
        for player in result.players:
            X_player = [player] * num_question
            question_id = [quest_offset + i for i in range(num_question)]
            X[0].extend(X_player)
            X[1].extend(question_id)
            y.extend(questions)
    X = np.array(X).T
    y = np.array(y)
    return X, y


def get_test_data_comm(result: CommandResult, quest_id, train_players):
    X = [list() for _ in range(2)]
    y = []

    if result.mask:
        questions = [int(s) for s in result.mask if s in ('0', '1')]
        num_question = len(questions)
        for player in result.players:
            if player in train_players:
                X_player = [player] * num_question
                type_quest = [quest_id] * num_question
                X[0].extend(X_player)
                X[1].extend(type_quest)
                y.extend(questions)
    X = np.array(X).T
    y = np.array(y)
    return X, y


def get_train_data(train_data: List[TourData]):
    X, y = [], []
    acc_n_questions = 0
    for t in train_data:
        n_questions = t.num_questions
        X_new, y_new = get_train_data_tour(t, n_questions, acc_n_questions)
        X.append(X_new)
        y.extend(y_new)
        acc_n_questions += n_questions
    X = np.vstack(X)
    y = np.array(y)
    return X, y, acc_n_questions

## Read data 

Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах (поле mask в results.pkl).
Для унификации предлагаю:
* взять в тренировочный набор турниры с dateStart из 2019 года; 
* в тестовый — турниры с dateStart из 2020 года.


In [None]:
with open("chgk/players.pkl", "rb") as p, \
        open("chgk/results.pkl", "rb") as r, \
        open("chgk/tournaments.pkl", "rb") as t:
    players_dict = pickle.load(p)
    result_df = pickle.load(r)
    tournament = pickle.load(t)

train_id = get_id_by_years('2019', tournament)
test_id = get_id_by_years('2020', tournament)

train_players = get_player_ids(train_id, result_df)
test_players = get_player_ids(test_id, result_df)

s1 = set(train_players)
s2 = set(test_players)

train_data = get_data(train_id, result_df, tournament)
test_data = get_data(test_id, result_df, tournament)

with open("chgk/train_data.pkl", "wb") as p:
    pickle.dump(train_data, p)
with open("chgk/test_data.pkl", "wb") as p:
    pickle.dump(test_data, p)
with open("chgk/train_id.pkl", "wb") as p:
    pickle.dump(train_id, p)
with open("chgk/test_id.pkl", "wb") as p:
    pickle.dump(test_id, p)
with open("chgk/train_players.pkl", "wb") as p:
    pickle.dump(train_players, p)
with open("chgk/test_players.pkl", "wb") as p:
    pickle.dump(test_players, p)

In [46]:
with open("chgk/train_data.pkl", "rb") as p:
    train_data = pickle.load(p)
with open("chgk/test_data.pkl", "rb") as p:
    test_data = pickle.load(p)
with open("chgk/train_id.pkl", "rb") as p:
    train_id = pickle.load(p)
with open("chgk/test_id.pkl", "rb") as p:
    test_id = pickle.load(p)
with open("chgk/train_players.pkl", "rb") as p:
    train_players = pickle.load(p)
with open("chgk/test_players.pkl", "rb") as p:
    test_players = pickle.load(p)

## Train 

Постройте baseline-модель на основе линейной или логистической регрессии, которая будет обучать рейтинг-лист игроков. Замечания и подсказки:
* повопросные результаты — это фактически результаты броска монетки, и их предсказание скорее всего имеет отношение к бинарной классификации;
* в разных турнирах вопросы совсем разного уровня сложности, поэтому модель должна это учитывать; скорее всего, модель должна будет явно обучать не только силу каждого игрока, но и сложность каждого вопроса;
* для baseline-модели можно забыть о командах и считать, что повопросные результаты команды просто относятся к каждому из её игроков.


In [47]:
train_set_players = set(train_players)

В качестве признаков возьмем "силу" игрока и сложность вопроса.

In [49]:
X, y, tot_questions = get_train_data(train_data)

In [50]:
train_players = np.array(train_players).reshape((-1,1))

encoder_player = OneHotEncoder()
encoder_type = OneHotEncoder()

encoder_player.fit(train_players)
encoder_type.fit(np.arange(tot_questions).reshape(-1,1))

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

Приведем матрицу признаков к разреженному виду

In [51]:
X_oh0 = encoder_player.transform(X[:,0].reshape(-1,1))
X_oh1 = encoder_type.transform(X[:,1].reshape(-1,1))
X=hstack((X_oh0,X_oh1))

In [142]:
cls = LogisticRegression(solver='saga',random_state=123, n_jobs=-1)

In [143]:
cls.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

Получим веса "силы" игрока и сложности вопроса.

In [144]:
coef = cls.coef_[0]

players_power, quest_power = coef[:-tot_questions], coef[-tot_questions:]

## Test 

Вероятность того, что $i$ игрок ответит на $j$ вопрост будем вычислять по формуле $p_{ij}=\sigma(s_i+q_j)$, где $s_i$ - сила игрока, а $q_j$ - сложность $j$ вопроса. 

Так как сложности новых вопросов неизвестны, то в качестве вероятности ответа на вопрос будем брать матожидание по всем вопросам для каждого игрока $p_{i}=\mathbb{E}p_{ij}=\int \sigma(s_i+q)p(q)dq \approx \sigma(\frac{s_i+mu_i}{\sqrt{1+\pi\sigma(q)^2/8}})$ [см. например](http://eelxpeng.github.io/blog/2017/03/10/Tricks-of-Sigmoid-Function)

In [146]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [147]:
def get_prob_test(tour_data: TourData,
                 enc_player,
                 train_set_players,
                 players_power,
                 sigma_quest,
                 mu_quest):
    prob_list = []
    pos_list = []
    for result in tour_data.results:
        players = [p for p in result.players if p in train_set_players]
        players = np.array(players).reshape(-1, 1)
        if len(players)==0:
            continue
        p_power = encoder_player.transform(players)@players_power
        sigmoid_args = (p_power + mu_quest)/math.sqrt((1+sigma_quest*math.pi/8))
        prob = sigmoid(sigmoid_args)
        prob = 1-prob
        prob = 1-np.product(prob)
        prob_list.append(prob)
        pos_list.append(result.position)
    return np.array(prob_list), np.array(pos_list)

In [148]:
sigma_quest = quest_power.var()
mu_quest = quest_power.mean()

In [74]:
%pdb

Automatic pdb calling has been turned OFF


Для каждой игры вычислим коэффициенты ранговых корреляций и усредним по всем играм.

In [151]:
kendal_list = []
spearman_list = []
for t in test_data:
    if len(t.results) > 0:
        prob, pos = get_prob_test(t,
                                  encoder_player,
                                  train_set_players,
                                  players_power,
                                  sigma_quest,
                                  mu_quest
                                  )
        kendal_list.append(kendalltau(prob,pos[::-1]).correlation)
        spearman_list.append(spearmanr(prob,pos[::-1]).correlation)

In [152]:
kendal_list = list(filter(lambda x: not np.isnan(x), kendal_list))
spearman_list = list(filter(lambda x: not np.isnan(x), spearman_list))

In [153]:
np.mean(kendal_list)

0.58979246482652

In [154]:
np.mean(spearman_list)

0.7487482302565207