In [1]:
import numpy as np
import pandas as pd
import pickle

from collections import Counter
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import LogisticRegression

## Подготовка данных

In [2]:
players = pickle.load(open('./data/players.pkl', 'rb'))
results = pickle.load(open('./data/results.pkl', 'rb'))
tournaments = pickle.load(open('./data/tournaments.pkl', 'rb'))

Сделаем датафрейм с игроками

In [3]:
players = pd.DataFrame(players).T
players.head(5)

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова
12,12,Артур,Евгеньевич,Абальян
13,13,Эрик,Евгеньевич,Абальян


Отфильтруем результаты турниров, у которых нет поля mask

In [4]:
len(results)

5528

In [5]:
results = {k:v for k, v in results.items() if all(el.get('mask', None) is not None for el in v)}
len(results)

3890

Отфильтруем "пустые" турниры

In [10]:
for tournament_id in [el['id'] for el in tournaments.values()]:
    if tournament_id in results:
        if len(results[tournament_id]) == 0:
            del results[tournament_id]

In [12]:
len(results)

3639

Построим датафрейм с турнирами

In [13]:
tournaments = pd.DataFrame(tournaments).T
tournaments.head(5)

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
1,1,Чемпионат Южного Кавказа,2003-07-25T00:00:00+04:00,2003-07-27T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/1,[],,
2,2,Летние зори,2003-08-09T00:00:00+04:00,2003-08-09T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/1,[],,
3,3,Турнир в Ижевске,2003-11-22T00:00:00+03:00,2003-11-24T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,
4,4,Чемпионат Украины. Переходной этап,2003-10-11T00:00:00+04:00,2003-10-12T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,
5,5,Бостонское чаепитие,2003-10-10T00:00:00+04:00,2003-10-13T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,


Оставим только те, для которых есть результаты

In [14]:
tournaments = tournaments[tournaments.id.isin(results.keys())]
tournaments.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
22,22,Чемпионат России,2004-02-21T00:00:00+03:00,2004-02-22T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,"{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, ..."
76,76,Чемпионат России,2005-02-26T00:00:00+03:00,2005-02-27T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/3,[],,"{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, ..."
141,141,Чемпионат России,2006-02-26T00:00:00+03:00,2006-02-26T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/4,[],,"{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, ..."
226,226,Чемпионат России,2007-02-24T00:00:00+03:00,2007-02-25T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/6,[],,"{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, ..."
315,315,Чемпионат России,2008-03-09T00:00:00+03:00,2008-03-10T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/7,[],,"{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, ..."


И разобъем на train и test в зависимости от года

In [15]:
train_tournaments = tournaments[tournaments.dateStart.apply(lambda s: s[:4] == '2019')]
train_tournaments.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
4772,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4973,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4974,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4975,4975,Балтийский Берег. 5 игра,2019-04-05T19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4986,4986,ОВСЧ. 6 этап,2019-02-15T20:00:00+03:00,2019-02-19T20:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59140, 'name': 'Борис', 'patronymic': ...",{'dateRequestsAllowedTo': '2019-02-19T23:59:59...,"{'1': 12, '2': 12, '3': 12}"


In [16]:
test_tournaments = tournaments[tournaments.dateStart.apply(lambda s: s[:4] == '2020')]
test_tournaments.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
4957,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,2020-02-27T23:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 2421, 'name': 'Ася', 'patronymic': 'Се...",{'dateRequestsAllowedTo': '2020-02-27T18:00:00...,"{'1': 13, '2': 13, '3': 13}"
5414,5414,Синхрон северных стран,2020-01-03T19:00:00+03:00,2020-01-10T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2020-01-10T23:59:00...,"{'1': 12, '2': 12, '3': 12}"
5705,5705,Школьный Синхрон-lite. Выпуск 3.5,2020-01-01T00:05:00+03:00,2020-02-03T23:55:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 23740, 'name': 'Владимир', 'patronymic...",{'dateRequestsAllowedTo': '2020-02-02T23:59:00...,"{'1': 12, '2': 12, '3': 12}"
5706,5706,(а)Синхрон-lite. Лига старта. Эпизод XI,2020-01-01T00:05:00+03:00,2020-02-03T23:55:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 23740, 'name': 'Владимир', 'patronymic...",{'dateRequestsAllowedTo': '2020-02-02T23:59:00...,"{'1': 12, '2': 12, '3': 12}"
5707,5707,Школьный Синхрон-lite. Выпуск 3.6,2020-02-01T00:05:00+03:00,2020-03-09T23:55:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 23740, 'name': 'Владимир', 'patronymic...",{'dateRequestsAllowedTo': '2020-03-06T23:59:00...,"{'1': 12, '2': 12, '3': 12}"


In [19]:
print(
    'Number of train tournaments is',
    train_tournaments.shape[0],
)

print(
    'Number of test tournaments is',
    test_tournaments.shape[0],
)

Number of train tournaments is 671
Number of test tournaments is 169


## Baseline model

In [20]:
def count_correct_answers(string):
    return len([ch for ch in list(string) if ch not in ['X', '?', '0']])

def count_stats_by_player(results_dict):
    correct_answers_by_player = Counter()
    questions_by_player = Counter()
    tournaments_by_player = Counter()
    
    for key, value in results_dict.items():
        for team_info in value:
            n_correct_answers = count_correct_answers(team_info['mask'])
            n_questions = len(team_info['mask'])
            
            for players in team_info['teamMembers']:
                player_id = players['player']['id']
                
                correct_answers_by_player[player_id] += n_correct_answers
                questions_by_player[player_id] += n_questions
                tournaments_by_player[player_id] += 1
    
    return correct_answers_by_player, questions_by_player, tournaments_by_player

# for debug
correct_answers_by_player, questions_by_player, tournaments_by_player = count_stats_by_player(results)

Каждой паре (игрок, вопрос) поставим в соответствие следующий набор признаков:

1. Общая сила игрока (т.е. отношение числа отвеченных вопросов к числу заданных)
2. Сила игрока в конкретном турнире (отношение числа отвеченных вопросов к числу заданных, но в одном турнире)
3. Бинарная переменная - принял ли игрок участие больше чем в 10 турнирах.
4. Сложность вопроса (т.е. отношение числа команд, ответивших на вопрос, к общему числу команд)

Посчитаем общую силу игрока: отношение числа отвеченных вопросов к числу заданных.

In [21]:
player_strength = {}

for player_id in tournaments_by_player:
    player_strength[player_id] = correct_answers_by_player[player_id] / questions_by_player[player_id]

Посчитаем сложность вопроса

In [22]:
def count_question_complexity(results_dict):
    result = {}
    
    for key, value in results_dict.items():
        mask_len = len(value[0]['mask'])
        matrix = np.zeros((len(value), mask_len))
        
        for i in range(len(value)):
            try:
                matrix[i] = np.array(
                    list(
                        map(
                            int,
                            list(value[i]['mask'].rjust(mask_len, '0').replace('X', '0').replace('?', '0'))
                        )
                    )
                )
            except ValueError:
                # this means that some team has different length of mask
                # we just skip these tournaments
                print(key, 'has bad mask shapes')
                break
        
        complexitites = 1 - matrix.sum(axis=0) / matrix.shape[0]
        
        for idx, c in enumerate(complexitites):
            result[(key, idx)] = c
    
    return result

Посчитаем силу игрока в конкретном турнире:

In [23]:
def count_player_strength_in_tournament(results_dict):
    result = {}
    
    for key, value in results_dict.items():
        for team_info in value:
            n_correct_answers = team_info['questionsTotal']
            for player in team_info['teamMembers']:
                player_id = player['player']['id']
                result[(key, player_id)] = n_correct_answers / len(team_info['mask'])

    return result

Наконец, для каждого игрока выясним, участвовал ли он более чем в 10 турнирах

In [24]:
train_results ={}

for tournament_id in train_tournaments['id'].values:
    train_results[tournament_id] = results[tournament_id]
    
    
correct_answers_by_player, questions_by_player, tournaments_by_player = count_stats_by_player(train_results)

player_strength = {}
for player_id in tournaments_by_player:
    player_strength[player_id] = correct_answers_by_player[player_id] / questions_by_player[player_id]
    
question_complexity = count_question_complexity(train_results)
player_strength_in_tournament = count_player_strength_in_tournament(train_results)

6026 has bad mask shapes


In [25]:
data = []
for key, value in player_strength_in_tournament.items():
    tournament_id, player_id = key
    
    players_mask = [ team_info['mask']
                     for team_info in train_results[tournament_id]
                     if len([0 for p in team_info['teamMembers'] if p['player']['id'] == player_id])][0]
    
    for question_id in range(len(train_results[tournament_id][0]['mask'])):
        
        
        data.append(
            [
                player_id,
                tournament_id,
                player_strength[player_id],
                player_strength_in_tournament.get((tournament_id, player_id), 0),
                question_complexity[(tournament_id, question_id)],
                int(tournaments_by_player[player_id] > 10),
                int(players_mask[question_id] == '1') if question_id < len(players_mask) else 0,
            ]
        )

In [26]:
df = pd.DataFrame(data, columns=[
    'player_id',
    'tournament_id',
    'player_strength',
    'player_strength_in_tournament',
    'question_complexity',
    'has_played_more_than_10_times',
    'has_answered',
])

df

Unnamed: 0,player_id,tournament_id,player_strength,player_strength_in_tournament,question_complexity,has_played_more_than_10_times,has_answered
0,6212,4772,0.706069,0.777778,0.116883,1,1
1,6212,4772,0.706069,0.777778,0.220779,1,1
2,6212,4772,0.706069,0.777778,0.554113,1,1
3,6212,4772,0.706069,0.777778,0.480519,1,1
4,6212,4772,0.706069,0.777778,0.121212,1,1
...,...,...,...,...,...,...,...
25476145,217156,6255,0.000000,0.000000,0.672115,0,0
25476146,217156,6255,0.000000,0.000000,0.578009,0,0
25476147,217156,6255,0.000000,0.000000,0.566617,0,0
25476148,217156,6255,0.000000,0.000000,0.802377,0,0


Обучим на этих данных логистическую регрессию

In [27]:
lr = LogisticRegression()

lr.fit(df.drop(['player_id', 'tournament_id', 'has_answered'], axis=1), df['has_answered'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
predicted_probs = lr.predict_proba(df.drop(['player_id', 'tournament_id', 'has_answered'], axis=1))
predicted_probs = pd.DataFrame(predicted_probs)
predicted_probs.columns = ['1 - predicted_proba', 'predicted_proba']
df = df.merge(predicted_probs, left_index=True, right_index=True)
df

Unnamed: 0,player_id,tournament_id,player_strength,player_strength_in_tournament,question_complexity,has_played_more_than_10_times,has_answered,1 - predicted_proba,predicted_proba
0,6212,4772,0.706069,0.777778,0.116883,1,1,0.042003,0.957997
1,6212,4772,0.706069,0.777778,0.220779,1,1,0.063080,0.936920
2,6212,4772,0.706069,0.777778,0.554113,1,1,0.210456,0.789544
3,6212,4772,0.706069,0.777778,0.480519,1,1,0.164383,0.835617
4,6212,4772,0.706069,0.777778,0.121212,1,1,0.042728,0.957272
...,...,...,...,...,...,...,...,...,...
25476145,217156,6255,0.000000,0.000000,0.672115,0,0,0.951981,0.048019
25476146,217156,6255,0.000000,0.000000,0.578009,0,0,0.930763,0.069237
25476147,217156,6255,0.000000,0.000000,0.566617,0,0,0.927670,0.072330
25476148,217156,6255,0.000000,0.000000,0.802377,0,0,0.971382,0.028618


Усредним полученные значения вероятностей ответить на вопрос для каждого игрока.

Для каждого игрока получили некое число от 0 до 1 - значение, претендующее на роль его силы.

In [31]:
player_by_strength = df.groupby('player_id').mean()[['predicted_proba']]
player_by_strength

Unnamed: 0_level_0,predicted_proba
player_id,Unnamed: 1_level_1
15,0.306525
16,0.438206
23,0.516329
31,0.380367
35,0.482934
...,...
224404,0.296049
224408,0.109350
224482,0.124741
224539,0.223061


Силой команды будем считать среднюю силу ее игроков. Отранжируем команды по силе на тестовом множестве и посмотрим корреляцию с ground truth

In [32]:
spearman_corr_list = []
kendall_corr_list = []

for tournament_id in test_tournaments['id'].values:
    team_strengths = []
    for team_info in results[tournament_id]:
        team_id = team_info['team']['id']
        
        try:
            team_strengths.append(np.mean(
                [player_by_strength.loc[player['player']['id']] for player in team_info['teamMembers']]
            ))
        except KeyError:
            # there is no player in train data
            pass

    team_order = np.array(team_strengths).argsort()[::-1]
    ranking = np.zeros(len(team_strengths))
    ranking[team_order] = np.arange(len(team_strengths))
    
    spearman_corr_list.append(spearmanr(range(len(team_strengths)), ranking)[0])
    kendall_corr_list.append(kendalltau(range(len(team_strengths)), ranking)[0])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [33]:
print(
    'Spearman correlation',
    np.mean([el for el in spearman_corr_list if el == el]), # nan filtering
)

print(
    'Kendall correlation',
    np.mean([el for el in kendall_corr_list if el == el]), # nan filtering
)
# np.mean([el for el in kendall_corr_list if el == el])

Spearman correlation 0.6727223912818839
Kendall correlation 0.510767046437555


## EM-algorithm