# Santa Competitions

A look back at the Xmas Optimisation challenges!

In [1]:
import html
import os, sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import HTML, Image, display
import seaborn as sns

# Data directory: start from ../input and descend into the 'meta-kaggle'
# sub-folder only when that folder actually exists.
INPUT = os.path.join('..', 'input')
if os.path.isdir(os.path.join(INPUT, 'meta-kaggle')):
    INPUT = os.path.join(INPUT, 'meta-kaggle')


def read_csv_filtered(csv, col, values, **kwargs):
    """Read a CSV in chunks, keeping only rows whose `col` is in `values`.

    Parameters
    ----------
    csv : path or file-like object handed to `pd.read_csv`.
    col : name of the column to filter on.
    values : collection of accepted values (anything `Series.isin` takes).
    **kwargs : forwarded unchanged to `pd.read_csv`.

    Returns
    -------
    DataFrame made of the matching rows of every chunk, in file order.
    """
    reader = pd.read_csv(csv, chunksize=100000, low_memory=False, **kwargs)
    kept = []
    for chunk in reader:
        # Filter each chunk as it arrives so the full file is never held.
        kept.append(chunk.loc[chunk[col].isin(values)])
    return pd.concat(kept, axis=0)

In [2]:
# Let long tables show up to 200 rows before pandas truncates them.
pd.set_option('display.max_rows', 200)

In [3]:
# Notebook-wide matplotlib defaults: figure size, font size and a faint
# dashed grid.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 14,
    'grid.color': '0.5',
    'grid.linestyle': '--',
    'grid.linewidth': 0.2,
})

In [4]:
# Load the competitions table, indexed by its 'Id' column.
comps = pd.read_csv(f'{INPUT}/Competitions.csv', index_col='Id')

In [5]:
# Names of every competition column whose label contains 'Date'.
dates = comps.filter(like='Date').columns

In [6]:
# Parse all of the date-like columns into datetimes in one pass.
comps[dates] = comps[dates].apply(pd.to_datetime)

In [7]:
# Santa competitions: titles containing the whole word "Santa".
# na=False makes a missing Title count as "no match"; without it the mask
# would contain NaN and the boolean indexing would raise.
sc = comps[comps.Title.str.contains(r'\bSanta\b', na=False)]


def link_formatter(r):
    """Return an HTML anchor linking a competition row to its Kaggle page.

    `r` is one row of the competitions table (any mapping exposing the
    fields used below, with `Slug` and `Title` also available as
    attributes). The link text is the competition title and the tooltip
    (`title` attribute) summarises the row's key settings.

    All interpolated text is HTML-escaped, so a quote, `<` or `&` in a
    subtitle or title cannot break out of the attribute or the element.
    """
    url = f'https://www.kaggle.com/c/{r.Slug}'
    title = ('{Subtitle}\n\n'
             'DeadlineDate: {DeadlineDate}\n'
             'MaxTeamSize: {MaxTeamSize:.0f}\n'
             'RewardType: {RewardType}\n'
             'RewardQuantity: {RewardQuantity:.0f}\n'
             'NumPrizes: {NumPrizes}\n'
             'RankingPointsMultiplier: {UserRankMultiplier}\n'
             'CanQualifyTiers: {CanQualifyTiers}\n'
             'EvaluationAlgorithmName: {EvaluationAlgorithmName}').format(**r)
    # html.escape defaults to quote=True, which is what attribute values need.
    return (f'<a href="{html.escape(url)}" title="{html.escape(title)}">'
            f'{html.escape(str(r.Title))}</a>')


# Replace each title with its HTML link, then show the headline columns.
tmp = sc.assign(Title=sc.apply(link_formatter, axis=1))

cols = [
    'EnabledDate', 'TotalTeams', 'HostName', 'TotalCompetitors',
    'TotalSubmissions'
]
# Styler.set_na_rep was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(..., na_rep=...) blanks missing values the same way.
tmp.set_index('Title')[cols].style.format(None, na_rep='')

In [8]:
# Launch date of each Santa competition against its day of the year.
ax = plt.gca()
ax.scatter(sc.EnabledDate, sc.EnabledDate.dt.dayofyear)
ax.grid(True)
ax.set_ylabel('day of year')
ax.set_title('Santa Launch Dates');

In [9]:
# Competition length in (fractional) days, from launch to deadline.
duration = (sc.DeadlineDate - sc.EnabledDate) / pd.Timedelta(days=1)
ax = plt.gca()
ax.scatter(sc.EnabledDate, duration)
ax.grid(True)
ax.set_title('Santa Competition Durations (days)');

In [10]:
# Yearly totals of competitors and teams, grouped by launch year and drawn
# on one shared set of axes.
gb = sc.groupby(sc.EnabledDate.dt.year)
ax = gb.TotalCompetitors.sum().plot(legend=True)
gb.TotalTeams.sum().plot(ax=ax, title='Total Santa Teams', legend=True)
ax.grid(True);

In [11]:
# Yearly submission totals and reward totals on the same axes.
ax = gb.TotalSubmissions.sum().plot(legend=True)
gb.RewardQuantity.sum().plot(
    ax=ax, title='Total Santa Submissions vs Reward', legend=True)
ax.grid(True);

# Teams

In [12]:
# Teams whose CompetitionId is one of the Santa competitions.
teams = read_csv_filtered(
    csv=f'{INPUT}/Teams.csv',
    col='CompetitionId',
    values=sc.index,
    index_col=0,
)
teams.shape

In [13]:
# Membership rows belonging to the Santa teams loaded above.
team_members = read_csv_filtered(
    csv=f'{INPUT}/TeamMemberships.csv',
    col='TeamId',
    values=teams.index,
    index_col=0,
)
team_members.shape

In [14]:
# Submissions made by the Santa teams.
subs = read_csv_filtered(
    csv=f'{INPUT}/Submissions.csv',
    col='TeamId',
    values=teams.index,
    index_col=0,
)
subs.shape

In [15]:
# Users appearing in any Santa team; 'Id' is filtered on first and only
# afterwards promoted to the index.
users = read_csv_filtered(
    csv=f'{INPUT}/Users.csv',
    col='Id',
    values=team_members.UserId,
).set_index('Id')
users.shape

In [16]:
# Enrich each membership row with its team, user and competition columns.
team_members = (
    team_members
    .join(teams, on='TeamId')
    .join(users, on='UserId')
    .join(comps, on='CompetitionId')
)

In [17]:
# Teams with a recorded medal (rows with a missing Medal fail the comparison).
medal_teams = teams.loc[teams['Medal'] >= 0]
medal_teams.shape

In [18]:
# Membership rows restricted to the medal-winning teams.
members_of_medal_teams = team_members.loc[
    team_members['TeamId'].isin(medal_teams.index)]
members_of_medal_teams.shape

In [19]:
# Medal glyphs in the order 🥇, 🥈, 🥉; the array form allows fancy indexing.
medal_chars_list = list('🥇🥈🥉')
medal_chars = np.array(medal_chars_list)

In [20]:
# Swap the numeric Medal value for its glyph: value k selects medal_chars[k - 1].
members_of_medal_teams = members_of_medal_teams.assign(
    Medal=lambda m: medal_chars[m.Medal.sub(1).map(int)])

In [21]:
# One row per user, one column per medal glyph, plus a row total; then
# attach the user details and drop rows with any missing value.
medal_grid = (
    members_of_medal_teams
    .groupby(['UserId', 'Medal'])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
    .assign(Total=lambda grid: grid.sum(axis=1))
    .join(users, on='UserId')
    .dropna()
)

In [22]:
# Tier name and display colour, listed pairwise in index order and then
# split into two parallel arrays.
tier_names, tier_colors = (
    np.asarray(column) for column in zip(
        ('novice', "#2ECB99"),
        ('contributor', "#00BFF9"),
        ('expert', "#9A5289"),
        ('master', "#FF6337"),
        ('grandmaster', "#DFA848"),
        ('staff', "#000000"),
    )
)
# HTML snippet rendering each tier name in its own colour.
tier_html = np.asarray([
    '<font color={}>{}</font>'.format(colour, name)
    for colour, name in zip(tier_colors, tier_names)
])
bar_color = '#20beff'

def user_name_link(r):
    """Return an HTML anchor linking a user row to their Kaggle profile.

    `r` is a mapping (dict or DataFrame row) with `UserName`,
    `DisplayName` and `RegisterDate`. The display name is the link text;
    the tooltip shows the user name and registration date.

    Every field is HTML-escaped because these values are chosen by users,
    and the attributes are separated by a space (the original emitted
    `href="..."title="..."`, which is malformed HTML).
    """
    user_name = html.escape(str(r['UserName']))
    display_name = html.escape(str(r['DisplayName']))
    registered = html.escape(str(r['RegisterDate']))
    return (f'<a href="https://www.kaggle.com/{user_name}" '
            f'title="{user_name}\nRegistered: {registered}">'
            f'{display_name}</a>')


def setup_user(df):
    """Return a display-ready copy of a per-user table.

    `df` must carry the `UserName`, `DisplayName`, `RegisterDate` and
    `PerformanceTier` columns. In the result:

    * those four columns are removed,
    * a `Tier` column holds the coloured tier label looked up in
      `tier_html` (a missing tier is treated as tier 0),
    * the index is the profile link built by `user_name_link`, under the
      name `DisplayName`.

    The input frame is left untouched; the original implementation popped
    columns from it in place, so callers had to pass a copy.
    """
    links = df.apply(user_name_link, axis=1)
    tiers = tier_html[df.PerformanceTier.fillna(0).astype(int)]
    # drop() returns a new frame, so the assignments below never reach `df`.
    out = df.drop(
        columns=['UserName', 'DisplayName', 'RegisterDate', 'PerformanceTier'])
    out['Tier'] = tiers
    out['DisplayName'] = links
    return out.set_index('DisplayName')

# Team Medals

Vlado Boza and Farmár are nearly GMs in Santa competitions alone!

In [23]:
N_SHOW = 100
# Rank users by gold, then silver, then bronze counts.
tmp = medal_grid.sort_values(medal_chars_list, ascending=False)
# Truncate before building the per-row HTML so links are only generated for
# the rows that are displayed; the copy keeps `tmp` itself unmodified.
setup_user(tmp.head(N_SHOW).copy()).style

In [24]:
# Number of medal-team memberships per user, and the largest such count.
user_medal_counts = members_of_medal_teams.UserId.value_counts()
max_medals = user_medal_counts.max()
ax = user_medal_counts.plot(kind='hist', title='Users with # Santa Medals')
ax.grid(True);

In [25]:
max_medals

Who has the most Santa medals?
Does this make them more likely to return each year to put the record out of reach?

In [26]:
# User records of everyone whose medal count equals the maximum.
users.reindex(user_medal_counts.loc[user_medal_counts.eq(max_medals)].index)

# Team Points

Alternatively, who has the most ranking points from Santa comps?

In [27]:
def kaggle_points(rank=1, nteams=1, team_size=1, t=0.0, mult=1):
    """Ranking points for one competition result.

    The result is the product of five factors:

    * team size:     100000 / sqrt(team_size)
    * position:      rank ** -0.75
    * field size:    log10(1 + log10(nteams))
    * time decay:    exp(-t / 500), with t in days since the deadline
    * multiplier:    mult (some competitions award half points)

    Arguments may be scalars or array-likes of matching shape; the
    arithmetic is element-wise.
    """
    team_factor = 100000. / (team_size**0.5)
    position_factor = rank**-0.75
    field_factor = np.log10(1 + np.log10(nteams))
    decay_factor = np.exp(-t / 500.)
    return team_factor * position_factor * field_factor * decay_factor * mult

In [28]:
# TeamSize = number of membership rows that share the row's TeamId.
team_members = team_members.assign(
    TeamSize=lambda m: m.TeamId.map(m.TeamId.value_counts()))

In [29]:
# Points each member earned per competition, with no time decay (t=0).
# NOTE(review): this ranks by PublicLeaderboardRank; confirm that the public
# rank, rather than a final/private rank, is the intended input here.
team_members['Points'] = kaggle_points(
    rank=team_members.PublicLeaderboardRank,
    nteams=team_members.TotalTeams,
    team_size=team_members.TeamSize,
    t=0,
    mult=team_members.UserRankMultiplier,
)

In [30]:
# Group the enriched membership rows by user. This rebinds `gb`, which
# earlier held the per-year grouping of the Santa competitions.
gb = team_members.groupby('UserId')

In [31]:
# Per-user summary: competitions entered, best public rank, total points.
user_points = pd.DataFrame({
    'Competitions': gb.size(),
    # 9e9 is a sentinel for users with no rank at all. It does not fit in
    # 32 bits, and plain `int` is 32-bit on Windows with NumPy < 2, so the
    # width is requested explicitly.
    'BestRank': gb.PublicLeaderboardRank.min().fillna(9e9).astype('int64'),
    'TotalPoints': gb.Points.sum()
})

In [32]:
# Attach the user details, then drop every row that has a missing value.
user_points = user_points.join(users, on='UserId').dropna()

Komaki comes out on top.

Points can come from excellence and/or persistence, but excellence is better: the top five by points have all *won* (Rank = 1).

(Note that I set time to 0, so this is the sum of points as they were awarded at the time; their actual ranking-point values will have decayed away by now.)

In [33]:
# Highest total points first; only the top N_SHOW rows are formatted. The
# copy is passed because setup_user modifies the frame it receives.
tmp = user_points.sort_values('TotalPoints', ascending=False)
setup_user(tmp.head(N_SHOW).copy()).style

# Submissions

In [34]:
# Parse SubmissionDate into datetimes so it can be grouped and subtracted.
subs['SubmissionDate'] = pd.to_datetime(subs['SubmissionDate'])

In [35]:
# Number of Santa submissions per SubmissionDate value, over all competitions.
ax = subs.groupby('SubmissionDate').size().plot(
    title='Santa Submissions over Time')
ax.grid(True);

In [36]:
# Tag each submission with its team's competition, looked up by TeamId.
subs['CompetitionId'] = subs['TeamId'].map(teams['CompetitionId'])

- Santa's Uncertain Bags was about leaderboard probing so the submission level stayed quite high
- Travelling Santa 2018 has a huge peak at the end!

In [37]:
# One line per competition: submissions per date, with the x axis shifted so
# every competition starts at its own first submission date.
plt.figure(figsize=(16, 12))
for comp, df in subs.groupby('CompetitionId'):
    # A dedicated local name is used here: the original reused `dates`, which
    # overwrote the Index of date column names built near the top of the
    # notebook and broke re-running the date-parsing cell afterwards.
    daily = df.groupby('SubmissionDate').size()
    daily.index = daily.index - daily.index.min()
    # Keep the competition's duration plus 7 extra days.
    daily = daily[daily.index < pd.Timedelta(duration[comp] + 7, 'd')]
    daily = daily.rename(comps.Title[comp])
    daily.plot(legend=True)
plt.grid(True)
plt.xlabel('Relative Submission Date')
plt.title('Santa Submissions over Time');

To be continued... (Unless the 2020 competition launches! Then it may be continued in 2021 ;)