In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
sns.set_context("poster", font_scale=1.3)

import missingno as msno
import pandas_profiling

from sklearn.datasets import make_blobs
import time

In [2]:
# Load the gzip-compressed inputs: the referee-player dyads, and the player
# table keyed by its 'playerShort' column.
dyads = pd.read_csv(
    'dyads.csv.gz',
    compression='gzip',
    encoding='UTF8',
)
players = pd.read_csv(
    'tidy_players.csv.gz',
    index_col='playerShort',
    compression='gzip',
    encoding='UTF8',
)

# Dyads

In [3]:
# Preview the first ten rows of the raw dyads table.
dyads.head(10)

Unnamed: 0,refNum,playerShort,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards
0,1,lucas-wilchez,1,0,0,1,0,0,0,0
1,2,john-utaka,1,0,0,1,0,1,0,0
2,3,abdon-prats,1,0,1,0,0,1,0,0
3,3,pablo-mari,1,1,0,0,0,0,0,0
4,3,ruben-pena,1,1,0,0,0,0,0,0
5,4,aaron-hughes,1,0,0,1,0,0,0,0
6,4,aleksandar-kolarov,1,1,0,0,0,0,0,0
7,4,alexander-tettey,1,0,0,1,0,0,0,0
8,4,anders-lindegaard,1,0,1,0,0,0,0,0
9,4,andreas-beck,1,1,0,0,0,0,0,0


In [4]:
# Sanity check: across the whole table, victories + ties + defeats should
# add up to the total number of games.
dyads[['victories', 'ties', 'defeats']].sum().sum() == dyads['games'].sum()

True

In [5]:
# Combine both kinds of sending-off into one per-dyad count:
# row-wise sum of 'yellowReds' and 'redCards'.
dyads['total_redCards'] = dyads.loc[:, ['yellowReds', 'redCards']].sum(axis='columns')

In [6]:
# Preview the table again, now including the derived total_redCards column.
dyads.head(10)

Unnamed: 0,refNum,playerShort,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,total_redCards
0,1,lucas-wilchez,1,0,0,1,0,0,0,0,0
1,2,john-utaka,1,0,0,1,0,1,0,0,0
2,3,abdon-prats,1,0,1,0,0,1,0,0,0
3,3,pablo-mari,1,1,0,0,0,0,0,0,0
4,3,ruben-pena,1,1,0,0,0,0,0,0,0
5,4,aaron-hughes,1,0,0,1,0,0,0,0,0
6,4,aleksandar-kolarov,1,1,0,0,0,0,0,0,0
7,4,alexander-tettey,1,0,0,1,0,0,0,0,0
8,4,anders-lindegaard,1,0,1,0,0,0,0,0,0
9,4,andreas-beck,1,1,0,0,0,0,0,0,0


In [7]:
# Re-key the dyads by player so they can be index-joined with `players`.
# reset_index() keeps the previous index as an ordinary 'index' column.
dyads = (
    dyads
    .reset_index()
    .set_index('playerShort')
)

In [8]:
# Remove the 'Unnamed: 0' column (presumably a row-number column left over
# from an earlier CSV export -- verify against the file that produced it).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: the in-place drop raised KeyError on a
# second execution because the column was already gone. `axis` is omitted
# because `columns=` already names the axis.
players = players.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Join player attributes onto every dyad row. Both frames are indexed by
# playerShort, so the merge is index-on-index (default inner join).
player_dyads = players.merge(dyads, left_index=True, right_index=True)
player_dyads.head()

Unnamed: 0_level_0,position,height,weight,rater1,rater2,skintone,position_agg,weightclass,heightclass,age_years,...,refNum,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,total_redCards
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-hughes,Center Back,182.0,71.0,0.25,0.0,0.125,Defensive,low_weight,medium_height,33.149897,...,4,1,0,0,1,0,0,0,0,0
aaron-hughes,Center Back,182.0,71.0,0.25,0.0,0.125,Defensive,low_weight,medium_height,33.149897,...,66,1,1,0,0,0,0,0,0,0
aaron-hughes,Center Back,182.0,71.0,0.25,0.0,0.125,Defensive,low_weight,medium_height,33.149897,...,77,26,13,8,5,0,0,0,0,0
aaron-hughes,Center Back,182.0,71.0,0.25,0.0,0.125,Defensive,low_weight,medium_height,33.149897,...,163,2,1,1,0,0,0,0,0,0
aaron-hughes,Center Back,182.0,71.0,0.25,0.0,0.125,Defensive,low_weight,medium_height,33.149897,...,194,16,3,5,8,0,2,0,0,0


In [10]:
# Keep only the dyads whose player survived the merge with `players`, then
# key the result by the (referee, player) pair.
clean_dyads = (
    dyads
    .reset_index()
    .loc[lambda frame: frame['playerShort'].isin(set(player_dyads.index))]
    .set_index(['refNum', 'playerShort'])
)

In [11]:
# Preview the filtered dyads, now indexed by (refNum, playerShort).
clean_dyads.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,total_redCards
refNum,playerShort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,lucas-wilchez,0,1,0,0,1,0,0,0,0,0
2,john-utaka,1,1,0,0,1,0,1,0,0,0
4,aaron-hughes,5,1,0,0,1,0,0,0,0,0
4,aleksandar-kolarov,6,1,1,0,0,0,0,0,0,0
4,alexander-tettey,7,1,0,0,1,0,0,0,0,0


In [12]:
# (rows, columns) of the filtered dyads: one row per referee-player pair kept.
clean_dyads.shape

(124621, 10)

# Disaggregation

* The dyads are currently aggregated: each row summarizes every game in which a particular referee-player pair was matched. To handle the data properly, we have to disaggregate it into a tidy/long format, in which each game is its own row.

In [13]:
# Disaggregate the per-dyad totals into one row per game.
#
# Vectorised with NumPy instead of a Python-level iterrows() loop: every
# (refNum, playerShort) pair is repeated `games` times, and the first
# `total_redCards` of those repeats are flagged with redcard = 1, the
# remainder with 0. As in the loop this replaces, a dyad whose
# total_redCards exceeds its games simply has every game flagged; the
# surplus is not carried anywhere.
dyad_rows = clean_dyads.reset_index()
games_per_dyad = dyad_rows['games'].to_numpy()
redcards_per_dyad = dyad_rows['total_redCards'].to_numpy()

# 0-based position of each expanded row within its own dyad (0 .. games - 1):
# the global row number minus the row number at which that dyad starts.
first_row_of_dyad = np.cumsum(games_per_dyad) - games_per_dyad
game_number = (
    np.arange(games_per_dyad.sum())
    - np.repeat(first_row_of_dyad, games_per_dyad)
)

# Same rule as `1 if (total_redcards - game) > 0 else 0`.
redcard = (
    game_number < np.repeat(redcards_per_dyad, games_per_dyad)
).astype('int64')

tidy_dyads = pd.DataFrame({
    'refNum': np.repeat(dyad_rows['refNum'].to_numpy(), games_per_dyad),
    'playerShort': np.repeat(dyad_rows['playerShort'].to_numpy(), games_per_dyad),
    'redcard': redcard,
}).set_index(['refNum', 'playerShort'])

In [14]:
# Total games across the filtered dyads. The disaggregation emits one row per
# game, so tidy_dyads should contain exactly this many rows.
clean_dyads.games.sum()

373067

In [15]:
# Row count should equal clean_dyads.games.sum(); the single column is 'redcard'.
tidy_dyads.shape

(373067, 1)

In [16]:
# Persist the one-row-per-game table as a gzip-compressed CSV.
tidy_dyads.to_csv(
    'tidy_dyads.csv.gz',
    compression='gzip',
    encoding='UTF8',
)