In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
# Filesystem layout: raw inputs are read from IN_DIR, and the artefacts this
# notebook produces are written below OUT_DIR.
IN_DIR = './data/raw/soccer'
OUT_DIR = './data/processed/soccer'

# Output files, all rooted at OUT_DIR.
OUT_EV = OUT_DIR + '/events.csv'          # event table written with to_csv
OUT_DESC = OUT_DIR + '/desc.json'         # dataset description written as JSON
OUT_EDGE_FEAT = OUT_DIR + '/edge_ft.npy'  # not referenced by the visible cells

In [3]:
# Load the raw match table and add `ts`: the match date expressed as whole
# seconds since the Unix epoch.
# The cast names 'int64' explicitly because a bare `int` is only 32 bits wide
# on some platforms (e.g. Windows with NumPy < 2) and cannot hold nanosecond
# timestamps; integer floor division then avoids a float64 round trip.
# NOTE(review): assumes pd.to_datetime yields nanosecond-resolution values and
# that every date falls on or after 1970-01-01 (floor and truncation agree
# there) — confirm against the raw data.
matches = (
    pd.read_csv(f'{IN_DIR}/matches.csv')
    .assign(ts=lambda _d: pd.to_datetime(_d['date']).astype('int64') // 10**9)
)

In [4]:
# Per-side lineup slices of the match table: match id, team id, the eleven
# player-slot columns and the match timestamp.
home_lineup_cols = (['match_api_id', 'home_team_api_id']
                    + [f'home_player_{slot}' for slot in range(1, 12)]
                    + ['ts'])
away_lineup_cols = (['match_api_id', 'away_team_api_id']
                    + [f'away_player_{slot}' for slot in range(1, 12)]
                    + ['ts'])

home_player = matches[home_lineup_cols]
away_player = matches[away_lineup_cols]

In [5]:
def _distinct_player_count(lineup, team_col):
    """Count the distinct non-null player ids listed per match for one side.

    `lineup` is a wide table with one column per player slot; it is melted to
    long form and grouped by (match_api_id, team_col, ts). The returned Series
    is named 'value' and indexed by those three keys.
    """
    keys = ['match_api_id', team_col, 'ts']
    long_form = pd.melt(lineup, id_vars=keys)
    return long_form.groupby(keys).nunique()['value']


# Attach the home count, then the away count. Both Series are named 'value',
# so after the second merge pandas suffixes them: 'value_x' holds the home
# count and 'value_y' the away count.
matches = matches.merge(_distinct_player_count(home_player, 'home_team_api_id'),
                        on='match_api_id')
matches = matches.merge(_distinct_player_count(away_player, 'away_team_api_id'),
                        on='match_api_id')

In [6]:
# Keep only matches where each side lists eleven distinct players and the
# B365H, BWH, IWH and LBH columns are all non-null.
full_lineups = matches['value_x'].eq(11) & matches['value_y'].eq(11)
odds_present = matches[['B365H', 'BWH', 'IWH', 'LBH']].notna().all(axis=1)
matches = matches.loc[full_lineups & odds_present].reset_index(drop=True)

# Re-slice the lineup tables so they only cover the surviving matches.
home_player = matches[['match_api_id', 'home_team_api_id']
                      + [f'home_player_{slot}' for slot in range(1, 12)]
                      + ['ts']]
away_player = matches[['match_api_id', 'away_team_api_id']
                      + [f'away_player_{slot}' for slot in range(1, 12)]
                      + ['ts']]

In [7]:
# Wide -> long: one row per (match, team, ts, player slot). The slot column
# name ends up in 'variable' and the player id in 'value'.
home_player = home_player.melt(id_vars=['match_api_id', 'home_team_api_id', 'ts'])
away_player = away_player.melt(id_vars=['match_api_id', 'away_team_api_id', 'ts'])

In [8]:
# Stack home and away lineups into a single long table with a shared
# 'team_id' column (home rows first, then away rows; indices are kept as-is).
home_rows = home_player.rename(columns={'home_team_api_id': 'team_id'})
away_rows = away_player.rename(columns={'away_team_api_id': 'team_id'})
players = pd.concat([home_rows, away_rows])

In [9]:
# Encode the outcome from the goal difference (home minus away):
#   home scored more -> 2, equal score -> 3, away scored more -> 1.
# Each condition is evaluated on the untouched goal difference, so the three
# replacements cannot interfere with one another.
goal_diff = matches['home_team_goal'] - matches['away_team_goal']
matches['result'] = (goal_diff
                     .mask(goal_diff > 0, 2)
                     .mask(goal_diff == 0, 3)
                     .mask(goal_diff < 0, 1))

In [10]:
# Column layout shared by both kinds of edge rows.
_edge_cols = ['u', 'v', 'ts', 'e_type', 'u_type', 'v_type']

# Team -> team edges: home team is `u`, away team is `v`, the encoded result
# is the edge type and both endpoints get node type 1. The timestamp is pushed
# one unit later than the value stored on the lineup rows of the same match.
matches = (
    matches
    .rename(columns={'home_team_api_id': 'u', 'away_team_api_id': 'v', 'result': 'e_type'})
    .assign(u_type=1, v_type=1)
    .reset_index(drop=True)
    [_edge_cols]
    .assign(ts=lambda frame: frame['ts'] + 1)
)

# Team -> player edges: team is `u` (node type 1), player is `v` (node type 2),
# always with edge type 4.
players = (
    players
    .rename(columns={'team_id': 'u', 'value': 'v'})
    .assign(e_type=4, u_type=1, v_type=2)
    .reset_index(drop=True)
    [_edge_cols]
)

In [11]:
# NOTE(review): everything in this cell is commented out, so none of it runs.
# It reads like an alternative node encoding that was set aside: `matchmap`
# would number every (team, ts) pair, `pastmatches` appears to link each such
# node to the same team's closest earlier one with e_type 5, and player ids
# would be offset by len(matchmap) instead of len(teams). Reviving it would
# also need NUM_E_TYPE (currently 4) to account for the extra edge type —
# confirm intent before re-enabling or deleting.
# matchmap = pd.DataFrame(np.concatenate((matches[['u', 'ts']].values, matches[['v', 'ts']].values))).sort_values(1).reset_index(drop=True).reset_index()
# matches['u'] = pd.merge(matches, matchmap, left_on=['u', 'ts'], right_on=[0, 1], how='left')['index']
# matches['v'] = pd.merge(matches, matchmap, left_on=['v', 'ts'], right_on=[0, 1], how='left')['index']
# players['u'] = pd.merge(players, matchmap, left_on=['u', 'ts'], right_on=[0, 1], how='left')['index']

# pastmatches = pd.merge(matchmap, matchmap, on=0)
# pastmatches = pastmatches.loc[pastmatches['index_y'] < pastmatches['index_x']].drop_duplicates(['index_x'], keep='last')

# matches = (matches
#            .assign(ts = lambda _d: _d['ts'] + 1))

# pastmatches = (pastmatches
#            .assign(e_type = 5)
#            .assign(u_type = 1)
#            .assign(v_type = 1)
#            .rename(columns={'index_x': 'u', 'index_y': 'v', '1_x': 'ts'})
#            .reset_index(drop=True)
#            [['u', 'v', 'ts', 'e_type', 'u_type', 'v_type']])

# player = players['v'].sort_values().unique()
# idx_player = np.arange(len(player))
# player_dict = {player[k]: k + len(matchmap) for k in idx_player}

# players = (players
#             .assign(v = lambda _d: _d['v'].map(lambda x: player_dict[x])))

In [12]:
def _relabel(column, table):
    """Translate every entry of `column` through `table`.

    A value missing from `table` raises KeyError rather than becoming NaN.
    """
    return column.map(lambda raw_id: table[raw_id])


# Teams take the contiguous ids 0 .. len(teams) - 1, in sorted raw-id order.
teams = pd.concat([matches['u'], matches['v']]).sort_values().unique()
idx_teams = np.arange(len(teams))
teams_dict = dict(zip(teams, idx_teams))

# Players continue the same id space, starting right after the last team.
player = players['v'].sort_values().unique()
idx_player = np.arange(len(player))
player_dict = dict(zip(player, idx_player + len(teams)))

# Swap the raw ids in both edge tables for the new node ids.
matches = matches.assign(
    u=_relabel(matches['u'], teams_dict),
    v=_relabel(matches['v'], teams_dict),
)

players = players.assign(
    u=_relabel(players['u'], teams_dict),
    v=_relabel(players['v'], player_dict),
)

In [13]:
# Persist the raw-id -> node-id lookup tables, one row per raw id with the
# assigned node id in the single data column.
# This is the first write into OUT_DIR, and DataFrame.to_csv does not create
# missing parent directories, so make sure the directory exists first.
os.makedirs(OUT_DIR, exist_ok=True)
pd.DataFrame.from_dict(player_dict, orient='index').to_csv(f'{OUT_DIR}/player_dict.csv')
pd.DataFrame.from_dict(teams_dict, orient='index').to_csv(f'{OUT_DIR}/teams_dict.csv')

In [14]:
# Merge both edge tables into one chronologically ordered event list.
# sort_values defaults to quicksort, which is not stable, so rows sharing a
# timestamp would otherwise land in an arbitrary order (and receive arbitrary
# e_idx values later on). A stable sort keeps tied rows in concatenation
# order: match rows first, then team-player rows.
events = (
    pd.concat([matches, players])
    .sort_values('ts', kind='mergesort')
    .reset_index(drop=True)
)

In [15]:
# Dataset-level figures recorded in the description file.
NUM_N_TYPE = 2               # node types used in the event table: 1 and 2
NUM_E_TYPE = 4               # edge types used: 1, 2, 3 (match results) and 4 (team-player)
CLASSES = list(range(1, 4))  # [1, 2, 3] — presumably the match-result edge types
NUM_NODE = len(teams) + len(player)
NUM_EV = events.shape[0]

In [16]:
# Number the events 1..NUM_EV in their current order, then fix the column
# layout of the exported table.
events = events.assign(e_idx=np.arange(NUM_EV) + 1)
events = events[['u', 'v', 'u_type', 'v_type', 'e_type', 'ts', 'e_idx']]

In [17]:
# Report the dataset size, then write the event table and its description.
print("num node:", NUM_NODE)
print("num events:", NUM_EV)

# Event table, written without the DataFrame index.
events.to_csv(OUT_EV, index=False)

# Summary of the dataset, serialised as indented JSON.
desc = dict(
    num_node=NUM_NODE,
    num_edge=NUM_EV,
    num_node_type=NUM_N_TYPE,
    num_edge_type=NUM_E_TYPE,
    classes=CLASSES,
)
with open(OUT_DESC, 'w') as desc_file:
    desc_file.write(json.dumps(desc, indent=4))

num node: 9947
num events: 450593
