In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
# Input / output locations used by the rest of the notebook.
IN_DIR = './data/raw/nba'
OUT_DIR = './data/processed/nba'

# All artefacts are written directly under OUT_DIR.
OUT_EV = OUT_DIR + '/events.csv'
OUT_DESC = OUT_DIR + '/desc.json'
OUT_EDGE_FEAT = OUT_DIR + '/edge_ft.npy'

In [3]:
def calctime(time):
    """Convert a playing-time string into whole seconds.

    Parameters
    ----------
    time : str
        Either ``'MM:SS'`` or just ``'MM'``. Each field is parsed with
        ``float`` and then truncated with ``int``, so fields such as
        ``'24.000000'`` are accepted.

    Returns
    -------
    int or None
        ``60 * minutes + seconds`` for one or two fields. When the string
        splits into more than two ``':'``-separated fields, the fields are
        printed and ``None`` is returned.
    """
    fields = time.split(':')
    field_count = len(fields)

    # Anything that is not 'MM' or 'MM:SS' is reported and skipped.
    if field_count not in (1, 2):
        print(fields)
        return None

    total = 60 * int(float(fields[0]))
    if field_count == 2:
        total += int(float(fields[1]))
    return total

In [4]:
# Load the two raw CSVs from IN_DIR.
#
# `ts` is GAME_DATE_EST expressed as whole seconds since the Unix epoch. It is
# computed with Timestamp/Timedelta arithmetic rather than by casting the
# datetime column to `int`, so the result does not depend on the platform's
# default integer width or on the datetime resolution pandas picked.
matches = (pd.read_csv(f'{IN_DIR}/games.csv')
           .assign(ts = lambda _d: (pd.to_datetime(_d['GAME_DATE_EST']) - pd.Timestamp('1970-01-01'))
                                   // pd.Timedelta('1s')))
players = pd.read_csv(f'{IN_DIR}/games_details.csv')

# Columns of `matches` that are carried along as edge features.
matches_cols = ['SEASON', 'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home',
                'AST_home', 'REB_home', 'PTS_away', 'FG_PCT_away', 'FT_PCT_away',
                'FG3_PCT_away', 'AST_away', 'REB_away']
# Columns of `players` that are carried along as edge features.
players_cols = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
                'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO',
                'PF', 'PTS', 'PLUS_MINUS']
# Alternative, reduced player feature set kept for reference
# (same as above without MIN, FG_PCT, FG3_PCT, FT_PCT and REB):
# players_cols = ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM',
#                 'FTA', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TO',
#                 'PF', 'PTS', 'PLUS_MINUS']


In [5]:
# Identifier columns shared by each plain edge frame and its feature-carrying twin.
match_keys = ['HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS', 'ts']
player_keys = ['GAME_ID', 'TEAM_ID', 'PLAYER_ID']
# Only rows with a recorded MIN value are kept from the player table.
has_minutes = players['MIN'].notna()

# Feature frames: identifiers plus the selected feature columns, complete rows only.
matchesfeat = matches[match_keys + matches_cols].dropna().drop_duplicates()
playersfeat = players.loc[has_minutes, player_keys + players_cols].dropna().drop_duplicates()

# GAME_ID -> ts lookup; must be taken before `matches` loses its GAME_ID column below.
timestamps = matches[['GAME_ID', 'ts']].dropna().drop_duplicates()

# Plain edge frames: identifiers only.
matches = matches[match_keys].dropna().drop_duplicates()
players = players.loc[has_minutes, player_keys].dropna().drop_duplicates()

# Offset (in the same unit as `ts`, i.e. seconds) later added to game timestamps.
# NOTE(review): 2880 == 48 * 60, presumably a regulation game length - confirm.
gamelength = 2880

In [6]:
# Attach each game's timestamp, then reduce to (team, player, ts) edges.
players = players.merge(timestamps, how='inner', on='GAME_ID')
players = players.rename(columns={'TEAM_ID': 'u', 'PLAYER_ID': 'v'})
players = players.loc[:, ['u', 'v', 'ts']]

In [7]:
# Same join as for `players`, but every feature column is kept; only GAME_ID is dropped.
playersfeat = playersfeat.merge(timestamps, how='inner', on='GAME_ID')
playersfeat = playersfeat.rename(columns={'TEAM_ID': 'u', 'PLAYER_ID': 'v'})
playersfeat = playersfeat.drop(columns=['GAME_ID'])

In [8]:
def _min_max_scale(column):
    """Rescale one column to [0, 1] using its own minimum and maximum."""
    lowest = column.min()
    return (column - lowest) / (column.max() - lowest)


# MIN arrives as a 'MM:SS'-style string; turn it into seconds before scaling.
playersfeat['MIN'] = playersfeat['MIN'].astype(str).apply(calctime)

# Column-wise min-max scaling of every feature column.
matchesfeat[matches_cols] = matchesfeat[matches_cols].apply(_min_max_scale)
playersfeat[players_cols] = playersfeat[players_cols].apply(_min_max_scale)

In [9]:
# Team-team edges: timestamp shifted by `gamelength`; e_type is HOME_TEAM_WINS + 1.
# Both endpoints are node type 1.
matches = (
    matches
    .rename(columns={'HOME_TEAM_ID': 'u', 'VISITOR_TEAM_ID': 'v', 'HOME_TEAM_WINS': 'e_type'})
    .assign(
        ts=lambda frame: frame['ts'] + gamelength,
        e_type=lambda frame: frame['e_type'] + 1,
        u_type=1,
        v_type=1,
    )
    .reset_index(drop=True)
)

# Team-player edges: node type 1 -> node type 2, always e_type 3, timestamp unshifted.
players = (
    players
    .assign(u_type=1, v_type=2, e_type=3)
    .reset_index(drop=True)
)

In [10]:
# Feature-carrying edges are stamped `gamelength + 1` after the game timestamp
# and all share e_type 4.
matchesfeat = (
    matchesfeat
    .rename(columns={'HOME_TEAM_ID': 'u', 'VISITOR_TEAM_ID': 'v', 'HOME_TEAM_WINS': 'e_type'})
    .assign(
        ts=lambda frame: frame['ts'] + gamelength + 1,
        e_type=4,  # overwrites the renamed HOME_TEAM_WINS values
        u_type=1,
        v_type=1,
    )
    .reset_index(drop=True)
)

playersfeat = (
    playersfeat
    .assign(
        ts=lambda frame: frame['ts'] + gamelength + 1,
        u_type=1,
        v_type=2,
        e_type=4,
    )
    .reset_index(drop=True)
)

In [11]:
# Teams receive node ids 0 .. len(teams) - 1, in ascending order of their raw id.
all_team_ids = pd.concat([matches['u'], matches['v']])
teams = all_team_ids.sort_values().unique()
idx_teams = np.arange(len(teams))
teams_dict = dict(zip(teams, idx_teams))

# Players continue the numbering directly after the last team id.
player = players['v'].sort_values().unique()
idx_player = np.arange(len(player))
player_dict = dict(zip(player, idx_player + len(teams)))

In [12]:
# Replace raw ids with contiguous node ids. A raw id missing from the
# dictionaries raises KeyError rather than silently producing NaN.
matches = matches.assign(
    u=matches['u'].map(teams_dict.__getitem__),
    v=matches['v'].map(teams_dict.__getitem__),
)

players = players.assign(
    u=players['u'].map(teams_dict.__getitem__),
    v=players['v'].map(player_dict.__getitem__),
)

In [13]:
# Same id remapping for the feature-carrying frames; unknown raw ids raise KeyError.
matchesfeat = matchesfeat.assign(
    u=matchesfeat['u'].map(teams_dict.__getitem__),
    v=matchesfeat['v'].map(teams_dict.__getitem__),
)

playersfeat = playersfeat.assign(
    u=playersfeat['u'].map(teams_dict.__getitem__),
    v=playersfeat['v'].map(player_dict.__getitem__),
)

In [14]:
# Stack all four edge frames into one chronologically ordered event table.
# Feature columns that a frame does not have come out of the concat as NaN and
# are zero-filled at the end.
# NOTE(review): the default sort kind is not stable, so the relative order of
# events sharing the same `ts` is not guaranteed to follow the concat order.
edge_frames = [matches, players, matchesfeat, playersfeat]
events = pd.concat(edge_frames)
events = events.sort_values('ts')
events = events.reset_index(drop=True)
events = events.fillna(0)

In [15]:
# Every column that is not one of the six structural columns is an edge feature.
# Selecting by name (instead of `iloc[:, 6:]`) keeps the feature matrix correct
# even if the column order produced by the concat ever changes.
structural_cols = ['u', 'v', 'u_type', 'v_type', 'e_type', 'ts']
e_ft = events.drop(columns=structural_cols).values

# Pad the feature width with zero columns up to the next multiple of 4.
# Note that a width that is already a multiple of 4 still gains 4 columns.
# NOTE(review): the reason for the multiple-of-4 requirement is not visible
# here - presumably a downstream model constraint; confirm.
max_dim = e_ft.shape[1]
max_dim = max_dim + 4 - (max_dim % 4)
empty = np.zeros((e_ft.shape[0], max_dim - e_ft.shape[1]))
e_ft = np.hstack([e_ft, empty])

# Prepend one all-zero row so that row i holds the features of the i-th event
# counted from 1; row 0 is a placeholder.
e_feat = np.vstack([np.zeros(max_dim), e_ft])

In [16]:
# Sanity check: expect (len(events) + 1, max_dim) - one leading zero row plus one row per event.
e_feat.shape

(1147122, 36)

In [17]:
# Preview of the feature matrix; NumPy abbreviates the display of large arrays with "...".
e_feat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Summary counts written to the description file.
NUM_NODE = len(teams) + len(player)   # team ids first, player ids after them
NUM_EV = events.shape[0]              # one row per event
NUM_N_TYPE, NUM_E_TYPE = 2, 4         # node types 1-2, edge types 1-4

In [19]:
# Keep only the structural columns and number the events 1..NUM_EV in their
# current (time-sorted) order.
event_columns = ['u', 'v', 'u_type', 'v_type', 'e_type', 'ts']
events = events[event_columns].assign(e_idx=np.arange(1, NUM_EV + 1))

In [20]:
# Report the dataset size, then persist the three artefacts:
# the edge-feature matrix, the event table and a small JSON description.
print("num node:", NUM_NODE)
print("num events:", NUM_EV)

# Make sure the target directory exists so the first write cannot fail on a
# fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Use the path constants from the configuration cell instead of repeating
# the literal paths.
np.save(OUT_EDGE_FEAT, e_feat)
events.to_csv(OUT_EV, index=False)
desc = {
        "num_node": NUM_NODE,
        "num_edge": NUM_EV,
        "num_node_type": NUM_N_TYPE,
        "num_edge_type": NUM_E_TYPE
    }
with open(OUT_DESC, 'w') as f:
    json.dump(desc, f, indent=4)

num node: 2641
num events: 1147121
