In [17]:
import pandas as pd
import tqdm
import torch
import numpy as np

In [18]:
# pre-process basic_stats
basic_stats = pd.read_csv('/Volumes/TOSHIBA EXT TIMOTHY DELILLE/GitHub/fantasy-nba/data/basic_stats.csv')
basic_stats.dropna(subset=['game_id', 'player_id'], inplace=True)

# convert minutes played to actual number of minutes
basic_stats['mp'] = basic_stats["mp"].str.split(":")\
                                     .fillna("00")\
                                     .apply(lambda x: sum([int(x[-k - 1])/(60**(len(x) - k - 1)) 
                                                           for k in range(len(x))]))

# is starter?
basic_stats['is_starter'] = basic_stats['type'] == 'Starter'

# one hot vector encoding of team names
abbreviations = pd.read_csv('/Volumes/TOSHIBA EXT TIMOTHY DELILLE/GitHub/fantasy-nba/data/abbreviations.csv')

basic_stats['team'] = basic_stats['team'].replace({'CHA': 'CHO'})

assert basic_stats['team'].isin(abbreviations['basketball_reference_abbreviation']).all(),\
      'Some team names in basic_stats are not basketball reference abbreviations'

for team in abbreviations['basketball_reference_abbreviation']:
    basic_stats[team] = basic_stats['team'] == team

# add info about the game
schedules = pd.read_csv('/Volumes/TOSHIBA EXT TIMOTHY DELILLE/GitHub/fantasy-nba/data/schedules.csv')
schedules.dropna(subset=['game_id'], inplace=True)

assert schedules['game_id'].is_unique, 'Duplicate game_id in schedules'

basic_stats = pd.merge(basic_stats, schedules, on='game_id', how='left')
basic_stats['reason'] = basic_stats['reason'].replace({'\xa0', ''})

# convert date and time to datetime objects
basic_stats['date'] = pd.to_datetime(basic_stats['date']).dt.date
basic_stats['time'] = pd.to_datetime(basic_stats['time']).dt.time
# combine date and time into datetime
basic_stats['datetime'] = basic_stats.apply(lambda row: pd.Timestamp.combine(row['date'], row['time']), axis=1)
# convert datetime to unix timestamp
basic_stats['timestamp'] = (basic_stats['datetime'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
# is the game taking place at home?
basic_stats['is_at_home'] = basic_stats['HOME'] == basic_stats['team']


In [19]:
player_ids = basic_stats['player_id'].dropna().unique()
tag2id = {tag: id for id, tag in enumerate(player_ids)} # start at 1
id2tag = {id: tag for tag, id in tag2id.items()}

In [20]:
def compute_dk_pts(features):
    """
    Args:
        pred_features: tensor of shape (-1, num labels) in this specific order
                       pts, fg3, orb, ast, stl, blk, tov
    """
    # inputs are tensors of shape (num players,)
    # double-double: max 1 per player, Points, Rebounds, Assists, Blocks, Steals
    # triple-double: max 1 per player, Points, Rebounds, Assists, Blocks, Steals

    factors = torch.tensor([1, 0.5, 1.25, 1.5, 2, 2, -0.5])

    # ids used in the double-double or triple-double calculations
    num_dbl = torch.where(torch.tensor(features[:, [0, 2, 3, 4, 5]]) >= 10, 
                          torch.tensor(1), 
                          torch.tensor(0)) # (-1, 5)
    num_dbl = num_dbl.sum(-1) # (-1,)

    dbl_dbl = torch.where(num_dbl >= 2, torch.tensor(1), torch.tensor(0)) # (-1, )
    trpl_dbl = torch.where(num_dbl >= 3, torch.tensor(1), torch.tensor(0)) # (-1, )

    dk_pts = torch.einsum('ni, ...i -> n', torch.tensor(features).float(), factors) + 1.5*dbl_dbl + 3*trpl_dbl # (-1,)
    return dk_pts

In [33]:
features = ['ast', 'blk', 'drb', 'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg_pct', 'fga', 'ft',\
            'ft_pct', 'fta', 'mp', 'orb', 'pf', 'plus_minus', 'pts', 'stl', 'tov', 'trb',\
            'is_starter', 'is_at_home']

features += abbreviations['basketball_reference_abbreviation'].tolist()

all_nodes = []
all_edges = []
all_ts = []
all_dk_points = []

last_features = pd.DataFrame(index=player_ids, columns=features) # stores a player's last game features
avg_feats = basic_stats.loc[(basic_stats['scope'] == 'game'), features].astype(float).fillna(0.).values.mean()

last_features.loc[player_ids, :] = avg_feats # allow for a small data leakage by 
                                             # taking the average over the entire dataset
    
for game_id in tqdm.tqdm_notebook(basic_stats['game_id'].dropna().unique()):
    in_scope = basic_stats[(basic_stats['game_id'] == game_id) & (basic_stats['scope'] == 'game')]
    
    assert len(in_scope['timestamp'].unique()) == 1, 'Different timestamp for the same game'
    timestamp = in_scope['timestamp'].iloc[0]
    assert in_scope['player_id'].is_unique, 'Multiple entries for the same player'
    
    # get last features for each player
    feats = last_features.loc[in_scope['player_id'], :].values
    num_players = feats.shape[0]
    
    # update last_features
    current_feats = in_scope[features].astype(float).fillna(0.).values
    last_features.loc[in_scope['player_id'], :] = current_feats
    
    # map player_id to node id
    ids = in_scope['player_id'].map(tag2id).astype(int).values
    
    # create edges: e_ij is the concatenation of the features from player i and player j
    edges = np.concatenate([feats[:, None, :].repeat(num_players, axis=1),
                            feats[None, :, :].repeat(num_players, axis=0)], axis=-1)
    # [num_players, num_players, 2*num_features]
    # e_ijk is the k-th feature value of edge i to j
    
    # create nodes: n_ij will be associated with e_ij
    nodes = np.concatenate([ids[None, :, None].repeat(num_players, axis=0), 
                            ids[:, None, None].repeat(num_players, axis=1)], axis=-1)
    nodes = nodes.reshape(num_players*num_players, 2) 
    
    # flatten the nodes and edges matrix / delete self connections
    ignore_idx = np.ravel_multi_index([np.arange(num_players), np.arange(num_players)], 
                                      dims=(num_players, num_players))
    
    nodes = np.delete(nodes, ignore_idx, axis=0)
    edges = edges.reshape(num_players*num_players, 2*len(features))
    edges = np.delete(edges, ignore_idx, axis=0)
    
    # compute actual DraftKings points
    pts_feats = in_scope[['pts', 'fg3', 'orb', 'ast', 'stl', 'blk', 'tov']].astype(float).fillna(0.).values
    # [num_players, ]
    dk_points = compute_dk_pts(pts_feats).numpy()
    dk_points = dk_points[:, None].repeat(num_players, axis=1).reshape(num_players*num_players)
    dk_points = np.delete(dk_points, ignore_idx, axis=0)
    
    # append to global lists
    all_nodes.append(nodes)
    all_edges.append(edges)
    ts_array = np.full(nodes.shape[0], fill_value=timestamp)
    all_ts.append(ts_array)
    all_dk_points.append(dk_points)
    
# concatenate all games
all_nodes = np.concatenate(all_nodes, axis=0)
all_edges = np.concatenate(all_edges, axis=0).astype(float)
all_ts = np.concatenate(all_ts, axis=0)
all_dk_points = np.concatenate(all_dk_points, axis=0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/14222 [00:00<?, ?it/s]

In [34]:
# save pre-processed files
nodes_csv = pd.DataFrame(all_nodes, columns=['u', 'i'])
nodes_csv['idx'] = np.arange(len(nodes_csv)) # + 1 # begin indexing at 1
nodes_csv['ts'] = all_ts
nodes_csv['label'] = all_dk_points

nodes_csv.to_csv('./data/ml_nba.csv', index=False)

In [35]:
# edge features
np.save('./data/ml_nba.npy', all_edges)


In [1]:
import numpy as np
#edges = np.load('./data/ml_nba.npy', allow_pickle=True)

In [2]:
a = np.load('./data/ml_nba_node.npy', allow_pickle=True, mmap_mode='r')

In [None]:
edges[:2]

In [96]:
# NODE FEATURES
# add general info about the player
general_info = pd.read_csv('/Volumes/TOSHIBA EXT TIMOTHY DELILLE/GitHub/fantasy-nba/data/general_info.csv')
general_info = general_info.dropna(subset=['id'])

general_info['birth_date'] = pd.to_datetime(general_info['birth_date'])
general_info['birth_date_ts'] = (general_info['birth_date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

general_info['nba_debut'] = pd.to_datetime(general_info['nba_debut'])
general_info['nba_debut_ts'] = (general_info['nba_debut'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

general_info['draft_year'] = general_info['draft_year'].str.replace(r'\D', '')\
                                                       .fillna('1970')\
                                                       .astype(int, errors='ignore')
general_info['draft_year_ts'] = general_info['draft_year'].apply(lambda x: pd.Timestamp(str(x)))
general_info['draft_year_ts'] = (general_info['draft_year_ts'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

general_info['height_cm'] = general_info['height_cm'].astype(float)
general_info['weight_kg'] = general_info['weight_kg'].astype(float)

one_hot_teams = pd.get_dummies(general_info['draft_team'])
general_info[one_hot_teams.columns] = one_hot_teams

# one_hot_colleges = pd.get_dummies(general_info['college'])
# general_info[one_hot_colleges.columns] = one_hot_colleges

In [97]:
node_features = ['height_cm', 'weight_kg', 'birth_date_ts', 'nba_debut_ts', 'draft_year_ts']\
               + one_hot_teams.columns.tolist()

node_feats = pd.DataFrame(index=player_ids, columns=node_features)

general_info = general_info.set_index('id')
common_players = set(player_ids).intersection(set(general_info.index))
node_feats.loc[common_players, :] = general_info.loc[common_players, node_features]\
                                                 .astype(float)\
                                                 .fillna(0.)\
                                                 .values
node_feats = node_feats.fillna(0.)

np.save('./data/ml_nba_node.npy', node_feats.values)

In [98]:
np.save('./data/ml_nba_node.npy', np.zeros((len(player_ids), 172))) # memory_dim = 172

python train_self_supervised.py --use_memory --prefix tgn-attn --n_runs 1 --n_epoch 10 --data nba

python train_supervised.py --use_memory --prefix tgn-attn --n_runs 1 --n_epoch 1 --data nba

In [100]:
ml_reddit = pd.read_csv('/Volumes/TOSHIBA EXT TIMOTHY DELILLE/GitHub/tgn/data/ml_reddit.csv', nrows=199)

In [41]:
val_time, test_time = list(np.quantile(nodes_csv.ts, [0.70, 0.85]))

In [42]:
val_time

1489777200.0

In [None]:
np.save('./data/ml_nba_test.npy', all_edges.astype(float))

In [51]:
(np.diff(nodes_csv.loc[nodes_csv['ts'] <= val_time, 'ts']) >= 0).all()

True

In [52]:
nodes_csv.index

RangeIndex(start=0, stop=8683370, step=1)

In [54]:
nodes_csv

Unnamed: 0,u,i,idx,ts,label
0,1,0,0,1256671800,26.0
1,2,0,1,1256671800,26.0
2,3,0,2,1256671800,26.0
3,4,0,3,1256671800,26.0
4,5,0,4,1256671800,26.0
...,...,...,...,...,...
8683365,543,135,8683365,1610920800,0.0
8683366,1263,135,8683366,1610920800,0.0
8683367,1351,135,8683367,1610920800,0.0
8683368,1376,135,8683368,1610920800,0.0


In [101]:
ml_reddit.describe()

Unnamed: 0.1,Unnamed: 0,u,i,ts,label,idx
count,199.0,199.0,199.0,199.0,199.0,199.0
mean,99.0,80.331658,10033.743719,455.250719,0.0,100.0
std,57.590508,45.259974,26.810967,253.704735,0.0,57.590508
min,0.0,1.0,10001.0,0.0,0.0,1.0
25%,49.5,44.0,10007.0,260.0415,0.0,50.5
50%,99.0,80.0,10031.0,454.686,0.0,100.0
75%,148.5,116.5,10054.5,657.5295,0.0,149.5
max,198.0,163.0,10092.0,850.441,0.0,199.0


In [87]:
edge_features = np.load('./data/ml_{}.npy'.format('nba'), allow_pickle=True)

In [None]:
def preprocess(basic_stats):
    u_list, i_list, ts_list, label_list = [], [], [], []
    feat_l = []
    idx_list = []

    with open(data_name) as f:
        s = next(f)
        for idx, line in enumerate(f):
            e = line.strip().split(',')
            u = int(e[0])
            i = int(e[1])

            ts = float(e[2])
            label = float(e[3])  # int(e[3])

            feat = np.array([float(x) for x in e[4:]])

            u_list.append(u)
            i_list.append(i)
            ts_list.append(ts)
            label_list.append(label)
            idx_list.append(idx)

            feat_l.append(feat)
            
    return pd.DataFrame({'u': u_list,
                         'i': i_list,
                         'ts': ts_list,
                         'label': label_list,
                         'idx': idx_list}), np.array(feat_l)

def reindex(df, bipartite=True):
    new_df = df.copy()
    if bipartite:
        assert (df.u.max() - df.u.min() + 1 == len(df.u.unique()))
        assert (df.i.max() - df.i.min() + 1 == len(df.i.unique()))

        upper_u = df.u.max() + 1
        new_i = df.i + upper_u

        new_df.i = new_i
        new_df.u += 1
        new_df.i += 1
        new_df.idx += 1
    else:
        new_df.u += 1
        new_df.i += 1
        new_df.idx += 1

    return new_df


def run(data_name, bipartite=True):
    Path("data/").mkdir(parents=True, exist_ok=True)
    PATH = './data/{}.csv'.format(data_name)
    OUT_DF = './data/ml_{}.csv'.format(data_name)
    OUT_FEAT = './data/ml_{}.npy'.format(data_name)
    OUT_NODE_FEAT = './data/ml_{}_node.npy'.format(data_name)

    df, feat = preprocess(PATH)
    new_df = reindex(df, bipartite)

    empty = np.zeros(feat.shape[1])[np.newaxis, :]
    feat = np.vstack([empty, feat])

    max_idx = max(new_df.u.max(), new_df.i.max())
    rand_feat = np.zeros((max_idx + 1, 172))

    new_df.to_csv(OUT_DF)
    np.save(OUT_FEAT, feat)
    np.save(OUT_NODE_FEAT, rand_feat)
