In [1]:
#clalify the google drive path
from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#import things to use / define the functions to use
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import time as tm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)



def get_duration(start_time_point, end_time_array):
  x = end_time_array - start_time_point
  x = x[x>0]
  if len(x)>0:
    return (min(x))
  else:
    return (1000) #just dummy, 1000 means that it is not a valid action

def output_action_periods(df_large, playerID=67348, rolling_window_n=100, 
                          min_action_length=1, min_speed=3, output_file_path="path"):
  #everything is in the order of "second" and "meter", not mili second
  df = df_large[df_large.playerID==playerID]
  #calculate dt [s]
  tm = np.array(df.timestamp)
  dt = (tm - np.array([tm[0]] + list(tm[:-1]))) /10**3
  #calculate dp (position) [m]
  x = np.array(df.X)
  y = np.array(df.Y)
  x_before = np.array([x[0]] + list(x[:-1]))
  y_before = np.array([y[0]] + list(y[:-1]))
  dp = np.sqrt( (x - x_before)**2 + (y - y_before)**2 ) /10**3
  #v = dp / dt [m/s]
  v = np.array(dp/dt)
  #filter out the head, concat with the df
  df = df.iloc[1:,:]
  df["v"] = v[1:]
  #roll the df
  df.index = df.timestamp

  dfroll = df[["timestamp","v", "X", "Y"]].rolling(window=rolling_window_n).mean() #100 ~= 1s resolution 
  vroll = np.array(dfroll.v)

  #annotate the action moment
  vroll_before = np.array([vroll[0]] + list(vroll[:-1]))
  action_started = np.array((vroll>=min_speed)&(vroll_before<min_speed)) #when >=3m/s
  action_ended = np.array((vroll<min_speed)&(vroll_before>=min_speed)) #when < than 3m/s

  dfroll["action_started"] = action_started
  dfroll["action_ended"] = action_ended

  starttime = np.array(dfroll[dfroll.action_started].timestamp)
  endtime =  np.array(dfroll[dfroll.action_ended].timestamp)
  action_duration = np.array(pd.Series(starttime).apply(lambda x: get_duration(x, endtime)))

  dfaction = dfroll[dfroll.action_started]
  dfaction["action_duration"] = action_duration / 10**3 #second
  dfaction = dfaction[dfaction.action_duration>min_action_length]

  dfaction.to_csv(output_file_path, sep="\t")
  print ("wrote {0} actions for player {1}".format(dfaction.shape, playerID))



def annotate_ontheball(t, x, y, balldf, dist_thres = 0.5): #True if on the ball, = within 50cm 
  balldf = balldf[abs(balldf.timestamp - t) < 200] #let's do 0.2s from action start
  if balldf.shape[0]==0:
    return (False) #no ball around. good.
  else:
    dist_min = min(np.sqrt( (balldf.X-x)**2 + (balldf.Y-y)**2 )) / 10**3 #meter
    return (dist_min<0.5)



def get_nearby_consequence_score(time, events, cutoff=10, flip=False): 
  #if things are not within 10s of the action start, then no count
  #flip=True if it is their action
  df = events[events.Timestamp>time] #after the action only
  if df.shape[0] == 0:
    return (0)
  df = df.head(1) #because it is sorted, taking the first element is fine
  if df.Timestamp.values[0] - time > cutoff*10**3: 
    return (0) #doesn't count if more than 10s = 10*10**3 ms
  else:
    score = df.action_score.values[0]
    if flip: 
      score = score * -1
    return (score)


def score_all_action(events, actions, teamID):
  #1. score my team action
  my_team_events = events[events.TeamId==teamID]
  s1 = actions.timestamp.apply(lambda x: get_nearby_consequence_score(x, my_team_events))
  their_team_events = events[events.TeamId != teamID]
  s2 = actions.timestamp.apply(lambda x: get_nearby_consequence_score(x, their_team_events, flip=True))
  return (np.array(s1) + np.array(s2)) #returing the score of the action (in future: also the events)

In [7]:
#first, annotate all the action period
#focusing on a single game, but easily generalizable

#get unique player ID first
heads = pd.read_csv("/content/drive/My Drive/ssac_hackthon_2020/explore-shottracker/timeseries/M_17472065-4ad8-11ea-9084-0242bdc61da9/M_17472065-4ad8-11ea-9084-0242bdc61da9_playerLocations.csv", 
                        sep=",", nrows = 100000)
players = heads.playerID.unique()


#do by chunk
chunksize = 1000000
reader = pd.read_csv("/content/drive/My Drive/ssac_hackthon_2020/explore-shottracker/timeseries/M_17472065-4ad8-11ea-9084-0242bdc61da9/M_17472065-4ad8-11ea-9084-0242bdc61da9_playerLocations.csv", 
                        sep=",", chunksize=chunksize)
i = 0
for chunk in reader:
  outpath_base = "/content/drive/My Drive/ssac_hackthon_2020/action_output_test/"
  for player in players:
    outpath = outpath_base+str(player)+"_chunk{0}.tsv".format(i)
    output_action_periods(chunk, playerID=player, output_file_path = outpath)
  i += 1
  print ("done chunk {0}".format(i))
  print (tm.ctime())

#and collect
outpath_base = "/content/drive/My Drive/ssac_hackthon_2020/action_output_test/"
for player in  players:
  print ("starting player {0}".format(player))
  dfall = []
  for i in range(20):  
    outpath = outpath_base+str(player)+"_chunk{0}.tsv".format(i)
    df = pd.read_csv(outpath, sep="\t")
    if df.shape[0]>0:  
      dfall.append(df)
  if len(dfall)>0:
    dfall = pd.concat(dfall, axis=0)
    print ("final shape:")
    print(dfall.shape)
    dfall.to_csv(outpath_base+str(player)+"allchunk.tsv", sep="\t")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


wrote (9, 7) actions for player 67348


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


wrote (15, 7) actions for player 67351




wrote (16, 7) actions for player 67346
wrote (18, 7) actions for player 67363
wrote (15, 7) actions for player 67361
wrote (0, 7) actions for player 67342
wrote (20, 7) actions for player 67367
wrote (0, 7) actions for player 75482
wrote (0, 7) actions for player 67344
wrote (16, 7) actions for player 67339
wrote (0, 7) actions for player 75487
wrote (0, 7) actions for player 67364
wrote (12, 7) actions for player 79832
wrote (9, 7) actions for player 67368
wrote (0, 7) actions for player 75486
wrote (0, 7) actions for player 67347
wrote (0, 7) actions for player 67337
wrote (0, 7) actions for player 80073
wrote (13, 7) actions for player 67365
wrote (0, 7) actions for player 75481
wrote (0, 7) actions for player 67340
wrote (1, 7) actions for player 67369
done chunk 1
Thu Mar  5 18:55:20 2020
wrote (4, 7) actions for player 67348
wrote (5, 7) actions for player 67351
wrote (3, 7) actions for player 67346
wrote (8, 7) actions for player 67363
wrote (4, 7) actions for player 67361
wrote

In [0]:
#also define the score for each event in the game
events = pd.read_csv("/content/drive/My Drive/ssac_hackthon_2020/explore-shottracker/playbyplay/M_17472065-4ad8-11ea-9084-0242bdc61da9.csv", sep=",")


score_dict = {"STL":1, "TO":1, "FL":-1, "DEFENSIVE_REB":0.5, "OFFENSIVE_REB":0.2, 
              "FG2":2, "FG3":3, "PASS3_5":0.2, "PASS6":0.4, "PT0":0.2, "TRANSITION":1}
for key in score_dict.keys():
  events.loc[~np.array(events[key].isna()), "action_score"] = score_dict[key]
events.dropna(subset=['action_score'], inplace=True)

events.to_csv("/content/drive/My Drive/ssac_hackthon_2020/play_output_test/M_17472065-4ad8-11ea-9084-0242bdc61da9_playscores.tsv", sep="\t")


In [13]:
#now, score each player in this game
events = pd.read_csv("/content/drive/My Drive/ssac_hackthon_2020/play_output_test/M_17472065-4ad8-11ea-9084-0242bdc61da9_playscores.tsv", sep="\t")

ballloc = pd.read_csv("/content/drive/My Drive/ssac_hackthon_2020/explore-shottracker/timeseries/M_17472065-4ad8-11ea-9084-0242bdc61da9/M_17472065-4ad8-11ea-9084-0242bdc61da9_ballLocations.csv", sep=",")
ballloc_approx = ballloc.iloc[::10,:] #1 in 10 = 1 in 0.1s, to reduce the search space

for player in players:#for each player
  action_data_path = "/content/drive/My Drive/ssac_hackthon_2020/action_output_test/{0}allchunk.tsv".format(int(player))
  actions = pd.read_csv(action_data_path, sep="\t", index_col=0)
  
  #only if the player is relevant to any event:
  if sum(events.PlayerId==player)>0:
    #get the team ID for the player
    my_team_ID = events[events.PlayerId==player].TeamId.unique()[0]

    #filter to off-the-ball actions
    actions["on_the_ball"] = actions.apply(lambda l: annotate_ontheball(l["timestamp"], l["X"], l["Y"], ballloc_approx), axis=1)
    actions = actions[~actions.on_the_ball]
    #and get the score
    action_scores = score_all_action(events, actions, teamID=my_team_ID)
    #save it as a dataframe
    pd.Series(action_scores).to_csv("/content/drive/My Drive/ssac_hackthon_2020/play_output_test/action_scores_player{0}.tsv".format(player), sep="\t")
    print ("done scoring player {0}".format(player))
    print (tm.ctime())





done scoring player 67348
Thu Mar  5 19:03:27 2020
done scoring player 67351
Thu Mar  5 19:03:28 2020
done scoring player 67346
Thu Mar  5 19:03:29 2020
done scoring player 67363
Thu Mar  5 19:03:30 2020
done scoring player 67361
Thu Mar  5 19:03:30 2020
done scoring player 67342
Thu Mar  5 19:03:31 2020
done scoring player 67367
Thu Mar  5 19:03:31 2020
done scoring player 75482
Thu Mar  5 19:03:32 2020
done scoring player 67344
Thu Mar  5 19:03:32 2020
done scoring player 67339
Thu Mar  5 19:03:33 2020
done scoring player 79832
Thu Mar  5 19:03:34 2020
done scoring player 67368
Thu Mar  5 19:03:35 2020
done scoring player 67347
Thu Mar  5 19:03:35 2020
done scoring player 67337
Thu Mar  5 19:03:35 2020
done scoring player 67365
Thu Mar  5 19:03:36 2020
done scoring player 75481
Thu Mar  5 19:03:37 2020
done scoring player 67369
Thu Mar  5 19:03:38 2020


In [0]:
#now we explore these dat ain demo_data_exploration