<a href="https://colab.research.google.com/github/VegaSera/World-Archery-Analysis/blob/master/WA_Arrow_Averages_Qualifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import requests
from statistics import mean
import time

In [0]:
#Global Variables

#R, C = Recurve, Compound
#M, W = Men, Women
category = 'RW' #Can only be RM, CM, RW, CW
year_min = 2000 #Beginning of the range we want to look at
year_max = 2020 #End of the range we want to look at.

In [0]:
def make_competition_json(year): 
  #Creates a json file for a competition year
  #Output json file will include all competitions within a given year that have results in them.

  json_type = "COMPETITIONS"

  pathname = f'{json_type}'
  filename = f'{pathname}/COMP_{year}.json' #Dynamic naming

  if os.path.exists(filename): #Check to see if it already exists
    print('File ' + filename + ' already exists!')
  else:
    if not os.path.exists(pathname): #If the directory does not exist, then make it.
      os.makedirs(pathname)
    with open(filename, 'w') as f:
      request_data = requests.get(f'https://api.worldarchery.org/v3/COMPETITIONS/?StartDate={year}-01-01&EndDate={year}-12-31&WithResults=1&RBP=999999')
      json_data = request_data.json()
      json.dump(json_data, f)
      print("File " + filename + " has been created.")

In [0]:
def make_match_json(comp, year, cat=None): 
  #Creates a json file for an individual match.
  #Requires a competition id and a year, category is optional but recommended
  #Output json will include all individual matches for a given competition.

  json_type = "INDIVIDUALMATCHES" #This function is purely for individual matches.
  CatUrlCode = '' #Empty category code URL piece by default. Only gets filled in if a category code has been specified.
  if cat is not None: #Cat has been specified, and is preferred, but is not required. 
    if cat in ['CM', 'CW', 'RM', 'RW']: #This is only a small subsection of the 200 possible codes. We only care about Compound and Recurve for Men and Women.
      CatUrlCode = f'&CatCode={cat}'
    else: #Obligatory unexpected cat response.
      print('You gave me a weird cat. Category that is. CM, CW, RM, RW only.')

  pathname = f'{json_type}'
  filename = f'{pathname}/{year}_{comp}_{cat}.json'
  if os.path.exists(filename):
    print('File ' + filename + ' already exists!')
  else:
    if not os.path.exists(pathname): #If the directory does not exist, then make it.
      os.makedirs(pathname)

    request_data = requests.get(f'https://api.worldarchery.org/v3/INDIVIDUALMATCHES/?CompId={comp}{CatUrlCode}&RBP=999999')
    json_data = request_data.json()
    if sys.getsizeof(str(json_data)) < 200: #Checking for a nearly empty file.
      print (f"File {filename} - Rejected Due to File Size")
    else:
      with open(filename, 'w') as f:
        json.dump(json_data, f)
        print("File " + filename + " has been created.")

In [0]:
def make_qualifier_json(comp, id, year, cat):
  #Creates a json file for an individual competitor's qualifying arrows.
  #Requires a competition ID, a competitor ID, a year, and a category code
  #Output json will include qualifier information for that competitor at that competition

  json_type = 'QUALIFIERS'
  pathname = f'{json_type}/{year}'
  filename = f'{pathname}/{comp}_{id}_{cat}.json'

  if os.path.exists(filename):
    print('File ' + filename + ' already exists!')
  else:
    if not os.path.exists(pathname): #If the directory does not exist, then make it.
      os.makedirs(pathname)
    request_data = requests.get(f'https://api.worldarchery.org/v3/INDIVIDUALQUALIFICATIONARROWS/?CompId={comp}&Id={id}&CatCode={cat}&RBP=999999')
    json_data = request_data.json()
    if sys.getsizeof(str(json_data)) < 250: #Checking for a nearly empty file.
      print (f"File {filename}- Rejected Due to File Size")
    else:
      with open(filename, 'w') as f:
        json.dump(json_data, f)
        print(f"File {filename} has been created.")

In [0]:
def get_qual_avg(filename):
  #Returns the average arrow score during qualifiers.
  #Luckily we dont have to parse individual arrows, we only need the sum of the sum column.

  data = json.load(open(filename))
  df = json_normalize(data=data['items'])
  return (df['Score'].sum()/len(df))/36

In [0]:
def process_qual_averages(cat, year_min, year_max):
  #Full processing of our data into a single table.

  #Initializing our return dataframe
  df = pd.DataFrame(data=None, columns=['CompetitionID', 'CompetitorID', 'CompetitionDate', 'QArrAvg', 'Time_DoY', 'Time_M', 'Time_W']) 
  start_time = time.time()

  for year in range(year_min, year_max): #For every year in our year range
    make_competition_json(year)
    data = json.load(open(f'COMPETITIONS/COMP_{year}.json'))
    competition = pd.DataFrame(data['items'])
    #In our competition condition, we set 1-10 for all levels, 1-3 for World Cup/Olympics, and 4-10 for Non-world cup levels.
    condition = ((competition['ComLevel'].isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) & (~competition['ComSubLevel'].isin([7, 8, 14, 15, 16, 17, 35, 36, 51, 52]))) #Removes all of the field/indoor competitions
    print(sys.getsizeof(str(competition[condition])))
    comp_Data = competition[condition]

    for comp_id in comp_Data['ID']: #For every competition in our yearly competition data
      CompDate = comp_Data[comp_Data['ID'] == comp_id].DFrom.to_string(index=False).strip() #Pulls the date from the competition data.
      CompDate = pd.to_datetime(CompDate)
      Time_DoY = CompDate.dayofyear
      Time_M = CompDate.month
      Time_W = CompDate.weekofyear

      make_match_json(comp_id, year, cat)
      match_file = f'INDIVIDUALMATCHES/{year}_{comp_id}_{cat}.json'
      if os.path.exists(match_file):
        match_data = json.load(open(match_file))
        match_df = json_normalize(data=match_data['items'], record_path='Matches')

        #Getting unique athlete IDs for this match
        athlete1 = match_df['Competitor1.Athlete.Id'].unique()
        athlete2 = match_df['Competitor2.Athlete.Id'].unique()
        athletes = np.append(athlete1, athlete2)
        athletes = np.unique(athletes)

        for Ath_ID in athletes: #For every athlete we have from the match
          if int(Ath_ID) > 0:
            make_qualifier_json(comp_id, Ath_ID, year, cat)
            qual_file = f'QUALIFIERS/{year}/{comp_id}_{Ath_ID}_{cat}.json'
            if os.path.exists(qual_file):
              AvgQual = get_qual_avg(qual_file)
              entry = {'CompetitionID':comp_id, 'CompetitorID':Ath_ID, 'CompetitionDate':CompDate, 'QArrAvg':AvgQual,"Time_DoY":Time_DoY, "Time_M":Time_M, 'Time_W':Time_W}
              df = df.append(entry, ignore_index=True)

  print(f'Time of operation - {time.time() - start_time}')
  return df


In [0]:
#This runs for a long time. Uncomment at your peril
#This calls our function to call and process all of our qualifier data into a single dataframe.

#v = process_qual_averages(category, year_min, year_max)
#v

In [9]:
#data = json.load(open('QUALIFIERS/2012/433_8239_RW.json'))
data = requests.get('https://api.worldarchery.org/v3/INDIVIDUALQUALIFICATIONARROWS/?CompId=428&Id=7929&CatCode=RW')
json_data = data.json()
#df = json_normalize(data=data['items'])
#(df['Score'].sum()/len(df))/36
sys.getsizeof(str(json_data))

231

In [0]:
#v.to_csv("2000_2019_RW_ama.csv")

In [0]:
#!rm -rf QUALIFIERS/

In [0]:
#v.Time_M.value_counts()

In [0]:
#v1 = v[v['Time_M'] > 1]

In [0]:
import plotly.express as px

#fig = px.box(v1[v1['QArrAvg']>5], x='Time_M' ,y="QArrAvg", notched=True)
#fig.show()

In [0]:
#Pulling data from the API takes forever
#I have uploaded the collected qualifier data to github to work with.
rm_all = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RM.csv')
rm_wc = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RM_trim.csv')
rw_all = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RW.csv')
rw_wc = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RW_trim.csv')
rm_am = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RM_ama.csv')
rw_am = pd.read_csv('https://raw.githubusercontent.com/VegaSera/World-Archery-Analysis/master/WADatasets/QualifierArrAvg/2000_2019_RW_ama.csv')

In [0]:
rm_all = rm_all[rm_all['Time_M']>1]
rm_wc = rm_wc[rm_wc['Time_M']>1]
rm_am = rm_am[rm_am['Time_M']>1]

In [17]:
print(f'Recurve men: All - {len(rm_all)}')
print(f'Recurve men: World Cup - {len(rm_wc)}')
print(f'Recurve men: Non-world cup - {len(rm_am)}')
print(f'Recurve women: All - {len(rw_all)}')
print(f'Recurve women: World Cup - {len(rw_wc)}')
print(f'Recurve women: Non-world cup - {len(rw_am)}')

Recurve men: All - 4750
Recurve men: World Cup - 3307
Recurve men: Non-world cup - 3140
Recurve women: All - 4089
Recurve women: World Cup - 2887
Recurve women: Non-world cup - 2489


In [34]:
#Recurve men, all levels
import plotly.graph_objects as go
fig = px.box(rm_all[rm_all['QArrAvg']>7.9], x='Time_M' ,y="QArrAvg", notched=True, 
             labels={'QArrAvg':'Qualifier Arrow Average', 'Time_M':'Month'}, 
             title='Recurve Men - All Levels')
fig.show()

In [26]:
#Recurve men, World Cup and Olympics
fig = px.box(rm_wc[rm_wc['QArrAvg']>6], x='Time_M' ,y="QArrAvg", notched=True)

fig.show()

In [36]:
#Recurve men, Non-world cup/olympics
fig = px.box(rm_am[rm_am['QArrAvg']>6], x='Time_M' ,y="QArrAvg", notched=True,
             labels={'QArrAvg':'Qualifier Arrow Average', 'Time_M':'Month'}, 
             title='Recurve Men - Non-World Level Competitions')

fig.show()

In [0]:
rw_all = rw_all[rw_all['Time_M']>1]
rw_wc = rw_wc[rw_wc['Time_M']>1]
rw_am = rw_am[rw_am['Time_M']>1]

In [35]:
#Recurve women, all levels
fig = px.box(rw_all[rw_all['QArrAvg']>7], x='Time_M' ,y="QArrAvg", notched=True,
             labels={'QArrAvg':'Qualifier Arrow Average', 'Time_M':'Month'},
             title='Recurve Women - All Levels')
fig.show()

In [23]:
#Recurve women, World Cup and Olympics
fig = px.box(rw_wc[rw_wc['QArrAvg']>7], x='Time_M' ,y="QArrAvg", notched=True)
fig.show()

In [39]:
#Recurve women, Non-world cup and olympics
fig = px.box(rw_am[rw_am['QArrAvg']>6], x='Time_M' ,y="QArrAvg", notched=True,
             labels={'QArrAvg':'Qualifier Arrow Average', 'Time_M':'Month'}, 
             title='Recurve Women - Non-World Level Competitions')
fig.show()

The supposed seasonal bias I was hunting down seems to ***only exist in the World Cup.***


In [25]:
data = json.load(open(f'COMPETITIONS/COMP_2013.json'))
competition = pd.DataFrame(data['items'])
condition = ((competition['ComLevel'].isin([1, 2, 3])) & (~competition['ComSubLevel'].isin([7, 8, 14, 15, 16, 17, 35, 36, 51, 52])))
competition[condition]



FileNotFoundError: ignored

In [0]:
def compile_world_cup():
  data = json.load(open(f'COMPETITIONS/COMP_2013.json'))
  df = pd.DataFrame(data['items'])
  condition = ((df['ComLevel'].isin([1, 2, 3])) & (~df['ComSubLevel'].isin([7, 8, 14, 15, 16, 17, 32, 35, 36, 51, 52])))
  df = df[condition]
  for year in range(2000, 2020):
    make_competition_json(year)
    data = json.load(open(f'COMPETITIONS/COMP_{year}.json'))
    competition = pd.DataFrame(data['items'])
    condition = ((competition['ComLevel'].isin([1, 2, 3])) & (~competition['ComSubLevel'].isin([7, 8, 14, 15, 16, 17, 32, 35, 36, 51, 52])))
    df = df.append(competition[condition])
  return df

In [0]:
df2 = compile_world_cup()
df2['DFrom'] = pd.to_datetime(df2['DFrom'], infer_datetime_format=True)

df2.loc[(df2['DFrom'].dt.month==7)]

In [0]:
from statistics import mean

def avg_comp_arr(comp_id):
  Qual_Avgs= []
  year = 0000
  cat = category
  make_match_json(comp_id, year=year, cat=category)
  match_file = f'INDIVIDUALMATCHES/{year}_{comp_id}_{cat}.json'
  if os.path.exists(match_file):
    match_data = json.load(open(match_file))
    match_df = json_normalize(data=match_data['items'], record_path='Matches')

    #Getting unique athlete IDs for this match
    athlete1 = match_df['Competitor1.Athlete.Id'].unique()
    athlete2 = match_df['Competitor2.Athlete.Id'].unique()
    athletes = np.append(athlete1, athlete2)
    athletes = np.unique(athletes)

    for Ath_ID in athletes:
      if int(Ath_ID) > 0:
        make_qualifier_json(comp_id, Ath_ID, year, cat)
        qual_file = f'QUALIFIERS/{year}/{comp_id}_{Ath_ID}_{cat}.json'
        if os.path.exists(qual_file):
          AvgQual = get_qual_avg(qual_file)
          Qual_Avgs.append(AvgQual)
  print(Qual_Avgs)
  if len(Qual_Avgs) > 0:
    return mean(Qual_Avgs)
  else:
    return np.NaN

In [0]:
vi = []
for id in df2['ID']:
  vi.append(avg_comp_arr(id))
vi

In [0]:
df2['Avg'] = vi
df2.head(50)

In [0]:
df3 = df2[df2['Avg'].notnull()]
df3.head(100)

In [0]:
df4 = df3[df3['DFrom'].dt.month == 7]
df4