## Scrape Athlete List

This file pulls lists of athletes and their schools from www.athletic.net for a particular event.

Another ipynb file will be used to take this list of athletes and pull their detailed information for each event.

Note:  The data pulled is for 2A, 3A & 4A athletic divisions in all of Washington state from 2006-2017.

In [6]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from time import sleep

In [42]:
# Select the Event
# will be used to determine the url to scrape

#Event = "&Event=21" #girl 400
#Event_file = 'girl400'
#Event = "&Event=3"   #boy 400
#Event_file = 'boy400'

#Event = "&Event=53" #girl 1600
#Event_file = 'boy1600'
Event = "&Event=52" #boy 1600
Event_file = 'boy1600'

#Event = "&Event=36"  #girl Triple Jump
#Event_file = 'girlTripleJump'

In [39]:
# page number ranges each year to pull data
# pulls data from all districts and subdistricts (easier to pull duplicates and remove later)
# will be used to determine the url to scrape

district_ranges = ([[12173,12236],
                   [19810,19875],
                   [30600,30666],
                   [34898,34960],
                   [45786,45857],
                   [62303,62378],
                   [73261,73336],
                   [82437,82508],
                   [91690,91760]])

In [40]:
# loop through the HTML pages for each district (and subdistricts) and
# pull the athlete names, school, etc.

# it's easier to loop districts and subdistricts but this does produce duplicate
# data which is removed later

def read_div_year(Div,Div_id,Event):
    id_key_string =     "../Athlete.aspx?AID="
    school_key_string = "../School.aspx?SchoolID="

    athlete_list_url="https://www.athletic.net/TrackAndField/Division/Event.aspx?DivID="+Div_id+Event

    response = requests.get(athlete_list_url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    cols = ['ID','Name','School_ID','School','Division']
    athlete_list = pd.DataFrame(columns=cols)

    tables=soup.find_all("table")
     
    for i in tables:
        for row in i:
            try:
                str_row = str(row)
                id_index = str_row.find(id_key_string)
                if id_index != -1:
                    # extract the athlete ID
                    str_row = str_row[id_index+20:]            
                    id_str = str_row[0:8]
                    id_str = id_str.replace('\"',"")
                    id_str = id_str.replace('>','')
                    #ID = int(id_str)
                    # extract the athlete Name
                    name_index = str_row.find(">") + 1
                    str_row = str_row[name_index:]
                    name_index = str_row.find("<")
                    Name = str_row[0:name_index] # save this data
                    # extract the athlete school
                    school_index = str_row.find(school_key_string)
                    str_row = str_row[school_index+24:]
                    school_index = str_row.find('\">')
                    school_id = int(str_row[0:school_index]) # save this data
                    str_row = str_row[school_index+2:]
                    school_index = str_row.find("<")
                    school_name = str_row[0:school_index] #save this data
                    if id_str.isdigit():
                        ID = int(id_str)
                        athlete_list = athlete_list.append({'ID': ID, 'Name': Name, 'School_ID': school_id, 'School': school_name, 'Division': Div}, ignore_index=True)
            except ValueError as ve:
                print(f'Encountered error {ve} for athlete')
                print(Name)
    athlete_list = athlete_list.set_index(['ID'])
    return athlete_list


In [None]:
#loop through ranges of district data for each year
# Create a long list of athletes doing this Event

all_athlete_list = pd.DataFrame()
for dist in district_ranges:
    start = dist[0]
    end = dist[1]
    print("starting new year",start,end)

    for d in range(start,end):
        new_list = read_div_year(0,str(d),Event)
        all_athlete_list = all_athlete_list.append(new_list)

starting new year 12173 12236


In [29]:
# remove duplicate athletes in the list
all_athlete_list.drop_duplicates(inplace=True)
all_athlete_list.shape

(4371, 4)

In [30]:
all_athlete_list.head()

Unnamed: 0_level_0,Name,School_ID,School,Division
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
216723,Katharine Lotze,377,Kiona-Benton,0
187860,Liz Cobb,387,Chewelah,0
359942,Lexi Petersen,312,Seattle Christian,0
334500,MacKenzie Altig,301,Bellevue Christian,0
157130,Claire Boutillier,378,Naches Valley,0


Correlate the district number to the athlete

In [31]:
# read file with schools for each district
district_list = pd.read_csv('../WA_athletic_districts1.csv')

all_athlete_list['District'] = np.nan  # for easier filtering out of unfound schools

for index, athlete in all_athlete_list.iterrows():
    School = athlete['School']
    
    for district in district_list: 
        for item in district_list[district]:
            if item == School:
                all_athlete_list.District[index] = district


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [32]:
all_athlete_list.head()

Unnamed: 0_level_0,Name,School_ID,School,Division,District
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
216723,Katharine Lotze,377,Kiona-Benton,0,District 5
187860,Liz Cobb,387,Chewelah,0,
359942,Lexi Petersen,312,Seattle Christian,0,District 3
334500,MacKenzie Altig,301,Bellevue Christian,0,District 3
157130,Claire Boutillier,378,Naches Valley,0,District 5


In [33]:
# How many unique schools are on the list
print("number of athletes ",all_athlete_list.shape)
print("Unique Schools ",all_athlete_list['School'].drop_duplicates().shape)

number of athletes  (4371, 5)
Unique Schools  (275,)


In [34]:
# Remove rows with incomplete school information
print("number of athletes before removing incompletes ",all_athlete_list.shape)
print("number of athletes after removing incompletes ",all_athlete_list.dropna().shape)

all_athlete_list.dropna(inplace=True)

number of athletes before removing incompletes  (4371, 5)
number of athletes after removing incompletes  (3587, 5)


In [35]:
# write to file
all_athlete_list.to_csv('../athlete_list_'+Event_file+'.csv')