## Scraping skatingscores.com

The purpose of this notebook is to test and then sucesfully scrape https://skatingscores.com/. Practically, I could ask for the data, but because this is a learning project we will scrape assuming we couldn't get it.

In [34]:
#pip install lxml html5lib beautifulsoup4
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

Better data of what I want
- each skater can be described by a set of 11 tables


https://skatingscores.com/2021/wc/sr/women/long/

Called the "Enhanced Protocol" according to skatingscores

In [2]:
# #world championship 2021, women's long
# wc2021_wl = scrapeEvent(website = 'https://skatingscores.com/2021/wc/sr/women/long/')
# wc2021_wl

In [139]:
class scrapeEvent:
    """
    Class for scraping the "Enhanced Protocol" scoresheet from skatingscores.com, initates by recording all skaders
    """
    def __init__(self, website, event_name):

        self.event_name = event_name
        self.dfs = pd.read_html(website)

        self.start = 0
        self.n = len(self.dfs)
        self.stop = self.n - 2
        self.sets = []

        start = self.start
        stop = self.stop

        all_skaters = []
        while start <= stop:
            left = start
            start += 11
            right = start

            #bio is located in first dataframe in sets of 11
            raw_bio = self.dfs[left:right][1].iloc[:, 1:] 
            raw_bio.columns = raw_bio.iloc[0,:]
            bio = raw_bio.iloc[1:,:].rename(columns={raw_bio.columns[0]: "Name", raw_bio.columns[1]: "Country"})
            all_skaters.append(bio)

        self.all_skaters = pd.concat(all_skaters, axis = 0).reset_index(drop = True)

    def get_component_score(self, drop_rank = True):
        """
        Aggregates all component scores by skater 
        """
        start = self.start
        stop = self.stop

        all_components = []
        skater_count = 0

        while start <= stop:
            left = start
            start += 11
            right = start

            #bio is located in TENTH dataframe in sets of 11
            raw_component = self.dfs[left:right][10].iloc[:, 1:] 
            raw_component.columns = raw_component.iloc[0,:]
            raw_component = raw_component.loc[:,~raw_component.columns.duplicated()]

            component = raw_component.iloc[1:-1, :-1]
            component.index = component['Component']
            component.drop(columns = 'Component', inplace = True)

            component.insert(0, 'Name', self.all_skaters['Name'][skater_count])
            component.insert(1, 'Country', self.all_skaters['Country'][skater_count])
            skater_count += 1     

            all_components.append(component)

        all_components = pd.concat(all_components, axis = 0)
        for c in all_components.loc[:, ~(all_components.columns.isin(['NAME', 'COUNTRY', 'Factor']))].columns:
            all_components[c] = all_components[c].str.split(' ').str[1]

        all_components['Event'] = self.event_name
        self.all_components = all_components



    def get_technical_score(self):
        """
        Aggregates all technical scores by skater
        """
        start = self.start
        stop = self.stop

        all_technical = []
        skater_count = 0

        while start <= stop:
            left = start
            start += 11
            right = start
        
         #bio is located in THIRD dataframe in sets of 11
            raw_technical = self.dfs[left:right][3].iloc[:, 1:] 
            raw_technical = raw_technical.dropna(axis = 1)
            raw_technical.columns = raw_technical.iloc[0,:] #use 1st row as column
            technical = raw_technical.iloc[1:-1, :] # remove last row which contains column sums and the first row containing columns names

            technical.insert(0, 'Name', self.all_skaters['Name'][skater_count])
            technical.insert(1, 'Country', self.all_skaters['Country'][skater_count])
            skater_count += 1     

            all_technical.append(technical)

        all_technical = pd.concat(all_technical, axis = 0).reset_index(drop = True)
        for i in all_technical.columns[3:]:
            all_technical[i] = all_technical[i].astype(float, errors = 'ignore')

        all_technical['Event'] = self.event_name
        self.all_technical = all_technical

wc2021_wl = scrapeEvent(website = 'https://skatingscores.com/2122/crncup/sr/women/long/', event_name = "World Championship Women")

In [140]:
wc2021_wl

<__main__.scrapeEvent at 0x7f99255c48d0>

In [141]:
wc2021_wl.get_component_score()
wc2021_wl.all_components

Unnamed: 0_level_0,Name,Country,Factor,J1 🇲🇽,J2 🇹🇷,J3 🇨🇦,J4 🇺🇸,J5 🇸🇪,Event
Component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Skating Skills,LIU,,1.60,7.50,7.25,8.50,7.75,7.75,World Championship Women
Transitions,LIU,,1.60,7.50,7.00,8.00,8.25,8.00,World Championship Women
Performance,LIU,,1.60,7.25,7.50,8.50,8.50,8.50,World Championship Women
Composition,LIU,,1.60,8.00,7.00,8.50,9.00,8.25,World Championship Women
Interpretation,LIU,,1.60,8.00,7.25,8.25,8.75,8.25,World Championship Women
...,...,...,...,...,...,...,...,...,...
Skating Skills,MONTESINOS,,1.60,6.50,5.75,5.50,4.75,4.75,World Championship Women
Transitions,MONTESINOS,,1.60,6.25,5.50,4.50,4.00,4.25,World Championship Women
Performance,MONTESINOS,,1.60,5.75,5.50,4.25,4.25,4.50,World Championship Women
Composition,MONTESINOS,,1.60,6.75,5.50,4.75,4.50,4.25,World Championship Women


In [18]:
wc2021_wl.get_technical_score()
wc2021_wl.all_technical

Unnamed: 0,Name,Country,Element,BaseValue,GOETotal,HKG 🇭🇰,CZE 🇨🇿,SWE 🇸🇪,RUS 🇷🇺,SLO 🇸🇮,SVK 🇸🇰,DEN 🇩🇰,USA 🇺🇸,NED 🇳🇱,Event
0,Alexandra TRUSOVA,RUS,4F!,11.00,1.57,3,1,2,1,2,1,1,1,2,World Championship Women
1,Alexandra TRUSOVA,RUS,4S,9.70,-4.71,-5,-5,-5,-5,-5,-5,-4,-5,-4,World Championship Women
2,Alexandra TRUSOVA,RUS,4Lz+3T,15.70,3.78,4,3,3,4,4,3,3,3,3,World Championship Women
3,Alexandra TRUSOVA,RUS,2A,3.30,0.71,2,2,2,3,3,2,2,2,2,World Championship Women
4,Alexandra TRUSOVA,RUS,CCoSp4,3.50,0.55,1,2,0,1,2,1,2,2,2,World Championship Women
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,Jenni SAARINEN,FIN,3T<<,1.43,-0.65,-5,-5,-5,-5,-5,-5,-5,-5,-5,World Championship Women
284,Jenni SAARINEN,FIN,ChSq121,3.00,0.50,0,1,1,2,1,0,2,1,1,World Championship Women
285,Jenni SAARINEN,FIN,3S<+REP,2.65,-1.72,-5,-5,-5,-5,-5,-5,-5,-5,-5,World Championship Women
286,Jenni SAARINEN,FIN,StSq224,2.60,-1.15,-5,-5,-5,-3,-4,-5,-5,-4,-3,World Championship Women


todo

-element needs to be cleaned up because it contains annotations (ex: Elemeent = 3Aq which denotes a triple axel, but something is wrong with it)

In [None]:
# dfs = pd.read_html('https://skatingscores.com/2021/wc/sr/women/long/')
# for i in np.arange(13):
#     print(i)
#     display(dfs[i].head(), dfs[i].shape)

## Scraping the events page

In [90]:
events2122_raw = pd.read_html('https://skatingscores.com/2122/')[1]
events2122_raw.columns = ['Country', 'Event_Abbreviation', 'Event_Name', 'Event_Begin']
events2122_raw

Unnamed: 0,Country,Event_Abbreviation,Event_Name,Event_Begin
0,🇺🇸,CRNCUP,Cranberry Cup,"Aug 11, 2021"
1,🇺🇸,LPIDI,Lake Placid Ice Dance Intl,"Aug 12, 2021"
2,🇫🇷,JGPFRA,JGP de Courchevel I,"Aug 18, 2021"
3,🇨🇦,JGPCAN,JGP Canada,"Aug 25, 2021"
4,🇫🇷,JGPFRA2,JGP de Courchevel II,"Aug 25, 2021"
...,...,...,...,...
118,🇧🇬,WJC,World Jr Championships,"Mar 07, 2022"
119,🇱🇺,COUPRI,Coupe du Printemps,"Mar 18, 2022"
120,🇸🇮,TRITRO,Triglav Trophy,"Mar 18, 2022"
121,🇫🇷,WC,World Championships,"Mar 21, 2022"


In [91]:
## CODE CREDIT GOES TO https://stackoverflow.com/questions/56757261/extract-href-using-pandas-read-html, the work below has been modified and is not fully my own

url = 'https://skatingscores.com/2122/'
df = pd.read_html(url)[1]

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
soup.prettify()
tables = soup.find_all("tbody")

links = []
for tr in tables[1].findAll("tr"):
    trs = tr.findAll("td")
    for each in trs:
        try:
            link = each.find('a')['href']
            links.append(link)
        except:
            pass
links_to_events = [url + i[6:] for i in links]
abbrv_of_valid_events = [i[len(url):-1].upper() for i in links_to_events] # some events were cancelled and are not pulled via this method above
links_to_completed_events = pd.DataFrame()
links_to_completed_events['Abbreviation'] = abbrv_of_valid_events
links_to_completed_events['URL'] = links_to_events
links_to_completed_events

Unnamed: 0,Abbreviation,URL
0,CRNCUP,https://skatingscores.com/2122/crncup/
1,LPIDI,https://skatingscores.com/2122/lpidi/
2,JGPFRA,https://skatingscores.com/2122/jgpfra/
3,JGPFRA2,https://skatingscores.com/2122/jgpfra2/
4,JGPSVK,https://skatingscores.com/2122/jgpsvk/
...,...,...
82,NATUSA,https://skatingscores.com/2122/natusa/
83,NATUSA,https://skatingscores.com/2122/natusa/
84,NATCAN,https://skatingscores.com/2122/natcan/
85,NATKOR,https://skatingscores.com/2122/natkor/


In [94]:
events_2122 = pd.merge(left = events2122_raw, right = links_to_completed_events, left_on = 'Event_Abbreviation', right_on = 'Abbreviation', how = 'inner')
events_2122 

Unnamed: 0,Country,Event_Abbreviation,Event_Name,Event_Begin,Abbreviation,URL
0,🇺🇸,CRNCUP,Cranberry Cup,"Aug 11, 2021",CRNCUP,https://skatingscores.com/2122/crncup/
1,🇺🇸,LPIDI,Lake Placid Ice Dance Intl,"Aug 12, 2021",LPIDI,https://skatingscores.com/2122/lpidi/
2,🇫🇷,JGPFRA,JGP de Courchevel I,"Aug 18, 2021",JGPFRA,https://skatingscores.com/2122/jgpfra/
3,🇫🇷,JGPFRA2,JGP de Courchevel II,"Aug 25, 2021",JGPFRA2,https://skatingscores.com/2122/jgpfra2/
4,🇸🇰,JGPSVK,JGP Slovakia,"Sep 01, 2021",JGPSVK,https://skatingscores.com/2122/jgpsvk/
...,...,...,...,...,...,...
99,🇨🇦,NATCAN,Canadian Nationals,"Jan 07, 2022",NATCAN,https://skatingscores.com/2122/natcan/
100,🇰🇷,NATKOR,Korean Nationals,"Jan 07, 2022",NATKOR,https://skatingscores.com/2122/natkor/
101,🇰🇷,NATKOR,Korean Nationals,"Jan 07, 2022",NATKOR,https://skatingscores.com/2122/natkor/
102,🇰🇷,NATKOR,Korean Nationals,"Jan 07, 2022",NATKOR,https://skatingscores.com/2122/natkor/


<generator object DataFrame.iterrows at 0x7f99258383d0>

In [154]:
ALL_COMPONENTS = []
'https://skatingscores.com/2122/crncup/'
default = 'sr/women/long/'


for index, row in events_2122[0:1][['Event_Abbreviation','URL']].iterrows():
    try:
        event = scrapeEvent(event_name=row['Event_Abbreviation'],website=row['URL'] + default)
        event.get_component_score()
        ALL_COMPONENTS.append(event.all_components)
    except Exception as e:
        print("blah")
        pass

https://skatingscores.com/2122/crncup/sr/women/long/


In [155]:
ALL_COMPONENTS

[0                     Name  Country Factor J1 🇲🇽 J2 🇹🇷 J3 🇨🇦 J4 🇺🇸 J5 🇸🇪  \
 Component                                                                  
 Skating Skills         LIU      NaN   1.60  7.50  7.25  8.50  7.75  7.75   
 Transitions            LIU      NaN   1.60  7.50  7.00  8.00  8.25  8.00   
 Performance            LIU      NaN   1.60  7.25  7.50  8.50  8.50  8.50   
 Composition            LIU      NaN   1.60  8.00  7.00  8.50  9.00  8.25   
 Interpretation         LIU      NaN   1.60  8.00  7.25  8.25  8.75  8.25   
 ...                    ...      ...    ...   ...   ...   ...   ...   ...   
 Skating Skills  MONTESINOS      NaN   1.60  6.50  5.75  5.50  4.75  4.75   
 Transitions     MONTESINOS      NaN   1.60  6.25  5.50  4.50  4.00  4.25   
 Performance     MONTESINOS      NaN   1.60  5.75  5.50  4.25  4.25  4.50   
 Composition     MONTESINOS      NaN   1.60  6.75  5.50  4.75  4.50  4.25   
 Interpretation  MONTESINOS      NaN   1.60  6.50  5.25  5.00  4.25  4.00   