## Scraping skatingscores.com

The purpose of this notebook is to test and then successfully scrape https://skatingscores.com/. In practice I could simply ask for the data, but because this is a learning project we scrape it as if asking were not an option.

In [13]:
from scrapeEnhancedProtocol import *
from scrapeEventsPage import * 

In [14]:
# Scrape the 2021-22 season index page; the resulting events table is read
# further down through its 'Event_Abbreviation' and 'URL' columns.
SEASON_2122_URL = 'https://skatingscores.com/2122/'

season_2122 = scrapeEvents(url = SEASON_2122_URL)
season_2122.scrape()
events_2122 = season_2122.events

In [15]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import time

from technical import *

import warnings
warnings.filterwarnings("ignore")

class scrapeEvent:
    """
    Scrape one "Enhanced Protocol" scoresheet page from skatingscores.com.

    ``pd.read_html`` returns the page as a flat list of tables. That list is read in
    consecutive blocks of ``program_len`` tables, one block per skater, and a fixed
    position inside every block is used for each kind of table (bio, technical,
    component). Creating the object downloads the page and records all skaters.

    Parameters
    ----------
    website : URL (or anything ``pd.read_html`` accepts) of the scoresheet page.
    event_name : label written onto every frame this object produces.
    long : True for a long program (blocks of 11 tables), False for a short
        program (blocks of 6 tables).

    Attributes
    ----------
    dfs : list of DataFrames returned by ``pd.read_html``.
    all_skaters : one frame built from every bio table; its first two columns are
        renamed ``Name`` and ``Country`` and an ``Event_Name`` column is added.
    all_components : set by ``get_component_score``.
    all_technical : set by ``get_technical_score``.

    Raises
    ------
    ValueError : when the page yields no skater block at all.
    IndexError : when a block is too short to contain the requested table.
    """

    # Position of each table of interest inside one skater's block of tables.
    _BIO_POS = 1
    _TECHNICAL_POS = 3
    # NOTE(review): position 10 only exists in an 11-table (long program) block, so
    # get_component_score raises IndexError when long=False. The right position for
    # short programs has to be confirmed against a real short-program page.
    _COMPONENT_POS = 10

    def __init__(self, website, event_name, long = True):

        self.event_name = event_name
        self.dfs = pd.read_html(website)

        self.start = 0
        self.n = len(self.dfs)
        # stop is the last index at which a new block may begin
        self.stop = self.n - 2
        self.sets = []

        # Tables per skater block. Original author's note: long program = 12 elements, short = 7.
        self.program_len = 11 if long else 6

        bios = []
        for raw_bio in self._tables_at(self._BIO_POS):
            # the first row of the table holds the column names
            raw_bio.columns = raw_bio.iloc[0, :]
            bio = raw_bio.iloc[1:, :].rename(
                columns={raw_bio.columns[0]: "Name", raw_bio.columns[1]: "Country"}
            )
            bios.append(bio)

        if not bios:
            # pd.concat would raise a ValueError here anyway; say what actually went wrong
            raise ValueError(f"no skater tables found for {event_name!r} at {website}")

        all_skaters = pd.concat(bios, axis = 0).reset_index(drop = True)
        all_skaters['Event_Name'] = event_name
        # NOTE(review): the get_*_score methods look skaters up by block number, which
        # assumes every bio table contributes exactly one row here - confirm.
        self.all_skaters = all_skaters

        time.sleep(0.33) # be nice to the generous server admins!

    def _tables_at(self, position):
        """Yield, for each skater block, a copy of the table at ``position`` without its first column."""
        left = self.start
        while left <= self.stop:
            right = left + self.program_len
            block = self.dfs[left:right]
            left = right
            # Copy so that later edits are made on an independent frame, never on a
            # slice of self.dfs (avoids chained-assignment problems).
            yield block[position].iloc[:, 1:].copy()

    def get_component_score(self, drop_rank = True):
        """
        Aggregate the component-score tables of all skaters into ``self.all_components``.

        The resulting frame is indexed by the ``Component`` column of the scraped
        tables and carries ``Name``, ``Country`` and ``Event`` columns next to the
        scraped ones. The last row and the last column of every scraped table are
        discarded, as are columns whose header repeats an earlier one.

        Parameters
        ----------
        drop_rank : when True (default), every cell outside ``Name``, ``Country`` and
            ``Factor`` is split on a space and only the second token is kept
            (presumably the cells read "<rank> <score>" - TODO confirm). When False
            the cells are kept exactly as scraped.
        """
        frames = []

        for skater_idx, raw_component in enumerate(self._tables_at(self._COMPONENT_POS)):
            raw_component.columns = raw_component.iloc[0, :]  # first row holds the column names
            raw_component = raw_component.loc[:, ~raw_component.columns.duplicated()]

            # drop the header row, the last row and the last column, then index by component
            component = raw_component.iloc[1:-1, :-1].set_index('Component')

            component.insert(0, 'Name', self.all_skaters['Name'][skater_idx])
            component.insert(1, 'Country', self.all_skaters['Country'][skater_idx])

            frames.append(component)

        all_components = pd.concat(frames, axis = 0)

        if drop_rank:
            is_score = ~all_components.columns.isin(['Name', 'Country', 'Factor'])
            for c in all_components.columns[is_score]:
                all_components[c] = all_components[c].str.split(' ').str[1]

        all_components['Event'] = self.event_name
        self.all_components = all_components

    def get_technical_score(self):
        """
        Aggregate the technical-score tables of all skaters into ``self.all_technical``.

        Columns containing any missing value are dropped from each scraped table, as
        are its header row and its last row (which contains the column sums). Every
        column after the third is converted to float where possible and left
        unchanged otherwise.

        Element notation, source: [Wikipedia](https://en.wikipedia.org/wiki/ISU_Judging_System)
        - `*`: element exceeds allotted amount, receives GOE of 0
        - `+REP`: denotes a solo jump that has been performed twice, receiving 70% base value
        - `x`: denotes a 10% bonus halfway after the program
        - `!`: unclear takeoff edge with deduction of [-1,-2] on GOE
        - `e`: incorrect takeoff edge (i.e. Lutz with inside edge)
        - `<`: denotes under-rotation beginning on ice with [-1/4, -1/2] rotation happening before the jump
        - `<<`: denotes under-rotation beginning on ice greater than -1/2 rotation happening before the jump; jump is downgraded one jump
        """
        frames = []

        for skater_idx, raw_technical in enumerate(self._tables_at(self._TECHNICAL_POS)):
            raw_technical = raw_technical.dropna(axis = 1)
            raw_technical.columns = raw_technical.iloc[0, :]  # first row holds the column names

            # remove the header row and the last row, which contains the column sums
            technical = raw_technical.iloc[1:-1, :].copy()

            technical.insert(0, 'Name', self.all_skaters['Name'][skater_idx])
            technical.insert(1, 'Country', self.all_skaters['Country'][skater_idx])
            technical['Event'] = self.event_name
            #technical = deepen_technical(technical)

            frames.append(technical)

        all_technical = pd.concat(frames, axis = 0).reset_index(drop = True)
        for col in all_technical.columns[3:]:
            all_technical[col] = all_technical[col].astype(float, errors = 'ignore')

        self.all_technical = all_technical


In [16]:
## LONG
DIVISION = ['sr']
SEX = ['women', 'men']
PROGRAM = ['long']

def main(event_name, website):
    """
    Scrape one long-program page and add its results to the LONG_ALL_* lists.

    Parameters
    ----------
    event_name : label passed to ``scrapeEvent`` and stored on its frames.
    website : URL of the long-program scoresheet page.

    Any exception raised while scraping propagates to the caller, and in that
    case nothing is appended to any of the three lists.
    """
    scraped = scrapeEvent(event_name = event_name, website = website, long = True)

    # Run both extractions before appending anything, so a failure in either one
    # cannot leave the three result lists with different lengths.
    scraped.get_component_score()
    scraped.get_technical_score()

    LONG_ALL_COMPONENTS.append(scraped.all_components)
    LONG_ALL_TECHNICAL.append(scraped.all_technical)
    LONG_ALL_SKATER.append(scraped.all_skaters)


# Result accumulators filled by main(), one entry per successfully scraped page.
LONG_ALL_COMPONENTS = []
LONG_ALL_TECHNICAL = []
LONG_ALL_SKATER = []

# Only the first 10 events of the season table are scraped here.
for index, row in events_2122[0:10][['Event_Abbreviation','URL']].iterrows():
    for d in DIVISION:
        for s in SEX:
            for p in PROGRAM:
                ending = f'{d}/{s}/{p}'
                try:
                    main(event_name = row['Event_Abbreviation'], website = row['URL'] + ending)
                except Exception as e:
                    # errors can occur because the event got cancelled or because the
                    # event may be juniors only (out of scope); report the reason
                    # instead of discarding it so real scraping bugs stay visible
                    print(row['Event_Abbreviation'], f"{s}, {p} not scrapable ({type(e).__name__}: {e})")
                finally:
                    # pause after failed requests too
                    time.sleep(0.33) # be nice to the generous server admins!
                

SKTHEL women, long not scrapable
SKTHEL men, long not scrapable
LPIDI women, long not scrapable
LPIDI men, long not scrapable
JGPFRA women, long not scrapable
JGPFRA men, long not scrapable
JGPFRA2 women, long not scrapable
JGPFRA2 men, long not scrapable
JGPSVK women, long not scrapable
JGPSVK men, long not scrapable
JNPCHA women, long not scrapable
JNPCHA men, long not scrapable


In [None]:
# ## SHORT
# DIVISION = ['sr']
# SEX = ['women', 'men']
# PROGRAM = ['short']



# SHORT_ALL_COMPONENTS = []
# SHORT_ALL_TECHNICAL = []
# SHORT_ALL_SKATER = []

# def main(event_name, website):
#     short = scrapeEvent(event_name = event_name, website = website, long = False)
#     short.get_component_score()
#     SHORT_ALL_COMPONENTS.append(short.all_components)
#     short.get_technical_score()
#     SHORT_ALL_TECHNICAL.append(short.all_technical)
#     SHORT_ALL_SKATER.append(short.all_skaters)


# for index, row in events_2122[['Event_Abbreviation','URL']].iterrows():
#     for s in SEX:
#         for p in PROGRAM:
#             ending = f'sr/{s}/{p}'
#             try:
                
#                 main(event_name = row['Event_Abbreviation'],website = row['URL'] + ending)
#                 time.sleep(0.33) # be nice to the generous server admins!
            
#             except Exception as e:

#                 #errors can occur because event got cancelled (from the panini) or event may be juniors only (out of scope)
#                 print(row['Event_Abbreviation'], f"{s}, {p} not scrapable")
#                 #print(e)
#                 pass
                

In [None]:
def iterative_concat(list_of_dfs):
    """
    Stack DataFrames row-wise one at a time, skipping any frame that cannot be appended.

    Frames are appended individually so that one frame which makes ``pd.concat``
    fail (e.g. with InvalidIndexError) is reported and left out instead of
    aborting the whole concatenation. No duplicate rows are removed.

    Every frame keeps its own row index. Resetting the index of the appended
    frames would throw away labelled indices, such as the ``Component`` index
    built by ``scrapeEvent.get_component_score``.

    Parameters
    ----------
    list_of_dfs : list of DataFrames.

    Returns
    -------
    The combined DataFrame; the single frame itself when the list has one
    element; an empty DataFrame when the list is empty.
    """
    if len(list_of_dfs) == 0:
        # nothing was scraped: return an empty frame rather than raising IndexError
        return pd.DataFrame()

    combined = list_of_dfs[0]
    for frame in list_of_dfs[1:]:
        try:
            combined = pd.concat([combined, frame], axis = 0)
        except Exception as e:
            # keep going, but say which frame was lost and why
            print(f"dropped {frame.head(1)} ({type(e).__name__}: {e})")
    return combined

In [None]:
# Stack the per-event component frames into one table and write it to CSV.
components = iterative_concat(LONG_ALL_COMPONENTS)
components.to_csv('../data/skatingScores/components.csv')

In [None]:
# Stack the per-event technical frames into one table and write it to CSV.
technicals = iterative_concat(LONG_ALL_TECHNICAL)
technicals.to_csv('../data/skatingScores/technicals.csv')

In [None]:
# Stack the per-event skater frames into one table and write it to CSV.
skaters = iterative_concat(LONG_ALL_SKATER)
skaters.to_csv('../data/skatingScores/skaters.csv')

In [None]:
## Appendix: more data about the type of jumps
# Read every table found on the Wikipedia article about the ISU Judging System.
isu_wiki = pd.read_html('https://en.wikipedia.org/wiki/ISU_Judging_System')
# NOTE(review): 4 is a fixed position in the list of tables on that page; presumably
# it is the jump/code table - confirm, since it changes if the article is edited.
jumps_and_code = isu_wiki[4]
#jumps_and_code.columns = [i[0] for i in jumps_and_code.columns]
#display(liu_cran, jumps_and_code)

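Let $S = \{s_1, \dots, s_j\}$ be the set of scores that the $j$ judges give an element $e$, and let $S_t \subset S$ be the set of TRIMMED scores, which contains $j-2$ of them. With $SOV_e$ denoting the scale of values of element $e$,
 
$$GOE_e = \frac{1}{j-2}\sum_{s \in S_t}SOV_e(s)$$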

TODO
- consider scraping the scale of values
- scrape two tables for the technicals (GOE TOTAL)

In [30]:
# Reshape the first 12 rows of the first scraped event from wide to long:
# one row per (element, judge column), with the judge's mark in 'value'.
LONG_ALL_TECHNICAL[0][0:12].melt(id_vars =['Name', 'Country', 'Element', 'Event']
                                ,value_vars = ['J1 🏳', 'J2 🏳', 'J3 🏳', 'J4 🏳', 'J5 🏳', 'J6 🏳', 'J7 🏳'])

Unnamed: 0,Name,Country,Element,Event,0,value
0,Gabriella IZZO,USA,2A+3T,SKTMIL,J1 🏳,3.0
1,Gabriella IZZO,USA,3F+2T,SKTMIL,J1 🏳,2.0
2,Gabriella IZZO,USA,3Lz+2T,SKTMIL,J1 🏳,1.0
3,Gabriella IZZO,USA,3Lo,SKTMIL,J1 🏳,2.0
4,Gabriella IZZO,USA,CCoSp4,SKTMIL,J1 🏳,3.0
...,...,...,...,...,...,...
79,Gabriella IZZO,USA,3Lz,SKTMIL,J7 🏳,1.0
80,Gabriella IZZO,USA,2A,SKTMIL,J7 🏳,-3.0
81,Gabriella IZZO,USA,ChSq11,SKTMIL,J7 🏳,3.0
82,Gabriella IZZO,USA,FCCoSp4,SKTMIL,J7 🏳,2.0


In [29]:
# Show the first 12 rows of the first scraped event's technical scores (wide format).
LONG_ALL_TECHNICAL[0][0:12]

Unnamed: 0,Name,Country,Element,BaseValue,GOETotal,J1 🏳,J2 🏳,J3 🏳,J4 🏳,J5 🏳,J6 🏳,J7 🏳,Event
0,Gabriella IZZO,USA,2A+3T,7.5,1.26,3.0,3.0,2.0,4.0,4.0,3.0,2.0,SKTMIL
1,Gabriella IZZO,USA,3F+2T,6.6,1.17,2.0,2.0,2.0,2.0,3.0,3.0,2.0,SKTMIL
2,Gabriella IZZO,USA,3Lz+2T,7.2,-0.24,1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,SKTMIL
3,Gabriella IZZO,USA,3Lo,4.9,0.98,2.0,2.0,2.0,2.0,2.0,2.0,1.0,SKTMIL
4,Gabriella IZZO,USA,CCoSp4,3.5,0.91,3.0,2.0,2.0,2.0,3.0,3.0,3.0,SKTMIL
5,Gabriella IZZO,USA,StSq32,3.3,0.92,2.0,2.0,3.0,4.0,4.0,3.0,1.0,SKTMIL
6,Gabriella IZZO,USA,3F,5.83,1.48,3.0,2.0,3.0,1.0,3.0,3.0,3.0,SKTMIL
7,Gabriella IZZO,USA,3Lz,6.49,1.18,2.0,1.0,2.0,2.0,3.0,4.0,1.0,SKTMIL
8,Gabriella IZZO,USA,2A,3.63,-0.73,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-3.0,SKTMIL
9,Gabriella IZZO,USA,ChSq11,3.0,1.5,3.0,2.0,3.0,3.0,3.0,3.0,3.0,SKTMIL
