In [1]:
import time, os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

from scraping_functions import get_data, add_to_result, get_stats, get_games

## Scraping

The data is scraped from the NCAA stats website. Reaching the relevant pages requires stepping through several dropdown menus, so I use Selenium to select the right options for each season and add that season's statistics to the results.

In [2]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment variable to
# override the default on machines where chromedriver is installed somewhere else.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver
# Page on stats.ncaa.org that the browser opens first; it hosts the sport/season/division dropdowns.
base_url = 'https://stats.ncaa.org/rankings/change_sport_year_div'

In [3]:
# --- Scrape configuration ---
N_SEASONS = 8                        # number of seasons, taken from the top of the season dropdown, to scrape
PAGE_DELAY = 1                       # seconds to pause after each interaction that changes the page
REFERENCE_SEASON = '2019-20'         # season whose rankings table supplies the list of teams
REFERENCE_STAT = 'Assists Per Game'  # stat page opened first for every season

# Column order of each team's row: season and games, followed by the stat categories.
header = ['season', 'games', 'Assists Per Game', 'Defensive Saves', 'Goals Against Average', 'Goals Per Game',
          'Penalty Corners Per Game', 'Points Per Game', 'Save Percentage', 'Saves Per Game', 'Scoring Average',
          'Scoring Margin', 'Shutouts Per Game', 'Winning Percentage']


def open_rankings_table(driver, season, delay=PAGE_DELAY):
    """Walk the dropdowns for one season and leave the browser on its rankings table.

    Selects `season` and division 'I', clicks the `stat_type_T_N` link (presumably the
    team-statistics view -- confirm), opens the REFERENCE_STAT page and sets the table
    length to '-1' (presumably the "show all rows" option -- confirm).

    Parameters
    ----------
    driver : selenium WebDriver
        Browser already on the rankings page with the sport selected.
    season : str
        Visible text of the season option to select, e.g. '2019-20'.
    delay : float
        Seconds to sleep after each interaction so the page can update.

    Returns
    -------
    list of str
        Visible text of every option in the 'Stats' dropdown except 'Additional Stats'.
    """
    Select(driver.find_element(By.ID, 'acadyr')).select_by_visible_text(season)
    time.sleep(delay)

    Select(driver.find_element(By.ID, 'u_div')).select_by_visible_text('I')
    time.sleep(delay)

    driver.find_element(By.ID, 'stat_type_T_N').click()
    time.sleep(delay)

    dropdown_stats = Select(driver.find_element(By.ID, 'Stats'))
    # Read the option labels before selecting one: the selection changes the page.
    stat_names = [option.text for option in dropdown_stats.options
                  if option.text != 'Additional Stats']
    dropdown_stats.select_by_visible_text(REFERENCE_STAT)
    time.sleep(delay)

    Select(driver.find_element(By.NAME, 'rankings_table_length')).select_by_value('-1')
    time.sleep(delay)
    return stat_names


# NOTE(review): passing the driver path positionally is kept for compatibility with the
# Selenium release this notebook was written against; newer releases expect a Service object.
driver = webdriver.Chrome(chromedriver)
final = []
try:
    driver.get(base_url)

    Select(driver.find_element(By.ID, 'sport')).select_by_visible_text('Field Hockey')
    time.sleep(PAGE_DELAY)

    seasons = [option.text for option in Select(driver.find_element(By.ID, 'acadyr')).options]

    # Use the reference season's full rankings table to establish the set of teams to track.
    stats = open_rankings_table(driver, REFERENCE_SEASON)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    data_table = soup.find('table', {'id': 'rankings_table'})
    if data_table is None:
        raise RuntimeError("rankings_table not found on the page; the site layout may have changed")
    # Link text looks like 'Team (Conference)'; keep only the part before the parenthesis.
    teams_list = [link.text.split('(')[0].strip() for link in data_table.find_all('a')]

    # Gather data for the first N_SEASONS seasons listed in the dropdown
    for season in seasons[:N_SEASONS]:
        stats = open_rankings_table(driver, season)

        games_dict = get_games(driver)
        # Each team's row starts with the season and its games value (NaN when the team
        # is absent from this season's games lookup).
        result = {
            team: [season, games_dict[team] if team in games_dict else np.nan]
            for team in teams_list
        }
        season_stats = get_stats(driver, result, header, stats, teams_list)
        final.append(season_stats)
finally:
    # quit() ends the whole WebDriver session, and runs even if scraping fails part-way,
    # so no browser or chromedriver process is left behind.
    driver.quit()

### Create Dataframe

In [12]:
# Column labels shared by every per-season frame: the team name plus the scraped fields.
columns = ['team', *header]

# final[0] maps each team to its list of values; transpose so teams become rows,
# then move the team names out of the index into a regular column.
first_season = pd.DataFrame(final[0])
data = first_season.T.reset_index()
data.columns = columns

In [13]:
# Build one frame per scraped season (teams as rows) and combine them with a single
# pd.concat call. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated frame on every iteration.
# Rebuilding from `final` instead of extending `data` also makes this cell safe to re-run
# without duplicating rows.
season_frames = []
for season_stats in final:
    df = pd.DataFrame(season_stats).transpose().reset_index()
    df.columns = columns
    season_frames.append(df)
# Row labels are kept as-is (each season's frame is numbered from 0), matching the
# index produced by the previous append-based version.
data = pd.concat(season_frames)

In [14]:
# Summarise the combined frame: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 616 entries, 0 to 76
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   team                      616 non-null    object
 1   season                    616 non-null    object
 2   games                     578 non-null    object
 3   Assists Per Game          578 non-null    object
 4   Defensive Saves           559 non-null    object
 5   Goals Against Average     578 non-null    object
 6   Goals Per Game            578 non-null    object
 7   Penalty Corners Per Game  577 non-null    object
 8   Points Per Game           577 non-null    object
 9   Save Percentage           574 non-null    object
 10  Saves Per Game            497 non-null    object
 11  Scoring Average           577 non-null    object
 12  Scoring Margin            303 non-null    object
 13  Shutouts Per Game         528 non-null    object
 14  Winning Percentage        5

In [15]:
# Persist the combined DataFrame to 'data.pickle' (path is relative to the working directory).
data.to_pickle('data.pickle')