In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller

import time
import random
import subprocess
from datetime import datetime
import missingno as ms
from plotnine import *

import numpy as np
import pandas as pd
# Raise pandas' display limits so wide/long DataFrames are shown in full
# (up to 999 rows and 999 columns) instead of being truncated in cell output.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

## Contents:
- Set Selenium options
- Scrape OddsPortal URLs of all NBA Games in a season [saved to .txt file]
- Scrape Bookmaker data from OP URLs into dataframe [saved to pickle]
- Transform Bookmaker dataframe
- Join NBA team identifiers (needed to join to RAPTOR)
- Join RAPTOR scores downloaded from 538

### Set Selenium Options 
**Important:** Selenium must reuse an existing Chrome profile so that OddsPortal is signed in to Sam's account.

In [2]:
# Chrome "User Data" directory and the profile folder inside it whose saved
# login should be reused.
# NOTE: set_chrome_options() hard-codes the same two values itself and does
# not read these module-level names, so editing them here alone has no effect
# on the webdriver.
profile_path = r'C:\Users\sleblanc\AppData\Local\Google\Chrome\User Data'
profile_name = 'Profile 3'

In [3]:
def set_chrome_options(
    profile_path=r'C:\Users\sleblanc\AppData\Local\Google\Chrome\User Data',
    profile_name='Profile 3',
):
    """
    Set options for the Chrome webdriver used by Selenium.
    Most importantly, set the user data directory and user profile to utilize Chrome's saved passwords.
    ----------------
    Args:
        profile_path (str, optional): Chrome "User Data" directory. Defaults to the
            path that was previously hard-coded inside this function.
        profile_name (str, optional): Name of the profile folder inside `profile_path`.
            Defaults to 'Profile 3'.
    Returns: webdriver.ChromeOptions: Configured Chrome webdriver options.
    """
    options = webdriver.ChromeOptions()
    # Point Chrome at an existing profile so its saved logins are available.
    options.add_argument(f'user-data-dir={profile_path}')
    options.add_argument(f'--profile-directory={profile_name}')
    return options


def kill_chrome_processes():
    """
    Force-kill every running Chrome process via the Windows TASKKILL command.
    Selenium can fail to start when another Chrome instance (including one left
    behind by an earlier Selenium session) is already using the profile that
    holds the saved passwords, so all of them are terminated first.
    The command's output is discarded and its exit status is ignored.
    """
    taskkill_command = "TASKKILL /f  /IM  CHROME.EXE"
    subprocess.call(
        taskkill_command,
        stdout=subprocess.DEVNULL,   # hide TASKKILL's per-process messages
        stderr=subprocess.STDOUT,    # ...and its errors (e.g. nothing to kill)
    )


def get_webdriver():
    """
    Build a Selenium Chrome webdriver configured with the custom profile options.
    Steps: install a matching chromedriver (chromedriver_autoinstaller), kill any
    running Chrome processes, then start Chrome with set_chrome_options().
    ----------------
    Returns:
        webdriver.Chrome: Configured Chrome webdriver instance.
        False if any step raises; the exception is printed, not re-raised.
    Note:
        Selenium installation reference:
        https://selenium-python.readthedocs.io/installation.html
    """
    try:
        chromedriver_autoinstaller.install()
        kill_chrome_processes()
        return webdriver.Chrome(options=set_chrome_options())
    except Exception as err:
        # Callers must check for False before using the result.
        print(err)
        return False

## Scrape OddsPortal URLs of All NBA Games in a Season
Done: [saved to .txt file]

In [28]:
def load_page(driver, retries=5):
    """
    Force the page to load its content by scrolling top -> bottom repeatedly.
    ----------------
    Args:
        driver (webdriver): The Selenium webdriver instance to interact with the page.
        retries (int, optional): Number of top-then-bottom scroll rounds. Defaults to 5.
    """
    scroll_scripts = (
        "window.scrollTo(0, 0);",
        "window.scrollTo(0, document.body.scrollHeight);",
    )
    for _ in range(retries):
        for script in scroll_scripts:
            driver.execute_script(script)
            # pause one second after each scroll before the next one
            time.sleep(1)

def extract_game_hrefs(driver, season, remove_hrefs):
    """
    Extract the game links from the current page of the NBA season results.
    ----------------
    Args:
        driver (webdriver): The Selenium webdriver instance to interact with the page.
        season (str): The NBA season in the format "YYYY-YYYY".
        remove_hrefs (list): A list of URLs to exclude from the result.
    Returns:
        set: A set of URLs for individual games in the specified NBA season.
    """
    excluded = set(remove_hrefs)

    # The 2022-2023 season is served under the un-suffixed ".../nba/" path;
    # every other season has its own ".../nba-YYYY-YYYY/" path.
    if season == '2022-2023':
        marker = '/basketball/usa/nba/'
    else:
        marker = f'nba-{season}'

    # find_elements(By...) replaces find_elements_by_tag_name, which was removed
    # in Selenium 4.3 and matches the locator style used in game_to_df.
    hrefs = (a.get_attribute("href") for a in driver.find_elements(By.TAG_NAME, "a"))

    # Anchors without an href yield None; skip them instead of raising TypeError.
    return {href for href in hrefs if href and marker in href and href not in excluded}


def click_next_page(driver):
    """
    Click the "Next" button to navigate to the next page of the NBA season results.
    The button is taken to be the second element with CSS class "h-max".
    ----------------
    Args: driver (webdriver): The Selenium webdriver instance to interact with the page.
    Raises: IndexError: If fewer than two "h-max" elements are found on the page.
    """
    # find_elements(By...) replaces find_elements_by_class_name, which was
    # removed in Selenium 4.3.
    driver.find_elements(By.CLASS_NAME, "h-max")[1].click()


def scrape_season(season, max_pages=30):
    """
    Scrape game links from the NBA season results page on OddsPortal.
    ----------------
    Args:
        season (str): The NBA season to scrape, in the format "YYYY-YYYY".
        max_pages (int, optional): Maximum number of pages to scrape. Defaults to 30.
    Returns:
        list: A list of URLs for individual games in the specified NBA season.
    Raises:
        RuntimeError: If the Chrome webdriver could not be started.
    """
    url_base = "https://www.oddsportal.com/basketball/usa/"

    # The 2022-2023 season lives under the un-suffixed "nba/" path.
    if season == '2022-2023':
        url = f"{url_base}nba/results/#"
    else:
        url = f"{url_base}nba-{season}/results/#"

    # Navigation links that match the game-URL pattern but are not games.
    remove_hrefs = [
        f'https://www.oddsportal.com/basketball/usa/nba-{season}/results/#',
        f'https://www.oddsportal.com/basketball/usa/nba-{season}/results/',
        f'https://www.oddsportal.com/basketball/usa/nba-{season}/',
        f'https://www.oddsportal.com/basketball/usa/nba-{season}/standings/',
        'https://www.oddsportal.com/basketball/usa/nba/results/#',
        'https://www.oddsportal.com/basketball/usa/nba/results/',
        'https://www.oddsportal.com/basketball/usa/nba/',
        'https://www.oddsportal.com/basketball/usa/nba/standings/',
    ]

    driver = get_webdriver()
    if driver is False:
        # get_webdriver() prints the underlying error and returns False.
        raise RuntimeError("Could not start the Chrome webdriver; see the error printed above.")

    all_game_urls = set()
    try:
        driver.get(url)

        for n in range(1, max_pages + 1):
            # Random pause between pages
            time.sleep(0.5 + 2 * random.random())
            load_page(driver)

            game_urls = extract_game_hrefs(driver, season, remove_hrefs)
            new_count = len(game_urls - all_game_urls)
            all_game_urls.update(game_urls)

            print(f'Page {n} Finished: Added {new_count} new games. Total Games: {len(all_game_urls)}')

            try:
                click_next_page(driver)
            except Exception:
                # No further page to click: treat as the end of the season.
                print("Next Page Error; closing driver")
                break
    finally:
        # Always release the browser, even if scraping a page raised.
        driver.quit()

    return list(all_game_urls)

## Scrape 🥷

In [None]:
for season in ['2018-2019','2019-2020','2020-2021','2021-2022','2022-2023']:

    print(f"\n {season} \n")

    all_game_urls = scrape_season(season)

    # Persist the season's game URLs, one per line, so the (slow) odds scrape
    # below can be run later without re-crawling the results pages.
    with open(f'all_game_urls_{season}.txt', 'w') as file:
        file.writelines(item + '\n' for item in all_game_urls)

## Create Bookmaker dataframes from OP URLs

In [30]:
def game_to_df(driver, game_info_xpath, odds_xpath):
    """
    Extract game and odds information from a webpage and return it as a DataFrame.
    ----------------
    Args:
        driver (WebDriver): Selenium WebDriver instance.
        game_info_xpath (str): XPath for the game information element.
        odds_xpath (str): XPath for the odds information element.
    Returns:
        pd.DataFrame: One row per bookmaker, with the game summary repeated on
        every row. An empty DataFrame if the page could not be parsed.
    Note:
        Only ordinary errors are turned into an empty DataFrame; KeyboardInterrupt
        and SystemExit propagate so a running scrape can be interrupted.
    """

    def get_game_info():
        """
        Extract game information from the webpage using the given XPath.
        ----------------
        Returns: dict: Dictionary containing game information.
        """
        game_info_el = driver.find_element(By.XPATH, game_info_xpath)
        # The element's text is read positionally, one field per line.
        game_info_list = game_info_el.text.split('\n')

        format_string = '%A, %d %b %Y, %H:%M'
        game_time = game_info_list[5]

        return {
            'Game_Time' : datetime.strptime(game_time, format_string).strftime('%Y-%m-%d %H:%M'),
            'Home_Name' : game_info_list[0],
            'Away_Name' : game_info_list[3],
            'Game_Result' : game_info_list[7],
            'Home_Score' : game_info_list[1],
            'Away_Score' : game_info_list[4],
        }

    def get_odds_info():
        """
        Extract odds information from the webpage using the given XPath.
        ----------------
        Returns: list: Flat list of odds cells (4 per bookmaker when well-formed).
        """
        odds_el = driver.find_element(By.XPATH, odds_xpath)

        odds_list = odds_el.text.split('\n')
        # 'BONUS' lines would break the 4-cells-per-row layout.
        odds_list = [x for x in odds_list if x != 'BONUS']
        # Drop the first 4 lines and everything from the 'Average' line onward.
        odds_list = odds_list[4:odds_list.index('Average')]

        return odds_list

    def get_odds_array(odds_list):
        """
        Convert the odds list into a NumPy array with 4 columns.
        ----------------
        Args: odds_list (list): List containing odds information.
        Returns: np.array: Array of shape (n_bookmakers, 4).
        Raises: ValueError: If the list length is not divisible by 4.
        """
        if len(odds_list) % 4 != 0:
            raise ValueError(f"Expected 4 odds cells per bookmaker, got {len(odds_list)} cells")
        return np.array(odds_list).reshape(len(odds_list) // 4, 4)

    # get the game info and odds
    try:
        game_summary = get_game_info()
        odds_list = get_odds_info()
        odds_array = get_odds_array(odds_list)

        # create dataframe of bookmaker odds
        df = pd.DataFrame(odds_array, columns=['Bookmaker', 'Home_Amer', 'Away_Amer', 'Full_Payout'])

        # set the game summary values for all bookmaker rows in dataframe
        for key in game_summary:
            df[key] = game_summary[key]

        # reorder dataframe columns
        df = df[[
            'Game_Time', 'Home_Name', 'Away_Name', 'Game_Result', 'Home_Score', 'Away_Score',
            'Bookmaker', 'Home_Amer', 'Away_Amer', 'Full_Payout']]

        return df
    except Exception:
        # Best-effort: a page that cannot be parsed becomes an empty frame so the
        # caller can skip it. `except Exception` (not a bare `except`) lets
        # KeyboardInterrupt/SystemExit through.
        return pd.DataFrame()

## Scrape 🥷

In [34]:
# For each season: load the saved game URLs, scrape every game's bookmaker odds,
# and save the combined frame to bookmaker_{season}.pkl.
# After this cell, `season` and `bookmaker` refer to the LAST season processed.

# Element XPATHs, subject to change
game_info_xpath = "/html/body/div[1]/div/div[1]/div/main/div[2]/div[3]"
odds_xpath = "/html/body/div[1]/div/div[1]/div/main/div[2]/div[4]"

# Write a partial pickle every this-many successfully scraped games
checkpoint_every = 10

for season in ['2018-2019','2019-2020','2020-2021','2021-2022','2022-2023']:

    print(f"\n {season} \n")

    # load url list from file (ignoring blank lines)
    with open(f'all_game_urls_{season}.txt', 'r') as file:
        all_game_urls = [line.strip() for line in file if line.strip()]

    print(f"\n {len(all_game_urls)} games \n")

    driver = get_webdriver()
    if driver is False:
        # get_webdriver() prints the underlying error and returns False.
        raise RuntimeError("Could not start the Chrome webdriver; see the error printed above.")

    # Per-game frames are collected in a list and concatenated only at
    # checkpoints, instead of re-concatenating and re-pickling after every game.
    game_frames = []
    bookmaker = pd.DataFrame()
    try:
        for i, url in enumerate(all_game_urls):
            if i % 10 == 0: print(f"Finished scraping {i} games out of {len(all_game_urls)}")

            # Scrape Home/Away odds data
            driver.get(url + '/#home-away;1')

            # Random sleep so no DDOS
            time.sleep(1 + 2*random.random())

            # Get bookmaker data from single game into dataframe
            df_ = game_to_df(driver, game_info_xpath, odds_xpath)

            if len(df_) < 1:
                print("Skipped:", url)
                continue

            game_frames.append(df_)

            # Periodic checkpoint in case the kernel dies mid-scrape
            if len(game_frames) % checkpoint_every == 0:
                bookmaker = pd.concat(game_frames).drop_duplicates()
                bookmaker.to_pickle(f'bookmaker_{season}.pkl')
    finally:
        try:
            # Save whatever was scraped, including after an error or interrupt.
            bookmaker = pd.concat(game_frames).drop_duplicates() if game_frames else pd.DataFrame()
            bookmaker.to_pickle(f'bookmaker_{season}.pkl')
        finally:
            # Always release the browser.
            driver.quit()


 2018-2019 


 1380 games 

Finished scraping 0 games out of 1380
Finished scraping 10 games out of 1380
Finished scraping 20 games out of 1380
Finished scraping 30 games out of 1380
Finished scraping 40 games out of 1380
Finished scraping 50 games out of 1380
Finished scraping 60 games out of 1380
Finished scraping 70 games out of 1380
Finished scraping 80 games out of 1380
Finished scraping 90 games out of 1380
Finished scraping 100 games out of 1380
Finished scraping 110 games out of 1380
Finished scraping 120 games out of 1380
Finished scraping 130 games out of 1380
Finished scraping 140 games out of 1380
Finished scraping 150 games out of 1380
Finished scraping 160 games out of 1380
Finished scraping 170 games out of 1380
Finished scraping 180 games out of 1380
Finished scraping 190 games out of 1380
Finished scraping 200 games out of 1380
Finished scraping 210 games out of 1380
Finished scraping 220 games out of 1380
Finished scraping 230 games out of 1380
Finished scraping 240 

KeyboardInterrupt: 

### 2020-2021 Skips

Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/houston-rockets-utah-jazz-GKC2XbS9/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/memphis-grizzlies-san-antonio-spurs-GtlhmSW6/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/portland-trail-blazers-atlanta-hawks-rsXfJ2zH/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/dallas-mavericks-utah-jazz-KfxdwcKJ/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/memphis-grizzlies-los-angeles-clippers-INe1o4Ct/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/toronto-raptors-charlotte-hornets-W86EQrg4/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/phoenix-suns-philadelphia-76ers-dflFs7gF/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/detroit-pistons-new-york-knicks-6XicM4qS/
Skipped: https://www.oddsportal.com/basketball/usa/nba-2020-2021/san-antonio-spurs-houston-rockets-2RyzOQLo/

### 2021-2022 Skips
None

## Transform Bookmaker Dataframe

In [None]:
def implied_odds_from_american(american_odds):
    """
    Convert American (moneyline) odds into the implied win probability.
    ----------------
    Args: american_odds (int or str): American odds value, e.g. -150 or '+130'.
    Returns: float: Implied probability, strictly between 0 and 1.
    Raises: ValueError: If the absolute value of American odds is less than 100
        (or the value cannot be converted to int).
    """
    odds = int(american_odds)
    if -100 < odds < 100:
        raise ValueError(f"American odds must always have absolute value over 100. Supplied odds: {odds}")

    if odds >= 0:
        # Underdog: risk 100 to win `odds`.
        return 100 / (odds + 100)

    # Favourite: risk `stake` to win 100.
    stake = -odds
    return stake / (stake + 100)

def prediction_from_implied_odds(row):
    """
    Name the side the bookmaker favours, i.e. the one with the larger implied odds.
    ----------------
    Args: row (dict): A dictionary containing 'Home_Imp' and 'Away_Imp' keys.
    Returns: str or None: 'Home' or 'Away' for the side with the higher implied odds; None if neither is higher.
    """
    home_prob, away_prob = row['Home_Imp'], row['Away_Imp']

    if home_prob > away_prob:
        return 'Home'
    if away_prob > home_prob:
        return 'Away'
    # Neither side is strictly favoured.
    return None

def actual_winner(row):
    """
    Work out which side won from the final scores.
    ----------------
    Args: row (dict): A dictionary containing 'Home_Score' and 'Away_Score' keys (int or numeric str).
    Returns: str or None: 'Home' or 'Away' for the side with the higher score; None if the scores are equal.
    """
    home_pts, away_pts = int(row['Home_Score']), int(row['Away_Score'])

    if home_pts > away_pts:
        return 'Home'
    if away_pts > home_pts:
        return 'Away'
    # Equal scores: no winner.
    return None

def calculate_brier_score(row):
    """
    Brier score of the bookmaker's pick: (implied odds of the predicted side - outcome)^2,
    where outcome is 1 if the predicted side won and 0 otherwise.
    ----------------
    Args: row (dict): A dictionary containing 'Pred_Winner', 'Actual_Winner', 'Home_Imp', and 'Away_Imp' keys.
    Returns: float or np.nan: Brier score if both prediction and actual winner are set, otherwise np.nan.
    """
    pick = row['Pred_Winner']
    winner = row['Actual_Winner']

    # No score without both a prediction and a result.
    if not (pick and winner):
        return np.nan

    pick_prob = row[f'{pick}_Imp']
    outcome = 1 if pick == winner else 0
    return (pick_prob - outcome) ** 2

### Apply transformation

In [None]:
# Add derived columns to `bookmaker` (the frame left by the scraping cell).

# Calendar date of the game as a 'YYYY-MM-DD' string; used later as a join key.
# .dt.strftime is the vectorised equivalent of .dt.date + .apply(strftime).
bookmaker['date'] = (
    pd.to_datetime(bookmaker['Game_Time'], format='%Y-%m-%d %H:%M')
    .dt.strftime('%Y-%m-%d')
)

# Implied odds for each side; their sum is kept as Total_Imp.
bookmaker['Home_Imp'] = bookmaker['Home_Amer'].apply(implied_odds_from_american)
bookmaker['Away_Imp'] = bookmaker['Away_Amer'].apply(implied_odds_from_american)
bookmaker['Total_Imp'] = bookmaker['Home_Imp'] + bookmaker['Away_Imp']

# Row-wise: bookmaker's pick, actual result, and Brier score of the pick.
# Brier_Score must come last because it reads Pred_Winner and Actual_Winner.
bookmaker['Pred_Winner'] = bookmaker.apply(prediction_from_implied_odds, axis=1)
bookmaker['Actual_Winner'] = bookmaker.apply(actual_winner, axis=1)
bookmaker['Brier_Score'] = bookmaker.apply(calculate_brier_score, axis=1)

### Join NBA team identifiers

In [None]:
# Lookup table of NBA team identifiers, read from a path relative to the
# current working directory. Its 'Team' column is the join key used below;
# presumably it also has AbbrA, AbbrB, City and Mascot columns (those are the
# names renamed after the merge) — verify against the CSV.
nba_idents = pd.read_csv('Desktop/nba_conversions.csv')
nba_idents.head()

In [None]:
# Attach the team-identifier columns twice: once for the home team and once for
# the away team, each set prefixed with 'Home_' / 'Away_'.
ident_columns = ['AbbrA', 'AbbrB', 'City', 'Mascot', 'Team']

bookmaker_ = bookmaker
for team in ['Home','Away']:
    # Prefix the identifier columns first so the two merges cannot collide.
    team_idents = nba_idents.rename(columns={col: f'{team}_{col}' for col in ident_columns})

    # Left join keeps every bookmaker row, matching on the team's full name.
    bookmaker_ = bookmaker_.merge(
        team_idents,
        left_on=f'{team}_Name',
        right_on=f'{team}_Team',
        how='left',
    )

### Load RAPTOR scores from 538
Downloaded from <https://data.fivethirtyeight.com/#nba-forecasts>

In [None]:
# FiveThirtyEight forecast file, read from a path relative to the current
# working directory. Later cells use its team1/team2/date columns as join keys
# and keep its elo* and raptor* columns.
raptor = pd.read_csv('Desktop/nba_elo.csv')

# Parse the date strictly as YYYY-MM-DD, then convert back to a string so the
# join key has the same type as bookmaker's string 'date' column.
raptor['date'] = pd.to_datetime(raptor['date'], format='%Y-%m-%d').astype(str)

raptor.head()

### Join Bookmaker and RAPTOR data

In [None]:
# Left join: every bookmaker row is kept; rows with no 538 game for the same
# (home abbreviation, away abbreviation, date) get NaN in the 538 columns.
# NOTE(review): 'date' is derived from the Game_Time shown on OddsPortal; if
# that is in a different timezone from 538's dates, late games could land on a
# different day and fail to match — confirm.
full = pd.merge(bookmaker_, raptor, 
            left_on=['Home_AbbrB','Away_AbbrB','date'],
            right_on=['team1','team2','date'],
            how='left')

In [None]:
# Remove fully identical rows, then order chronologically.
# Game_Time is a 'YYYY-MM-DD HH:MM' string, so string order is time order.
full = full.drop_duplicates()
full = full.sort_values(by=['Game_Time'])

### Examine Missing Data
FiveThirtyEight does not provide data for preseason games. There are also a couple of games with no predicted winner (because the bookmaker gave both teams equal odds). We remove all of these rows but keep both regular-season and playoff games.

In [None]:
# Missing-value matrix of the joined frame (missingno); the trailing ';'
# suppresses the returned object's repr in the cell output.
ms.matrix(full);

In [None]:
# Drop rows with no 538 match (raptor1_pre is NaN after the left join) and rows
# with no predicted winner (Pred_Winner is None when implied odds are equal).
full = full.dropna(subset=['raptor1_pre','Pred_Winner'])

In [None]:
# (rows, columns) remaining after dropping rows with missing values
full.shape

### Reorder and Drop Columns and Save to Pickle

In [None]:
# Keep only the columns needed downstream, in this order: game keys, raw and
# implied bookmaker odds, prediction/outcome/Brier score, then the 538
# pre-game Elo and RAPTOR ratings and win probabilities for team1/team2.
# Note: this overwrites `full`, so re-running the cell requires the dropped
# columns to still be absent from this list (it is idempotent as written).
full = full[[
    'date','Home_AbbrB','Away_AbbrB','Bookmaker',
    'Home_Amer','Away_Amer',
    'Home_Imp','Away_Imp','Total_Imp','Full_Payout', 
    'Pred_Winner','Actual_Winner','Brier_Score',
    'elo1_pre','elo2_pre',
    'elo_prob1','elo_prob2',
    'raptor1_pre','raptor2_pre',
    'raptor_prob1','raptor_prob2',
]]

In [None]:
# (rows, columns) of the final frame after column selection
full.shape

In [None]:
# The original string lacked the f-prefix, so the file was literally named
# 'nba-{season}-with-raptor.pkl'. `season` is the loop variable left over from
# the scraping cell, i.e. the same (last) season that `bookmaker` holds.
full.to_pickle(f'nba-{season}-with-raptor.pkl')