# Import packages
Oddsportal serves dynamic content, i.e. its pages are rendered with JavaScript. I used the `selenium` package to load and render each URL in Mozilla Firefox, then passed the rendered page source to `BeautifulSoup` to extract the information of interest.

In [1]:
from selenium import webdriver
import bs4
import pandas as pd
import numpy as np

## Helper functions for processing HTML code

In [2]:
def _parse_price(text):
    """Convert the text of a price cell to float; print the text and return NaN if it is not numeric."""
    try:
        return float(text)
    except ValueError:
        print("Input {} could not be converted to float".format(text))
        return np.nan


def process_soup(soup, league, season):
    """
    Extract one record per game row from a parsed results page.

    Input:
        soup: bs4.BeautifulSoup element (page source rendered by the selenium webdriver)
        league: string, e.g. NBA or EURO
        season: string, e.g. 2017/2018 or 2013/2014
    Output:
        List of lists. Each inner list has 8 elements:
        [league, season] + process_score(score text) + [home team price, away team price]
        i.e. [league, season, home score, away score, who won, overtime, home price, away price].
        An empty list is returned when the page contains no <tbody>.
    """
    _data = list()
    # soup.tbody is None when the results table is missing from the page source;
    # return no rows instead of failing with AttributeError.
    if soup.tbody is None:
        return _data
    for row in soup.tbody.find_all('tr'):
        # Only rows with exactly six children are treated as game rows.
        if len(row.contents) != 6:
            continue
        score_cell = row.find("td", {"class": "center bold table-odds table-score"})
        # A six-child row without a score cell cannot be processed; skip it.
        if score_cell is None:
            continue
        # Home and away prices are the third- and second-to-last children of the row.
        home_price = _parse_price(row.contents[-3].text)
        away_price = _parse_price(row.contents[-2].text)

        _data.append([league, season] + process_score(score_cell.text) + [home_price, away_price])
    return _data


def process_score(string):
    """
    Parse a final-score string into scores, winner flag and overtime flag.

    Input:
        score string, e.g. "124:114 OT" or "122:104". OT (overtime)
    Output:
        list [home score, away score, who won (1: home, 0: away), overtime (1: Yes, 0: No)]
        If the string does not contain exactly one ":" it is printed and a list of
        four NaN values is returned. If either score cannot be converted to an
        integer, the winner is NaN as well, because it cannot be determined.
    """
    # Surrounding whitespace would otherwise hide the "OT" suffix.
    text = string.strip()
    if len(text.split(":")) != 2:
        print("Teams names: {}".format(string))
        return [np.nan, np.nan, np.nan, np.nan]

    # Check whether there was overtime and drop the suffix before splitting the score.
    if text.endswith("OT"):
        _ot = 1
        text = text[:-2].rstrip()
    else:
        _ot = 0

    home_part, away_part = text.split(":")
    home_score = convert_2_int(home_part)
    away_score = convert_2_int(away_part)

    # Check who won the match. A failed conversion yields NaN, and any comparison
    # with NaN is False, which would silently report an away win; report NaN instead.
    if np.isnan(home_score) or np.isnan(away_score):
        _win = np.nan
    elif home_score > away_score:
        _win = 1
    else:
        _win = 0

    return [home_score, away_score, _win, _ot]


def convert_2_int(string):
    """
    Turn a numeric string into an integer.

    Input:
        string: e.g. "123" or "93"
    Output:
        int, or NAN when the conversion raises ValueError (the offending
        input is printed in that case)
    """
    try:
        value = int(string)
    except ValueError:
        # Report the problematic input and mark the value as missing.
        print("Input {} could not be converted to integer".format(string))
        value = np.nan
    return value

## Initiate webdriver
`executable_path` points to the executable used to connect to Firefox. To use a different browser, download the appropriate driver (hyperlink) or refer to this Stack Exchange post.

In [3]:
%%time
# Start a Firefox session driven through the geckodriver executable.
# The path is relative, so it is presumably resolved against the notebook's
# working directory (or PATH) - confirm on the target machine.
# NOTE(review): the `executable_path` keyword is believed to be deprecated in
# newer selenium releases - confirm against the installed selenium version.
driver = webdriver.Firefox(executable_path=r"geckodriver.exe")

Wall time: 5.29 s


## Load NBA and Euroleague 2013-2018 season data
The keys of the `season_dict` dictionary are league and season names (joined by an underscore), while the values are tuples of the form (url, number of pages to iterate over).

In [4]:
# Map "<LEAGUE>_<SEASON>" to (results-page base url, number of result pages).
# Records are listed newest season first, NBA before Euroleague; the mapping
# keeps that order.
season_dict = {
    name: (base_url, page_count)
    for name, base_url, page_count in [
        ("NBA_2017/2018", "https://www.oddsportal.com/basketball/usa/nba-2017-2018/results/#/page/", 28),
        ("NBA_2016/2017", "https://www.oddsportal.com/basketball/usa/nba-2016-2017/results/#/page/", 29),
        ("NBA_2015/2016", "https://www.oddsportal.com/basketball/usa/nba-2015-2016/results/#/page/", 29),
        ("NBA_2014/2015", "https://www.oddsportal.com/basketball/usa/nba-2014-2015/results/#/page/", 29),
        ("NBA_2013/2014", "https://www.oddsportal.com/basketball/usa/nba-2013-2014/results/#/page/", 29),
        ("EURO_2017/2018", "https://www.oddsportal.com/basketball/europe/euroleague-2017-2018/results/#/page/", 6),
        ("EURO_2016/2017", "https://www.oddsportal.com/basketball/europe/euroleague-2016-2017/results/#/page/", 6),
        ("EURO_2015/2016", "https://www.oddsportal.com/basketball/europe/euroleague-2015-2016/results/#/page/", 5),
        ("EURO_2014/2015", "https://www.oddsportal.com/basketball/europe/euroleague-2014-2015/results/#/page/", 6),
        ("EURO_2013/2014", "https://www.oddsportal.com/basketball/europe/euroleague-2013-2014/results/#/page/", 6),
    ]
}

## Read scored points and team averaged coefficients

In [5]:
%%time
# Collect one record per game across every league/season in season_dict.
all_data = list()
for key, (url, n_pages) in season_dict.items():
    # Keys look like "<LEAGUE>_<SEASON>", e.g. "NBA_2017/2018"
    key_parts = key.split("_")
    league = key_parts[0]
    season = key_parts[1]
    # Result pages are numbered from 1 to n_pages inclusive
    for idx in range(1, n_pages + 1):
        # Load and render the page in the browser
        driver.get(url + str(idx))
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed, so results can differ between
        # environments (and a warning is emitted).
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        all_data.extend(process_soup(soup, league, season))

Teams names: canc.
Teams names: canc.
Wall time: 1min 19s


## Transform data into pandas DataFrame and save it as .csv file

In [6]:
# Each record in all_data has eight fields, in the order produced by process_soup:
# league, season, the four values from process_score, then the home and away prices.
df = pd.DataFrame(all_data, columns=["League", "Season", "Home_score", "Away_score", "Win", "OT", "Home_p", "Away_p"])
# Saved relative to the working directory. NOTE(review): with pandas' default
# settings the row index is written as an extra unnamed first column - confirm
# that downstream readers expect it.
df.to_csv("basketball_scores.csv")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Unnamed: 0,League,Season,Home_score,Away_score,Win,OT,Home_p,Away_p
0,NBA,2017/2018,85.0,108.0,0.0,0.0,2.43,1.61
1,NBA,2017/2018,102.0,110.0,0.0,0.0,2.28,1.68
2,NBA,2017/2018,122.0,103.0,1.0,0.0,1.18,5.36
3,NBA,2017/2018,124.0,114.0,1.0,1.0,1.12,7.26
4,NBA,2017/2018,92.0,101.0,0.0,0.0,3.34,1.37


## Turn off webdriver

In [7]:
# End the webdriver session created earlier in the notebook; `driver` cannot be
# used for further page loads after this call.
driver.quit()