# Import packages
Oddsportal serves dynamic content, i.e. its pages are rendered with JavaScript. I used the `selenium` package to load and render each URL in Mozilla Firefox, then passed the page source to `BeautifulSoup` to extract the information of interest.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import bs4
import time
import pandas as pd
import numpy as np
from psw import psw, usr

## Helper functions for processing HTML code

In [None]:
def process_soup(soup, league, season):
    """
    Extract one record per match row from a rendered results page.

    Input:
        soup: bs4.BeautifulSoup element (HTML source code rendered by the selenium webdriver)
        league: string, e.g. NBA or Euro
        season: string, e.g. 2017/2018 or 2013/2014
    Output:
        List of lists. Each inner list has 9 elements:
        [league, season, home score, away score, who won (1: home, 0: away),
         was overtime (1: Yes, 0: No), home team coef., away team coef., match url]
        An empty list is returned (and a message printed) when the page has no <tbody>.
    """
    body = soup.tbody
    if body is None:
        # A page without a results table must not abort the whole scrape
        print("No results table found for {} {}".format(league, season))
        return []

    _data = list()
    for row in body.find_all('tr'):
        # only rows with exactly 6 children are treated as match rows
        if len(row.contents) != 6:
            continue
        # get score string
        score = row.find("td", {"class": "center bold table-odds table-score"}).text
        # get home and away prices; an unparsable price becomes NaN instead of raising
        home_price = convert_2_float(row.contents[-3].text)
        away_price = convert_2_float(row.contents[-2].text)
        # read url for detailed match coefficient analysis
        link = row.contents[1].find('a', href=True)
        href = "https://www.oddsportal.com" + link['href']

        _data.append([league, season] + process_score(score) + [home_price, away_price, href])
    return _data


def process_score(string):
    """
    Split a score string into home/away points, winner flag and overtime flag.

    Input:
        score string, e.g. "124:114 OT" or "122:104". OT (overtime)
    Output:
        list [home score, away score, who won (1: home, 0: away), was overtime (1: Yes, 0: No)]
        All four entries are NAN when the string is not of the form "<home>:<away>".
        The winner flag is NAN when either score could not be converted to an integer.
    """
    score = string.strip()
    # check whether there was overtime; tolerate a missing or extra space before "OT"
    if score.endswith("OT"):
        _ot = 1
        score = score[:-2].strip()
    else:
        _ot = 0

    parts = score.split(":")
    if len(parts) != 2:
        print("Teams names: {}".format(string))
        return [np.nan, np.nan, np.nan, np.nan]

    # Extract final score for home and away teams
    home_score = convert_2_int(parts[0])
    away_score = convert_2_int(parts[1])

    # check who won match; comparisons with NAN are always False, so without
    # this guard an unreadable score would be recorded as an away win
    if np.isnan(home_score) or np.isnan(away_score):
        _win = np.nan
    elif home_score > away_score:
        _win = 1
    else:
        _win = 0

    return [home_score, away_score, _win, _ot]


def convert_2_int(string):
    """
    Parse a string as an integer.

    Input:
        string: e.g. "123" or "93"
    Output:
        int, or NAN when int() rejects the string with a ValueError
        (the problematic string is printed in that case)
    """
    try:
        value = int(string)
    except ValueError:
        value = np.nan
        print("Input {} could not be converted to integer".format(string))
    return value
    
def convert_2_float(string):
    """
    Parse a string as a floating point number.

    Input:
        string: e.g. "1.95" or "93"
    Output:
        float, or NAN when float() rejects the string with a ValueError
        (the problematic string is printed in that case)
    """
    try:
        number = float(string)
    except ValueError:
        message = "Input {} could not be converted to float".format(string)
        print(message)
        number = np.nan
    return number

## Initiate webdriver
`executable_path` points to the executable used to connect to Firefox. To use a different browser, download the appropriate driver in place of geckodriver (hyperlink) or refer to this Stack Exchange post.

In [None]:
%%time
# Start a Firefox session driven through the local geckodriver binary.
# NOTE(review): `executable_path` is only accepted by older Selenium releases;
# newer ones expect a Service object instead - confirm against the installed version.
driver = webdriver.Firefox(executable_path=r"geckodriver.exe")

# go to Oddsportal website
driver.get("https://www.oddsportal.com")

# click on log-in button (the first <button> element on the page)
# find_element(By...) is used because the find_element_by_* helpers are not
# available in recent Selenium releases, while this form works in old and new ones
driver.find_element(By.TAG_NAME, 'button').click()

# enter User name and psw, then submit the form with ENTER
driver.find_element(By.ID, 'login-username1').send_keys(usr)
driver.find_element(By.ID, 'login-password1').send_keys(psw, Keys.ENTER)

# set timeout for page loading to 30 sec (applies to the driver.get calls that follow)
driver.set_page_load_timeout(30)

## Load NBA and Euroleague 2013-2018 season data
* The `season_dict` dictionary keys are league and season names, while the values are tuples (url, number of pages to iterate over).
* Iterate over the season URL pages and read the points scored by each team and the home/away team average coefficients
* transform data into pandas DataFrame and save it as .csv file

In [None]:
%%time
# Maps "<LEAGUE>_<season>" to (results URL prefix, number of result pages to visit).
# The page number is appended to the URL prefix for each request.
season_dict = {
    "NBA_2017/2018": ("https://www.oddsportal.com/basketball/usa/nba-2017-2018/results/#/page/", 28),
    "NBA_2016/2017": ("https://www.oddsportal.com/basketball/usa/nba-2016-2017/results/#/page/", 29),
    "NBA_2015/2016": ("https://www.oddsportal.com/basketball/usa/nba-2015-2016/results/#/page/", 29),
    "NBA_2014/2015": ("https://www.oddsportal.com/basketball/usa/nba-2014-2015/results/#/page/", 29),
    "NBA_2013/2014": ("https://www.oddsportal.com/basketball/usa/nba-2013-2014/results/#/page/", 29),
    "EURO_2017/2018": ("https://www.oddsportal.com/basketball/europe/euroleague-2017-2018/results/#/page/", 6),
    "EURO_2016/2017": ("https://www.oddsportal.com/basketball/europe/euroleague-2016-2017/results/#/page/", 6),
    "EURO_2015/2016": ("https://www.oddsportal.com/basketball/europe/euroleague-2015-2016/results/#/page/", 5),
    "EURO_2014/2015": ("https://www.oddsportal.com/basketball/europe/euroleague-2014-2015/results/#/page/", 6),
    "EURO_2013/2014": ("https://www.oddsportal.com/basketball/europe/euroleague-2013-2014/results/#/page/", 6)
}

all_data = list()
for key, (url, page_count) in season_dict.items():
    # Extract league and season from key string, e.g. "NBA_2017/2018" -> "NBA", "2017/2018"
    league, season = key.split("_")
    # Iterate over all pages for particular season (pages are numbered from 1)
    for idx in range(1, page_count + 1):
        # Load page
        driver.get(url + str(idx))
        # quick and dirty fix: fixed 5 sec pause so that the page content is really rendered
        time.sleep(5)
        # Process HTML into data. The parser is named explicitly so that the result
        # does not depend on which optional parsers happen to be installed.
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        all_data += process_soup(soup, league, season)

# One row per match; persisted so later cells can reload it without re-scraping
df_1 = pd.DataFrame(all_data, columns=["League", "Season", "Home_score", "Away_score",
                                       "Win", "OT", "Home_p", "Away_p", "URL"])
df_1.to_csv("basketball_scores.csv")
df_1.head()

In [None]:
# load data
# Re-read the saved scores from disk so the cells below can run without repeating the scrape.
# index_col=0 makes the first CSV column the DataFrame index.
df_1 = pd.read_csv("basketball_scores.csv", index_col=0)
df_1.head()

In [None]:
# check data
# Number of non-null URL entries per (League, Season) group, as a sanity check of the loaded data.
df_1.groupby(["League", "Season"])["URL"].count()

## Analyze matches in more detail

In [None]:
def get_ah_ou_coef(soup):
    """
    Returns the asian handicap or over/under total line which was offered by the
    largest number of book makers, together with its two prices.
    Same logic applies for both types of page.
    input:
        soup: bs4.BeautifulSoup element (HTML source code rendered by the selenium webdriver)
    output:
        list: [line label (text of the row's <strong> tag), first price, second price]
        Prices that cannot be converted to float are returned as NAN.
    raises:
        ValueError: if the "odds-data-table" div is missing, or if no row with a
        positive book maker count is found
    """
    table = soup.find("div", {"id": "odds-data-table"})
    if table is None:
        raise ValueError("odds-data-table not found in page source")

    best = None
    max_book_count = 0
    for row in table.find_all("div", {"class": "table-container"}):
        if row.text == "BETTING EXCHANGES":
            continue
        counter = row.find("span", {"class": "odds-cnt"})
        if counter is None:
            # row carries no book maker count, so it cannot be ranked
            continue
        try:
            # counter text is wrapped in one leading and one trailing character, e.g. "(12)"
            odd_count = int(counter.text[1:-1])
        except ValueError:
            continue
        if odd_count > max_book_count:
            max_book_count = odd_count
            spans = row.find_all('span')
            best = [row.strong.text,
                    convert_2_float(spans[1].text),
                    convert_2_float(spans[2].text)]

    if best is None:
        raise ValueError("no odds row with a book maker count found")
    return best

In [None]:
# Select the league/season whose matches should be analysed in detail.
_league = "NBA"
_season = "2016/2017"
# Output file name derived from the selection, e.g. "NBA_2016_2017.csv"
# ("/" is replaced because it is not usable inside a file name).
filename = _league + "_" + _season.replace("/","_") + ".csv"
# Boolean masks for the chosen league and season; both must hold for a row to be kept.
cond_1 = df_1.League == _league
cond_2 = df_1.Season == _season
# NOTE(review): the detail-scraping loop further down iterates `missing_df`, not `df_c` - confirm which is intended.
df_c = df_1[cond_1 & cond_2]
df_c.head()

## Read NBA or Euro League detailed stats

In [None]:
%%time

# Column layout shared by the checkpoint files and the final export
detail_columns = ["AH", "AH_Home", "AH_Away",
                  "OU", "Over", "Under",
                  "Home_name", "Away_name", "Date", "URL"]
file_prefix = _league + "_" + _season.replace("/", "_")

wait = WebDriverWait(driver, 20)


def _read_market(tab_label, failure_message):
    """
    Click the odds tab whose <span> text contains `tab_label` on the page currently
    loaded in the global `driver` and return get_ah_ou_coef() of the resulting page.
    On any failure the exception and `failure_message` are printed and
    [NAN, NAN, NAN] is returned, so one bad match does not stop the scrape.
    """
    try:
        # find_element(By...) works in old and new Selenium releases
        tab_xpath = "//span[contains(text(), '{}')]".format(tab_label)
        driver.find_element(By.XPATH, tab_xpath).click()
        wait.until(EC.element_to_be_clickable((By.ID, 'odds-data-table')))
        # parser named explicitly so results do not depend on installed optional parsers
        market_soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        return get_ah_ou_coef(market_soup)
    except Exception as e:
        print(e)
        print(failure_message)
        return [np.nan, np.nan, np.nan]


all_data = list()
# NOTE(review): `missing_df` is not defined in the visible part of this notebook;
# it must exist (with a URL column) before this cell is run - confirm its origin.
for i, url in enumerate(missing_df.URL.values, start=1):
    try:
        driver.get(url)
    except Exception:
        # best effort: keep going and parse whatever the browser currently shows
        print("page not loaded")
    # get soup
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    # get team names from the "<home> - <away>" page heading
    try:
        names = soup.h1.text.split(" - ")
        name_h = names[0]
        name_a = names[1]
    except (AttributeError, IndexError) as e:
        # no <h1> at all, or a heading without the " - " separator
        name_h = np.nan
        name_a = np.nan
        print(e)
        print("Team names not found")
    # get match date
    try:
        match_date = soup.find("div", {"id": "col-content"}).p.text
    except AttributeError:
        # the container or its <p> is missing
        match_date = np.nan
    # asian handicap first, then over/under totals (same order as the columns)
    ah_prices = _read_market("AH", "AH prices not read")
    ou_prices = _read_market("O/U", "OU prices not read")
    # add new data
    all_data.append(ah_prices + ou_prices + [name_h, name_a, match_date, url])
    # save temporary data after every 100 processed matches
    if i % 100 == 0:
        # make temporal save in case webdriver crashes, loss of internet connection, etc.
        # The file holds everything collected so far; its name carries the range just completed.
        filename = file_prefix + "_" + str(i - 100) + "_" + str(i) + ".csv"
        pd.DataFrame(all_data, columns=detail_columns).to_csv(filename)

# final export, named after the total number of processed matches
filename = file_prefix + "_" + str(len(all_data)) + ".csv"
df_2 = pd.DataFrame(all_data, columns=detail_columns)
df_2.to_csv(filename)
df_2.head()

## Turn off webdriver

In [None]:
# End the webdriver session that was started in the "Initiate webdriver" cell.
driver.quit()