# Import packages
Oddsportal has dynamic content, i.e. its pages are rendered with JavaScript. I used the `selenium` package to load and render the URL content with Mozilla Firefox, then passed the page source to `BeautifulSoup` to extract the information of interest.

In [1]:
import time
from datetime import datetime

import bs4
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from psw import psw, usr

### Helper functions

In [2]:
def get_unique_href(soup, league, season):
    """
    Collect the match links found in a results table.

    Input:
        soup: bs4.BeautifulSoup element (page source rendered by the selenium webdriver)
        league: string, e.g. NBA or Euro
        season: string, e.g. 2017/2018 or 2013/2014
    Output:
        List of lists, one entry per table row that has exactly 7 children
        and whose second child contains a link:
        [league, season, absolute URL of the match page]
    """
    base_url = "https://www.oddsportal.com"
    links = []
    for table_row in soup.tbody.findAll('tr'):
        # only rows with exactly 7 children are treated as match rows
        if len(table_row.contents) != 7:
            continue
        anchor = table_row.contents[1].find('a', href=True)
        if anchor is None:
            continue
        # hrefs in the table are site-relative, so prefix the host
        links.append([league, season, base_url + anchor['href']])
    return links

def convert_2_int(string):
    """
    Parse a string as an integer.

    Input:
        string: e.g. "123" or "93"
    Output:
        int, or NaN when the text is not a valid integer (the offending
        input is printed in that case)
    """
    try:
        value = int(string)
    except ValueError:
        print("Input {} could not be converted to integer".format(string))
        value = np.nan
    return value
    
def convert_2_float(string):
    """
    Parse a string as a floating point number.

    Input:
        string: e.g. "1.85" or "93"
    Output:
        float, or NaN when the text is not a valid number (the offending
        input is printed in that case)
    """
    try:
        value = float(string)
    except ValueError:
        print("Input {} could not be converted to float".format(string))
        value = np.nan
    return value

## Initiate webdriver
`executable_path` points to the executable used to connect to Firefox. To use a different browser, download the appropriate driver executable (geckodriver is the one for Firefox; hyperlink) or refer to this Stack Exchange post.

In [3]:
%%time
# Start a Firefox session through the geckodriver executable.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object; it is left untouched here - confirm the installed version.
driver = webdriver.Firefox(executable_path=r"geckodriver.exe")

# go to Oddsportal website
driver.get("https://www.oddsportal.com")

# click on log-in button (first <button> element on the page)
# find_element(By..., ...) is used instead of the find_element_by_* helpers,
# which were removed in Selenium 4.3; this form works on Selenium 3 and 4
driver.find_element(By.TAG_NAME, 'button').click()

# enter User name and psw; ENTER submits the login form
driver.find_element(By.ID, 'login-username1').send_keys(usr)
driver.find_element(By.ID, 'login-password1').send_keys(psw, Keys.ENTER)

# set timeout for page loading to 30 sec
driver.set_page_load_timeout(30)

# set wait element for explicit wait (gives up after 10 sec)
wait = WebDriverWait(driver, 10)

Wall time: 17.9 s


## Load specific football league season data

In [4]:
# Labels stored with every scraped row and used in the output file names
league = "EPL"
season = "2015_2016"
# Base address of the season's result pages; the page number is appended to it
season_url = "https://www.oddsportal.com/soccer/england/premier-league-2015-2016/results/#/page/"
# number of pages for the season
no_pages = 8

In [5]:
%%time
# accumulates one [league, season, url] entry per match
_data = []

for idx in range(1, no_pages + 1):
    # result pages are addressed by appending the page number to the season URL
    driver.get(f"{season_url}{idx}")
    # block until the results table is visible
    wait.until(EC.visibility_of_element_located((By.ID, 'tournamentTable')))
    # parse the rendered HTML and pull out the match links
    soup = bs4.BeautifulSoup(driver.page_source)
    _data.extend(get_unique_href(soup, league, season))

# one row per match URL
df_urls = pd.DataFrame(_data, columns=["League", "Season", "URL"])

Wall time: 27.4 s


#### Helper functions for analysing match info

In [6]:
def get_team_names(soup):
    """
    Read the two team names from the page's <h1> heading.

    Input:
        soup: BS4 soup element
    Output:
        tuple (first name, second name) taken from the heading text split
        on " - "; (NaN, NaN) if the heading cannot be read or split, in
        which case the error is printed
    """
    try:
        parts = soup.h1.text.split(" - ")
        home, away = parts[0], parts[1]
    except Exception as e:
        print(e)
        print("Team names not found")
        home = away = np.nan
    return home, away
    
            
def load_page(driver, url):
    """
    Ask the driver to open `url`.

    Return True if driver.get() completed without raising, otherwise
    print the error and return False.
    """
    try:
        driver.get(url)
    except Exception as e:
        print("page not loaded")
        print(f"{e}")
        return False
    return True
    
def get_match_date(soup):
    """
    Read the match kick-off time from the page.

    Input:
        soup: BS4 soup element
    Output:
        datetime.datetime parsed from the text of the first <p> inside the
        element with id "col-left" (expected format e.g.
        "Sunday, 15 May 2016, 16:00"), or NaN if the element is missing or
        the text does not match that format
    """
    try:
        match_date = soup.find("div", {"id": "col-left"}).p.text
        return datetime.strptime(match_date, '%A, %d %b %Y, %H:%M')
    # Only the expected lookup/parsing failures are turned into NaN; a bare
    # `except:` would also swallow KeyboardInterrupt and make a long scraping
    # run impossible to stop.
    except (AttributeError, ValueError, TypeError):
        return np.nan
       
def get_score(soup):
    """
    Read full-time and half-time scores from the <p class="result"> element.

    Input:
        soup: BS4 soup element
    Output:
        list [score_home_FT, score_away_FT, score_home_HT, score_away_HT].
        Individual values that are not integers become NaN (see
        convert_2_int). If the result element is missing or does not have
        the expected layout, a message is printed and four NaN values are
        returned, in line with the other page helpers, so one unusual match
        page does not abort a whole scraping run.
    """
    missing = [np.nan] * 4

    element = soup.find("p", {"class": "result"})
    if element is None:
        print("Score not found")
        return missing

    try:
        # full-time score: text of the second child, "home:away"
        score_FT = element.contents[1].text.split(":")
        # half-time score: first comma-separated item of the third child
        score_HT = element.contents[2].split(",")[0].split(":")
        return [
            convert_2_int(score_FT[0]),
            convert_2_int(score_FT[1]),
            # the first two characters precede the home half-time score
            convert_2_int(score_HT[0][2:]),
            convert_2_int(score_HT[1]),
        ]
    except (AttributeError, IndexError, TypeError) as e:
        print(e)
        print("Score could not be parsed")
        return missing
        
def norm_prop(list_of_odds):
    """
    Turn odds into implied probabilities that sum to one.

    Input:
        list_of_odds: list of odds
    Output:
        numpy array holding 1/odds for every entry, divided by the sum of
        all 1/odds values
    """
    inverse_odds = np.divide(1, np.array(list_of_odds))
    return inverse_odds / inverse_odds.sum()

def calculate_stats(stats):
    """
    Summarise a list of numbers.

    Input:
        stats: list of numeric values
    Output:
        list, [max, avg, median, std], each rounded to 3 decimals.
        An empty input gives four NaN values instead of raising (numpy's
        max() has no result for a zero-size array).
    """
    stats = np.array(stats)
    if stats.size == 0:
        return [np.nan] * 4

    _max = stats.max().round(3)
    _avg = stats.mean().round(3)
    _median = np.median(stats).round(3)
    _std = stats.std().round(3)

    return [_max, _avg, _median, _std]

def process_1x2_data(soup):
    """
    Summarise the 1x2 (home / draw / away) odds table of a match page.

    Input:
        soup: bs4 element
    Output:
        list of three lists, one per outcome, in the order home, draw, away:
        [label, Pinnacle value, Pinnacle odds, max odds,
         max, avg, median, std of the per-bookmaker values]
        where "value" is the quantity computed from each bookmaker's odds
        below. Pinnacle entries are NaN when no row is named "Pinnacle".
        When the page has no <tbody> or no bookmaker rows, every numeric
        entry is NaN instead of the function raising.
    """
    labels = ("home", "draw", "away")

    # values for the one bookmaker that is reported separately
    pinn_odds = [np.nan] * 3
    pinn_prop = [np.nan] * 3

    # per-outcome collections, indexed like `labels`
    odds_by_outcome = [[], [], []]
    props_by_outcome = [[], [], []]

    table_body = soup.tbody
    rows = table_body.findAll("tr") if table_body is not None else []
    for row in rows:
        content = row.findAll("td")
        # only rows with exactly 6 cells are treated as bookmaker rows
        if len(content) != 6:
            continue

        # cell 0 is the bookmaker name, cells 1-3 the home/draw/away odds
        book_name = content[0].text.replace(u'\xa0', u'')
        _odds = [convert_2_float(cell.text) for cell in content[1:4]]

        # NOTE(review): this is the reciprocal of the normalised implied
        # probability returned by norm_prop, i.e. an odds-like number with
        # the bookmaker margin removed rather than a probability - confirm
        # this is what the "prop" values are meant to hold.
        _props = (1/norm_prop(_odds)).round(2)

        for idx in range(3):
            odds_by_outcome[idx].append(_odds[idx])
            props_by_outcome[idx].append(_props[idx])

        # read specific bookmaker data
        if book_name == "Pinnacle":
            pinn_prop = _props.tolist()
            pinn_odds = _odds

    _data = list()
    for idx, label in enumerate(labels):
        if odds_by_outcome[idx]:
            odds_max = np.array(odds_by_outcome[idx]).max()
            outcome_stats = calculate_stats(props_by_outcome[idx])
        else:
            # no bookmaker rows were found: nothing to aggregate
            odds_max = np.nan
            outcome_stats = [np.nan] * 4
        _data.append([label, pinn_prop[idx], pinn_odds[idx], odds_max] + outcome_stats)
    return _data


In [7]:
def refresh_driver(driver):
    """
    Turns off and on webdriver

    Input:
        driver: running selenium webdriver; it is quit by this call
    Output:
        (driver, wait): a new, logged-in Firefox webdriver and a
        WebDriverWait bound to it with a 10 second timeout
    """
    # turn off driver
    driver.quit()

    # initiate again driver
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object; left untouched here - confirm the installed version.
    driver = webdriver.Firefox(executable_path=r"geckodriver.exe")

    # go to Oddsportal website
    driver.get("https://www.oddsportal.com")

    # click on log-in button (first <button> element on the page)
    # find_element(By..., ...) is used instead of the find_element_by_* helpers,
    # which were removed in Selenium 4.3; this form works on Selenium 3 and 4
    driver.find_element(By.TAG_NAME, 'button').click()

    # enter User name and psw; ENTER submits the login form
    driver.find_element(By.ID, 'login-username1').send_keys(usr)
    driver.find_element(By.ID, 'login-password1').send_keys(psw, Keys.ENTER)

    # set timeout for page loading to 30 sec
    driver.set_page_load_timeout(30)

    # set wait element for explicit wait
    wait = WebDriverWait(driver, 10)

    return driver, wait

In [8]:
# Restart the browser session; both names are rebound to the new objects
driver, wait = refresh_driver(driver)

In [9]:
%%time
# Column layout of one output row: match metadata, the four scores, then the
# eight per-outcome values returned by process_1x2_data
event_columns = ["League", "Season", "Date", "Home_name", "Away_name", "URL",
                 "Score_home_FT", "Score_away_FT", "Score_home_HT", "Score_away_HT", "Type",
                 "Pinn_prop", "Pinn_odds", "Odds_max",
                 "Prop_max", "Prop_avg", "Prop_median", "Prop_STD"]

# temporary list to store data
_data = list()
# URLs that could not be scraped, one single-element list per URL
failed_urls = list()

# only the first 30 match URLs are processed
urls = df_urls.URL.values[:30]

for i, url in enumerate(urls, start=1):
    if load_page(driver, url):
        try:
            # wait till page is loaded
            wait.until(EC.visibility_of_element_located((By.ID , "odds-data-table")))
        except TimeoutException:
            # the odds table never became visible: record the URL and carry
            # on instead of aborting the whole run
            print("odds table not loaded")
            failed_urls.append([url])
        else:
            # get soup element
            soup = bs4.BeautifulSoup(driver.page_source)
            # read team names
            home_n, away_n = get_team_names(soup)
            # get match date
            match_date = get_match_date(soup)
            # get score
            score = get_score(soup)

            # get 1x2 stats: one list per outcome (home, draw, away)
            odds_rows = process_1x2_data(soup)

            # add all data: three rows per match, one per outcome
            match_info = [league, season, match_date, home_n, away_n, url]
            for odds_row in odds_rows:
                _data.append(match_info + score + odds_row)
    else:
        failed_urls.append([url])

    if i%5 == 0:
        # create DataFrame
        df_events = pd.DataFrame(_data, columns=event_columns)

        # make temporary save in case webdriver crashes, loss of internet connection, etc.
        filename = league + "_" + season + "_" + str(i)+ ".csv"

        # save temporary file
        df_events.to_csv(filename)

        # restart the browser session every 5 matches
        driver, wait = refresh_driver(driver)

# create DataFrame
df_events = pd.DataFrame(_data, columns=event_columns)

# final save; the file name carries the number of URLs that were processed
# (the loop counter used to be one past the last URL at this point)
filename = league + "_" + season + "_" + str(len(urls))+ ".csv"

# save final file
df_events.to_csv(filename)

Wall time: 3min


In [None]:
# _data

# Rebuild the events frame from the rows currently held in _data
df_events = pd.DataFrame(_data, columns=["League", "Season", "Date", "Home_name", "Away_name", "URL",
                                        "Score_home_FT", "Score_away_FT", "Score_home_HT", "Score_away_HT", "Type",
                                        "Pinn_prop", "Pinn_odds", "Odds_max",
                                        "Prop_max", "Prop_avg", "Prop_median", "Prop_STD"])

# show the first rows
df_events.head()

In [None]:
# Shut down the webdriver session
driver.quit()

In [None]:
# Scratch check: display the first collected row
_data[0]

In [None]:
# Scratch check: re-parse the page currently held by the driver and run the
# 1x2 summary on it.
# NOTE(review): an earlier cell calls driver.quit(), so on a top-to-bottom run
# the driver session is already closed when page_source is requested here.
soup = bs4.BeautifulSoup(driver.page_source)
_ = process_1x2_data(soup)

In [None]:
# Scratch check: display the first collected row
_data[0]

In [None]:
# Scratch check: re-parse the page currently held by the driver and display
# the parsed scores.
# NOTE(review): an earlier cell calls driver.quit(), so on a top-to-bottom run
# the driver session is already closed when page_source is requested here.
soup = bs4.BeautifulSoup(driver.page_source)
get_score(soup)

In [None]:
# Scratch check: team names left over from the scraping loop
home_n, away_n

In [None]:
# Scratch check: match date left over from the scraping loop
match_date