# Web scraping: Tipico live sports betting 

This notebook uses the BeautifulSoup library to scrape information about live soccer games from tipico.de.
It stores the scraped information in pandas DataFrames and exports them to CSV files for further analysis.

## Import relevant libraries

In [188]:
import time
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from bs4 import BeautifulSoup
import urllib.request
import re

## Load and parse Tipico website source code

In [311]:
url = "https://www.tipico.de/de/live-wetten/"

# Download and parse the page. urllib's URLError/HTTPError are OSError subclasses.
# The error is reported and re-raised: continuing without a response would only
# fail one line later with an unrelated NameError.
try:
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')
except OSError as err:
    print("An error occured.", err)
    raise

# Show only the beginning of the document; the full page source is very long.
print(str(soup)[:1000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" >

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:e="ebet/e">
<head>
<link class="component" href="/a4j/s/3_3_1.GAorg/richfaces/renderkit/html/css/basic_classes.xcss/DATB/eAF7sqpgb-jyGdIAFrMEaw__.faces" rel="stylesheet" type="text/css"/><link class="component" href="/a4j/s/3_3_1.GAorg/richfaces/renderkit/html/css/extended_classes.xcss/DATB/eAF7sqpgb-jyGdIAFrMEaw__.faces" media="rich-extended-skinning" rel="stylesheet" type="text/css"/><script src="/a4j/g/3_3_1.GAorg.ajax4jsf.javascript.AjaxScript.faces" type="text/javascript"></script><script src="/a4j/g/3_3_1.GAorg/ajax4jsf/javascript/scripts/form.js.faces" type="text/javascript"></script><script id="org.ajax4jsf.queue_script" type="text/javascript">if (typeof A4J != 'undefined') { if (A4J.AJAX) { with (A4J.AJAX) {if (!EventQueue.getQueue('topSearchForm:headQueue')) { EventQueue.addQueue(new EventQueue('topSearchF

## Extract relevant information from source code

In [312]:
# The odds are rendered as <button> elements whose class attribute matches this pattern.
regex = re.compile('c_but_base c_but')
content_lis = soup.find_all('button', class_=regex)
print(content_lis)

[<button class="c_but_base c_but" name="ql44352866410" onclick="javascript:tr(44352866410,'WEB_LIVE_LIVEGAMES')" result_pk="44352866410" tabindex="" type="button">
			30
		</button>, <button class="c_but_base c_but" name="ql44352866510" onclick="javascript:tr(44352866510,'WEB_LIVE_LIVEGAMES')" result_pk="44352866510" tabindex="" type="button">
			10
		</button>, <button class="c_but_base c_but" name="ql44352866610" onclick="javascript:tr(44352866610,'WEB_LIVE_LIVEGAMES')" result_pk="44352866610" tabindex="" type="button">
			1,07
		</button>, <button class="c_but_base c_but" name="ql44398525110" onclick="javascript:tr(44398525110,'WEB_LIVE_LIVEGAMES')" result_pk="44398525110" tabindex="" type="button">
			2,8
		</button>, <button class="c_but_base c_but" name="ql44398524910" onclick="javascript:tr(44398524910,'WEB_LIVE_LIVEGAMES')" result_pk="44398524910" tabindex="" type="button">
			2,5
		</button>, <button class="c_but_base c_but" name="ql44398525010" onclick="javascript:tr(44398525

In [313]:
# Keep only the button text, without the newlines and tabs of the page layout.
content = [
    button.getText().replace("\n","").replace("\t","")
    for button in content_lis
]
print(content)

['30', '10', '1,07', '2,8', '2,5', '2,9', '2,3', '3,8', '2,3', '2,15', '1,6', '3,3', '2,4', '2,6', '3,3', '2,4', '2,6', '2,5', '3,8', '2,1', '1,95', '1,7', '8,0', '1,17', '7,0', '8,0', '1,17', '7,0', '8,0', '1,2', '6,5', '4,0', '1,2', '1,7', '3,3', '5,0', '1,7', '3,3', '5,0', '1,55', '6,0', '3,3', '1,6', '2,2', '3,0', '1,55', '7,0', '3,0', '1,55', '7,0', '2,8', '1,65', '6,0', '2,0', '1,65', '1,35', '', '2,9', '', '', '', '', '', '', '3,2', '', '1,3', '4,5', '', '1,17', '2,55', '1,45', '1,03', '', '7,5', '', '', '', '', '', '', '1,01', '', '9,5', '', '', '', '', '', '', '2,8', '', '1,4', '5,0', '20', '1,15', '1,8', '1,85', '1,85', '1,85', '5,2', '', '1,12', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '6,0', '', '1,08', '6,0', '1,08', '1,35', '', '3,0', '5,5', '', '1,12', '2,4', '1,5', '3,0', '', '1,3', '', '', '', '3,5', '1,25', '1,02', '', '9,0', '1,12', '', '5,0', '2,7', '1,4', '4,5', '3,8', '1,55', '4,7', '2,2', '2,0', '1,6', '2,2', '2,45', '2,9', '3,1', '

In [314]:
# Player/team names live in <div> elements whose class attribute contains "team".
regex_names = re.compile('team')
content_lis_names = soup.find_all('div', class_=regex_names)
print(content_lis_names)

[<div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Mexiko - Liga Premier',0)">Cruz Azul Hidalgo</div>, <div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Mexiko - Liga Premier',0)">Irapuato</div>, <div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Paraguay - Primera Division, Clausura',0)">Dep. Capiata</div>, <div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Paraguay - Primera Division, Clausura',0)">River Plate Asuncion</div>, <div class="team">1. Halbzeit</div>, <div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Nicaragua - Primera Division, Apertura',0)">Dep. Walter Ferreti</div>, <div class="team redcard-holder redcard_0 " onmouseout="untip()" onmouseover="tip('Fu\u00DFball - Nicaragua - Primera Division, Apertura',0)">Las Sabanas CF</div>, <div cla

In [315]:
# Use the text in front of the first newline+tab sequence as the name.
content_names = [
    name_div.getText().split('\n\t')[0]
    for name_div in content_lis_names
]
print(content_names)

['Cruz Azul Hidalgo', 'Irapuato', 'Dep. Capiata', 'River Plate Asuncion', '1. Halbzeit', 'Dep. Walter Ferreti', 'Las Sabanas CF', '1. Halbzeit', 'Hsieh S-W/Hsieh Y-C', 'Kichenok N/Spears ', 'Doi, M.', 'Pavlyuchenkova, A.', 'Caruana, L.', 'Hohmann, R.', 'Sarkissian, A.', 'Ignat, D. C.', 'Flamengo', 'CA San Lorenzo de Almagro', 'Chicago Cubs', 'St. Louis Cardinals', 'Alftanes', 'Afturelding', 'Ginasio', 'SC Espinho', 'Northwest. Wildcats (F)', 'Loyala Unv. Chicago (F)', 'Miami Redhawks (F)', 'Illinois State Redbirds (F)', 'Minas Tenis Clube U19', 'Botafogo C.R. U19', 'Villa San Carlos (F)', 'Rosario Central (F)', 'Union Santa Fe', 'Rosario Central', 'Sao Judas', 'Sao Jose dos C.', 'Criciuma EC', 'Atletico GO', 'Loros Universidad', 'Atlante FC', 'Greenville T. SC', 'FC Toronto II', 'Dor. de Sinaloa', 'CF Cafetaleros', 'Mississauga Steel.', 'Niagara Icedogs', 'New Jersey Devils', 'New York Rangers', 'Funvic/Taubate', 'Volei Ribeirao', 'Baltimore Orioles', 'Seattle Mariners', 'New York Yank

In [316]:
# The game clock is rendered in <div> elements matching this class pattern.
regex_minute = re.compile('c_1 time pulsation')
content_lis_minute = soup.find_all('div', class_=regex_minute)
print(content_lis_minute)

[<div class="c_1 time pulsation">
	
		
		
		
		
			
			HZ				
		
	
</div>, <div class="c_1 time pulsation">
	
		
		
		
		
			
			37'				
		
	
</div>, <div class="c_1 time pulsation"> </div>, <div class="c_1 time pulsation">
	
		
		
		
		
			
			24'				
		
	
</div>, <div class="c_1 time pulsation"> </div>, <div class="c_1 time pulsation">
<img border="0" src="/img/sporticons/icon_rain-26694EBCC12D742D4EBA6D2AE06E71DD.png" style="width: 38px; height: 24px;"/>
</div>, <div class="c_1 time pulsation">
<img border="0" src="/img/sporticons/icon_rain-26694EBCC12D742D4EBA6D2AE06E71DD.png" style="width: 38px; height: 24px;"/>
</div>, <div class="c_1 time pulsation">
<img border="0" src="/img/sporticons/icon_rain-26694EBCC12D742D4EBA6D2AE06E71DD.png" style="width: 38px; height: 24px;"/>
</div>, <div class="c_1 time pulsation">
<img border="0" src="/img/sporticons/icon_rain-26694EBCC12D742D4EBA6D2AE06E71DD.png" style="width: 38px; height: 24px;"/>
</div>, <div class="c_1 time pulsation">
<div cla

In [317]:
# Strip layout whitespace, non-breaking spaces and the minute apostrophe,
# then drop the entries that are empty after cleaning.
content_minute = [
    minute_text
    for minute_text in (
        clock_div.getText().replace("\n","").replace("\t","").replace("\xa0","").replace("'","")
        for clock_div in content_lis_minute
    )
    if minute_text != ""
]
print(content_minute)

['HZ', '37', '24', '1.P', '9/9', '3.Set', '3.Set', '2.Set', '1.Set', '1.Set']


In [76]:
# Scores are rendered in <div> elements whose class attribute contains "score".
regex_score = re.compile('score')
content_lis_score = soup.find_all('div', class_=regex_score)
print(content_lis_score)

[<div class="score ">
<span class="">0</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">2</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">2</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">0</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">2</span>:<span class="">1</span>
</div>, <div class="score ">
<span class="">0</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">0</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">2</span>:<span class="">1</span>
</div>, <div class="score ">
<span class="">3</span>:<span class="">1</span>
</div>, <div class="score ">
<span class="">2</span>:<span class="">1</span>
</div>, <div class="score ">
<span class="">1</span>:<span class="">0</span>
</div>, <div class="score ">
<span class="">0</span>:<span class="">1</span>
</div>, <div class="score ">
<span class="">0</span>:<span class="">0</span>
</div>

In [31]:
# Reduce every score element to its text without layout whitespace.
content_score = [
    score_div.getText().replace("\n","").replace("\t","").replace("\xa0","")
    for score_div in content_lis_score
]
print(content_score)

now contents
['0:0', '2:0', '2:0', '0:0', '1:0', '0:0', '0:0', '2:1', '2:1', '1:1', '1:0', '0:1', '0:0', '1:0', '0:0', '0:0', '1:1', '0:1', '1:1', '0:0', '0:1', '0:1', '0:0', '0:0', '0:0', '0:0', '0:0', '0:0', '0:0', '1:0', '0:0', '0:0', '0:0', '0:1', '    5:6  ', '0:0', '3  6:6 2', '0:0', '    6:5  ', '0:0', '    0:2  ', '0:0', '    0:2  ', '1:1', '    4:4  ', '0:0', '5  6:6 6', '0:1', '    1:4  ', '0:0', '    5:2  ', '0:1', '    0:1  ', '0:1', '    1:3  ', '0:0', '    2:1  ', '0:0', '    0:1  ', '99:94', '80:71', '27:28', '25:43', '38:30', '3:0', '0:0', '1:2', '0:0', '0:0', '0:0', '0:0', '10:11', '11:10', '9:12', '10:11', '1:2', '    17:19  ', '2:1', '    13:16  ', '1:2', '    4:3  ', '0:0', '    13:13  ', '0:1', '    13:14  ', '0:1', '    1:0  ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

## Store information in dataframe

In [134]:
N_games = 12 # number of games observed
N_players = 2 # number of players per game
N_outcomes = 11 # number of possible outcomes (Win, lose, tie, Next goal etc.)

# One row per game: timestamp, both names, minute, score, then the odds found at
# offsets 0-2 and 6-8 within the game's run of N_outcomes buttons.
df = [
    [
        datetime.now(),
        content_names[i*N_players],
        content_names[i*N_players + 1],
        content_minute[i],
        content_score[i],
        content[i*N_outcomes],
        content[i*N_outcomes + 1],
        content[i*N_outcomes + 2],
        content[i*N_outcomes + 6],
        content[i*N_outcomes + 7],
        content[i*N_outcomes + 8],
    ]
    for i in range(N_games)
]
df

[[datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'Ahi Akko',
  'Bnei Hag. VeHaGalil',
  '89',
  '0:0',
  '6,5',
  '1,18',
  '10',
  '6,0',
  '1,2',
  '8,5'],
 [datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'Watford U23',
  'Bristol City U23',
  '85',
  '2:0',
  '',
  '30',
  '120',
  '4,5',
  '1,4',
  '5,5'],
 [datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'FC Sioni Bolnisi',
  'Torpedo Kutaisi',
  '87',
  '2:0',
  '',
  '30',
  '80',
  '6,5',
  '1,2',
  '6,0'],
 [datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'Dep. Capiata Res.',
  'River Plate Asun. Res.',
  '87',
  '0:0',
  '6,0',
  '1,2',
  '10',
  '6,0',
  '1,25',
  '6,5'],
 [datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'FC Rustavi',
  'FC Lokomotive Tbilisi',
  '84',
  '1:0',
  '1,05',
  '8,0',
  '100',
  '4,3',
  '1,3',
  '8,5'],
 [datetime.datetime(2019, 9, 20, 16, 50, 53, 798616),
  'Ironi Tiberias',
  'Hapoel Bnei Zalfa',
  '86',
  '0:0',
  '8,0',
  '1,2',
  '7,0',
  '8,0',
  '1,18',
  '

In [135]:
# Label the collected rows; the column order mirrors the order of the values in each row.
pdf = pd.DataFrame(
    df,
    columns=[
        'Time', 'Player_1', 'Player_2', 'MinuteOfGame', 'Score',
        'Win_1', 'Win_X', 'Win_2',
        'NextGoal_1', 'NextGoal_X', 'NextGoal_2',
    ],
)
pdf

Unnamed: 0,Time,Player_1,Player_2,MinuteOfGame,Score,Win_1,Win_X,Win_2,NextGoal_1,NextGoal_X,NextGoal_2
0,2019-09-20 16:50:53.798616,Ahi Akko,Bnei Hag. VeHaGalil,89,0:0,65.0,118.0,10.0,60.0,12.0,85.0
1,2019-09-20 16:50:53.798616,Watford U23,Bristol City U23,85,2:0,,30.0,120.0,45.0,14.0,55.0
2,2019-09-20 16:50:53.798616,FC Sioni Bolnisi,Torpedo Kutaisi,87,2:0,,30.0,80.0,65.0,12.0,60.0
3,2019-09-20 16:50:53.798616,Dep. Capiata Res.,River Plate Asun. Res.,87,0:0,60.0,12.0,10.0,60.0,125.0,65.0
4,2019-09-20 16:50:53.798616,FC Rustavi,FC Lokomotive Tbilisi,84,1:0,105.0,80.0,100.0,43.0,13.0,85.0
5,2019-09-20 16:50:53.798616,Ironi Tiberias,Hapoel Bnei Zalfa,86,0:0,80.0,12.0,70.0,80.0,118.0,65.0
6,2019-09-20 16:50:53.798616,Mac. Kabilio Jaffa,Nordia Jerusalem,85,0:0,95.0,12.0,60.0,85.0,117.0,55.0
7,2019-09-20 16:50:53.798616,Hull City U23,Leeds Utd U23,84,2:1,125.0,40.0,20.0,55.0,15.0,36.0
8,2019-09-20 16:50:53.798616,Ho Chí Minh City,FC Sai Gon,84,2:1,,10.0,100.0,33.0,145.0,75.0
9,2019-09-20 16:50:53.798616,The Cong F.C.,B. Binh Duong,87,1:1,,,,,,


# Write results into functions

In [384]:
def get_soccer_rates_tipico(N_games = 60):
    """
    Scrape the Tipico live betting page and return the current rates as a table.

    Each row holds a timestamp, the players, the minute of the game, the score
    and the rates for each party winning and scoring the next goal. All rows of
    one call share the same timestamp.

    A "1. Halbzeit" entry in the list of names produces its own row with
    Player_1 == "1. Halbzeit" and an empty Player_2; that row reuses the minute
    and score of the preceding row.

    Arguments:
    N_games -- maximum number of rows to read from the page (default 60); fewer
               rows are returned when the page does not contain enough entries

    Returns:
    pdf -- pandas dataframe with at most N_games rows and 11 columns

    Raises:
    OSError -- if the page cannot be retrieved (urllib's URLError is a subclass)
    """

    def _clean(tag, drop):
        """Return the text of `tag` with every substring listed in `drop` removed."""
        text = tag.getText()
        for piece in drop:
            text = text.replace(piece, "")
        return text

    ## SCRAPE AND PARSE TIPICO

    # Retrieve source code from tipico live betting page and create soup
    url = "https://www.tipico.de/de/live-wetten/"

    # Report and re-raise download errors; carrying on without a response would
    # only fail later with an unrelated NameError.
    try:
        with urllib.request.urlopen(url) as page:
            soup = BeautifulSoup(page, 'html.parser')
    except OSError as err:
        print("An error occured.", err)
        raise

    # Read out betting rates
    regex = re.compile('c_but_base c_but')
    content = [_clean(li, ("\n", "\t"))
               for li in soup.find_all('button', attrs={'class': regex})]

    # Read out names of players
    regex_names = re.compile('team')
    content_names = [li.getText().split('\n\t')[0]
                     for li in soup.find_all('div', attrs={'class': regex_names})]

    # Read out minutes of each game
    # Sometimes it finds blanks, they must be skipped to keep the order
    regex_minute = re.compile('c_1 time pulsation')
    content_minute = []
    for li in soup.find_all('div', attrs={'class': regex_minute}):
        minute = _clean(li, ("\n", "\t", "\xa0", "'"))
        if minute != "":
            content_minute.append(minute)

    # Read out scores of each game
    regex_score = re.compile('score')
    content_score = [_clean(li, ("\n", "\t", "\xa0"))
                     for li in soup.find_all('div', attrs={'class': regex_score})]

    ## STORE AND RETURN RESULTS IN DATAFRAME

    N_players = 2 # number of players per game
    N_outcomes = 11 # number of possible outcomes (Win, lose, tie, Next goal etc.)
    # Positions of Win_1, Win_X, Win_2, NextGoal_1, NextGoal_X, NextGoal_2
    # within one game's run of N_outcomes rate buttons
    odds_offsets = (0, 1, 2, 6, 7, 8)

    scrape_time = datetime.now() # one timestamp for all rows of this scrape

    df = []
    k = 0 # Shift player names to correct the "1. Halbzeit" entries
    j = 0 # Shift minutes to correct the "1. Halbzeit" entries
    for i in range(N_games):
        name_idx = i*N_players + k
        odds_idx = i*N_outcomes

        # Stop as soon as the page has no more entries instead of raising an IndexError
        if name_idx >= len(content_names) or odds_idx + max(odds_offsets) >= len(content):
            break

        # When there is a "1. Halbzeit", there is only one entry for the player names and it must be shifted
        if content_names[name_idx] == "1. Halbzeit":
            j = j-1 # correct minute
            player_2 = ""
            k = k-1 # correct player name for halftime
        else:
            if name_idx + 1 >= len(content_names):
                break
            player_2 = content_names[name_idx + 1]

        if j >= len(content_minute) or j >= len(content_score):
            break

        df.append([scrape_time, content_names[name_idx], player_2, content_minute[j], content_score[j]]
                  + [content[odds_idx + offset] for offset in odds_offsets])

        j += 1

    # Load results into pandas dataframe
    pdf = pd.DataFrame(df, columns = ['Time', 'Player_1', 'Player_2', 'MinuteOfGame', 'Score', 'Win_1', 'Win_X', 'Win_2',
                                  'NextGoal_1' , 'NextGoal_X' , 'NextGoal_2' ])

    return pdf

In [386]:
# Quick check of the function: request only 5 games and display the returned table.
get_soccer_rates_tipico(N_games=5)

Unnamed: 0,Time,Player_1,Player_2,MinuteOfGame,Score,Win_1,Win_X,Win_2,NextGoal_1,NextGoal_X,NextGoal_2
0,2019-09-21 00:05:30.958600,Cruz Azul Hidalgo,Irapuato,48,0:2,30,11,106,23,36,24
1,2019-09-21 00:05:30.958600,Dep. Capiata,River Plate Asuncion,HZ,0:1,10,38,135,24,35,23
2,2019-09-21 00:05:30.958600,Dep. Walter Ferreti,Las Sabanas CF,37,1:0,112,60,20,165,43,35
3,2019-09-21 00:05:30.958600,1. Halbzeit,,37,1:0,103,11,100,47,125,10
4,2019-09-21 00:05:30.958600,Villa San Carlos (F),Rosario Central (F),4,0:1,85,70,122,26,13,15


In [389]:
def repeat_scraping(timedelay, number_of_scrapes, N_games = 60, filename = 'bet_rates_scraping_tipico.csv'):
    """
    This function repeatedly calls the scraping function to create a timeseries
    of scraping data. The time interval between scrapes and number of scrapes
    in total are taken as argument. Every scrape is appended to a csv-file,
    which is created (including the header row) if it does not exist yet.

    Arguments:
    timedelay -- delay between the start of consecutive scrape requests in seconds
                 (min. 3 sec recommended due to processing time)
    number_of_scrapes -- number of scrape requests
    N_games -- number of games passed on to get_soccer_rates_tipico
    filename -- path of the csv-file the results are appended to

    Returns:
    None
    """
    import os  # local import so this cell does not depend on the import cell changing

    ## CALL AND REPEAT SCRAPING FUNCTION
    runs = 0
    while runs < number_of_scrapes:

        start_time = time.time()
        runs += 1

        # Print progress every 10 runs
        if runs % 10 == 0 or runs == 1:
            print("Scraping run number: " + str(runs) + " / " + str(number_of_scrapes) + " (timedelay is " + str(timedelay) +")")

        pdf = get_soccer_rates_tipico(N_games)

        # Append to the csv-file instead of reading it back and rewriting it:
        # - an unreadable existing file is never silently overwritten,
        # - text is not re-decoded on every run, so non-ASCII names stay intact,
        # - the cost per run does not grow with the size of the file.
        file_exists = os.path.isfile(filename)
        if not file_exists:
            print("Couldn't find ", filename)
            print("Created as new file: ", filename)
        # The header row is only needed when the file is new or still empty
        write_header = (not file_exists) or os.path.getsize(filename) == 0
        pdf.to_csv(filename, mode='a', header=write_header, index=False)

        # Check processing time and add sleeping time to fit the timedelay;
        # never pass a negative duration (time.sleep would raise ValueError)
        time_run = time.time() - start_time
        time.sleep(max(0.0, timedelay - time_run))

# Scrape games

In [None]:
# Input: timedelay, number_of_scrapes, N_games, filename
# This call: 50 scrapes, 5 seconds apart, 4 games each, written to 'newscrape.csv'.
# For an entire game (~115 minutes) at one scrape every 10 seconds (6 scrapes/min) use 690 scrapes.
repeat_scraping(5, 50, 4, 'newscrape.csv')

Scraping run number: 1 / 50 (timedelay is 5)


# Analyse games

In [168]:
# DataFrame.to_csv writes UTF-8 by default, so the file is read back as UTF-8.
# Decoding it with 'unicode_escape' garbles non-ASCII team names.
dataframe = pd.read_csv('bet_rates_scraping_tipico.csv', encoding = 'utf-8')

In [170]:
# Preview the first rows of the loaded scraping results.
dataframe.head()

Unnamed: 0,Time,Player_1,Player_2,MinuteOfGame,Score,Win_1,Win_X,Win_2,NextGoal_1,NextGoal_X,NextGoal_2
0,2019-09-20 17:02:09.264616,FC Ufa,Spartak Moskau,31.0,1:0,16,36,60,27,50,185
1,2019-09-20 17:02:09.264616,Al-Orouba SC,Dhofar,,0:0,116,45,30,60,14,43
2,2019-09-20 17:02:09.264616,Al Suwaiq,Al Nahda,90.0,1:1,13,112,85,15,115,55
3,2019-09-20 17:02:09.264616,Atl. Tucuman Res.,Independiente Res.,80.0,0:1,50,145,50,43,155,40
4,2019-09-20 17:02:09.264616,Al-Sadd,Umm-Salal SC,75.0,2:1,15,52,12,36,19,30


In [171]:
# (rows, columns) of the loaded scraping results.
dataframe.shape

(42000, 11)