In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

In [2]:
# Season start year -> league slug that gets interpolated into the results URL.
# 2024 uses the bare 'serie-a' slug; the other seasons follow the
# 'serie-a-<start year>-<end year>' pattern.
diz_anni = {
    2024: 'serie-a',
    **{anno: f'serie-a-{anno}-{anno + 1}' for anno in (2023, 2022, 2021)},
}

# Accumulator for the scraped matches: one dict per match.
lista_quote = list()

### estrazione_dati 

The `estrazione_dati` function automates the scrolling and extraction of data from a dynamic web page (via Selenium), where soccer matches are listed with their associated 1X2 odds. The data is read from an HTML container element (`container`), stored in a dictionary (one per match), and appended to the global list `lista_quote`.

In [4]:
def estrazione_dati(container,body,wait,season):
    """Scroll the current results page and append every complete match row to ``lista_quote``.

    Each match element's text is split on newlines and read positionally:

        0 kick-off time, 1 home team, 2 home goals, 3 separator,
        4 away goals, 5 away team, 6 odds "1", 7 odds "X", 8 odds "2"

    Any further lines are ignored. Rows with fewer than nine lines are
    skipped, so an incomplete row never produces a record.

    Parameters
    ----------
    container : element exposing ``find_elements``; searched for ``.group.flex`` rows.
    body : element exposing ``send_keys``; receives PAGE_DOWN to trigger lazy loading.
    wait : accepted for call compatibility; not used by this function.
    season : value stored under the ``'season'`` key of every record.

    Returns
    -------
    None. Records are appended to the module-level ``lista_quote``. Any
    exception is caught and reported with ``print`` (best-effort scraping).
    """
    # Highest index read below is 8, so a usable row needs at least 9 lines.
    campi_minimi = 9

    try:
        # Scroll the page so that all matches get loaded.
        for _ in range(5):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(2)
        partite = container.find_elements(By.CSS_SELECTOR,".group.flex")

        for partita in partite:
            testo = partita.text
            print(testo)
            righe = testo.split('\n')

            if len(righe) < campi_minimi:
                # Incomplete row (e.g. a partially rendered match): skip it
                # instead of re-appending the values left over from the
                # previous iteration.
                continue

            lista_quote.append({
                'season' : season,
                'hours' : righe[0],
                'home_team': righe[1],
                'away_team': righe[5],
                'risultato': righe[2] + '-' + righe[4],
                'quota_1' : righe[6],
                'quota_x' : righe[7],
                'quota_2' : righe[8]
            })

    except Exception as e:
        print(f"errore durante l'estrazione dei dati: {e}")


### navigazione_e_estrazione

The `navigazione_e_estrazione` function uses Selenium to open oddsportal.com and walk through all the Serie A results pages for each season listed in the `diz_anni` dictionary. For each page it calls `estrazione_dati()` to extract the match data (e.g., teams, result, odds). Finally, it closes the browser.

In [5]:
def navigazione_e_estrazione(diz_anni):
    """Open oddsportal results for every season in ``diz_anni`` and extract each page.

    For each ``season -> slug`` pair the function loads page 1 of the results,
    tries to dismiss the cookie banner, then iterates over the pagination
    links: links that are not marked ``active`` are clicked first, and
    ``estrazione_dati`` is called for every page.

    Parameters
    ----------
    diz_anni : dict mapping a season key (passed through to ``estrazione_dati``)
        to the league slug interpolated into the results URL.

    Returns
    -------
    None. Extracted rows are stored by ``estrazione_dati``.

    Notes
    -----
    * The browser is always closed, even when a step outside the per-page
      ``try`` fails (for example the container wait timing out).
    * If no pagination links are found for a season, no page is extracted
      for it.
    """
    selettore_container = "div.min-h-\\[80vh\\]"
    selettore_pagine = ".pagination-link[data-number]"

    driver = webdriver.Firefox()
    try:
        for key,value in diz_anni.items():

            driver.get(f"https://www.oddsportal.com/it/football/italy/{value}/results/#/page/1/")

            # Fixed pause to let the page render before querying it.
            time.sleep(10)

            wait = WebDriverWait(driver,15)
            container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selettore_container)))

            # The body element receives the scrolling key presses.
            body = driver.find_element(By.TAG_NAME, "body")

            # Best-effort dismissal of the cookie banner; failure is only reported.
            try:
                wait = WebDriverWait(driver, 10)
                reject_button = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler")))
                reject_button.click()
                print("Bottone cliccato con successo!")
            except Exception as e:
                print("Errore nel cliccare il bottone:", e)

            pagination_links = driver.find_elements(By.CSS_SELECTOR, selettore_pagine)

            for i in range(len(pagination_links)):
                try:
                    if "active" not in pagination_links[i].get_attribute("class"):
                        driver.execute_script("arguments[0].scrollIntoView();", pagination_links[i])
                        time.sleep(1)

                        # Wait until the link is actually clickable.
                        WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable(pagination_links[i]))

                        # Move to the next page.
                        pagination_links[i].click()
                        time.sleep(5)

                        # The page content was replaced: look the container and
                        # body up again so we never read through stale references.
                        container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selettore_container)))
                        body = driver.find_element(By.TAG_NAME, "body")

                    # Refresh the pagination elements (they may change after the click).
                    pagination_links = driver.find_elements(By.CSS_SELECTOR, selettore_pagine)

                    print(f"🔄 Estrazione dati dalla pagina {i + 1}")
                    estrazione_dati(container,body,wait,key)
                except Exception as e:
                    print(f"Errore nel cambio pagina {i + 1}: {e}")
    finally:
        # Close the browser no matter how the scrape ended.
        driver.quit()

In [6]:
# Run the scrape for every configured season; rows accumulate in lista_quote.
navigazione_e_estrazione(diz_anni)

# Build the table from the accumulated rows and write it as a
# semicolon-separated CSV without the index column.
df_quote = pd.DataFrame(data=lista_quote)
df_quote.to_csv(path_or_buf="dataset/odds_per_match.csv", sep=';', index=False)

Bottone cliccato con successo!
🔄 Estrazione dati dalla pagina 1
20:45
Atalanta
2
–
3
Parma
1.84
3.67
4.07
12
20:45
Empoli
1
–
2
Verona
1.93
2.93
4.95
12
20:45
Lazio
0
–
1
Lecce
1.43
4.40
7.46
12
20:45
Torino
0
–
2
Roma
5.45
4.29
1.55
12
20:45
Udinese
2
–
3
Fiorentina
3.00
3.32
2.35
12
20:45
Venezia
2
–
3
Juventus
6.41
4.69
1.45
12
20:45
Milan
2
–
0
Monza
1.24
6.70
10.23
11
18:00
Bologna
1
–
3
Genoa
1.48
4.23
6.76
12
20:45
Como
0
–
2
Inter
4.32
4.04
1.73
12
20:45
Napoli
2
–
0
Cagliari
1.18
7.19
14.17
12
20:45
Roma
3
–
1
Milan
2.15
3.52
3.22
12
20:45
Cagliari
3
–
0
Venezia
2.72
2.89
2.88
12
20:45
Fiorentina
3
–
2
Bologna
2.97
3.02
2.54
12
20:45
Inter
2
–
2
Lazio
1.65
3.96
4.99
12
20:45
Juventus
2
–
0
Udinese
1.38
4.76
8.13
12
20:45
Lecce
1
–
0
Torino
2.04
3.09
4.00
12
20:45
Monza
1
–
3
Empoli
5.09
3.50
1.73
12
20:45
Parma
0
–
0
Napoli
7.02
4.30
1.46
12
20:45
Verona
1
–
1
Como
3.25
3.03
2.36
12
20:45
Genoa
2
–
3
Atalanta
3.24
3.30
2.23
12
20:45
Atalanta
2
–
1
Roma
2.01
3.43
3.68
12
18:30


In [7]:
df_quote

Unnamed: 0,season,hours,home_team,away_team,risultato,quota_1,quota_x,quota_2
0,2024,20:45,Atalanta,Parma,2-3,1.84,3.67,4.07
1,2024,20:45,Empoli,Verona,1-2,1.93,2.93,4.95
2,2024,20:45,Lazio,Lecce,0-1,1.43,4.40,7.46
3,2024,20:45,Torino,Roma,0-2,5.45,4.29,1.55
4,2024,20:45,Udinese,Fiorentina,2-3,3.00,3.32,2.35
...,...,...,...,...,...,...,...,...
195,2021,20:45,Lazio,Milan,1-2,3.04,3.38,2.33
196,2021,18:00,Genoa,Cagliari,1-0,2.05,3.24,3.87
197,2021,15:00,Bologna,Udinese,2-2,3.19,3.31,2.29
198,2021,15:00,Empoli,Napoli,3-2,4.84,4.17,1.64
