In [6]:
from seleniumbase import Driver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd

# Function to collect match data for a given league URL
def collect_league_data(base_url, league_code, valid_seasons, output_file):
    all_dates_list = []
    all_homeTeams_list = []
    all_awayTeams_list = []
    all_scores_list = []
    league_played_list = []

    for season in valid_seasons:
        full_url = f"{base_url}/{league_code}-{season}"

        # Navigate to the URL
        driver.get(full_url)
        # Wait to ensure the page loads completely
        time.sleep(3)

        # Extract match data
        match_dates = driver.find_elements(By.XPATH, '//*[@class="standard_tabelle"]/tbody/tr[*]/td[1]')
        match_homeTeams = driver.find_elements(By.XPATH, '//*[@class="standard_tabelle"]/tbody/tr[*]/td[3]')
        match_awayTeams = driver.find_elements(By.XPATH, '//*[@class="standard_tabelle"]/tbody/tr[*]/td[5]')
        match_scores = driver.find_elements(By.XPATH, '//*[@class="standard_tabelle"]/tbody/tr[*]/td[6]')

        # Append data to the lists
        all_dates_list.extend([date.text for date in match_dates])
        all_homeTeams_list.extend([team.text for team in match_homeTeams])
        all_awayTeams_list.extend([team.text for team in match_awayTeams])
        all_scores_list.extend([score.text for score in match_scores])
        league_played_list.extend([season] * len(match_dates))

    # Ensure all lists have the same length
    min_length = min(len(all_dates_list), len(all_homeTeams_list), len(all_awayTeams_list), len(all_scores_list), len(league_played_list))

    # Save the data to a CSV file
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "Home Team", "Away Team", "Score", "Season"])
        for i in range(min_length):
            writer.writerow([all_dates_list[i], all_homeTeams_list[i], all_awayTeams_list[i], all_scores_list[i], league_played_list[i]])

    # Create the data dictionary with the first 20 elements of each list
    data = {
        "Dates": all_dates_list[:20],
        "Home Team": all_homeTeams_list[:20],
        "Away Team": all_awayTeams_list[:20],
        "Score": all_scores_list[:20],
        "League Played": league_played_list[:20]
    }

    df = pd.DataFrame(data)

    print(df)

# Initialize the driver
driver = Driver(uc=True)

try:
    # Define base URLs and league codes
    leagues = {
        "Turkey": "https://www.worldfootball.net/all_matches/tur-sueperlig",
        "France": "https://www.worldfootball.net/all_matches/fra-ligue-1",
        "England": "https://www.worldfootball.net/all_matches/eng-premier-league",
        "Spain": "https://www.worldfootball.net/all_matches/esp-primera-division",
        "Germany": "https://www.worldfootball.net/all_matches/bundesliga",
    }

    # Define valid seasons
    valid_seasons = [f"{year}-{year + 1}" for year in range(2000, 2025)]

    # Collect data for each league and save to separate CSV files
    for league_name, base_url in leagues.items():
        print(f"Processing league: {league_name}")
        output_file = f"{league_name}_matches.csv"
        collect_league_data(base_url, base_url.split('/')[-1], valid_seasons, output_file)

finally:
    driver.quit()

print("Data collection completed. CSV files saved.")


Processing league: Turkey
         Dates            Home Team            Away Team      Score  \
0   05/08/2022      İstanbulspor AŞ          Trabzonspor  0:2 (0:1)   
1   06/08/2022            Sivasspor         Gaziantep FK  1:1 (0:1)   
2                          Beşiktaş          Kayserispor  1:0 (0:0)   
3   07/08/2022          Giresunspor      Adana Demirspor  2:3 (0:2)   
4                  Fatih Karagümrük           Alanyaspor  2:4 (0:2)   
5                       Antalyaspor          Galatasaray  0:1 (0:0)   
6   08/08/2022  İstanbul Başakşehir         Kasımpaşa SK  4:0 (2:0)   
7                    MKE Ankaragücü            Konyaspor  0:0 (0:0)   
8                        Fenerbahçe         Ümraniyespor  3:3 (2:1)   
9   12/08/2022          Trabzonspor            Hatayspor  1:0 (0:0)   
10  13/08/2022          Kayserispor      İstanbulspor AŞ  1:0 (1:0)   
11                  Adana Demirspor            Sivasspor  3:0 (1:0)   
12                      Galatasaray          Giresu