# Notebook to scrape Transfermarkt

## You have to manually input the data for the league you want to scrape. Try mixing it up and scraping some lower leagues as well.

## When saving to a CSV, indicate which league you scraped and what tier it is in, so we can add that information to the data.

# Remember to change the CSV file name when scraping a new league so you do not overwrite the data.

In [11]:
import requests
import csv
from bs4 import BeautifulSoup
import time
import random
import os

# Root address of the site being scraped.
base_url = 'https://www.transfermarkt.com'

# Browser-like User-Agent sent with every request made by this notebook.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/134.0.0.0 Safari/537.36"
    ),
}

def extract_and_strip_text(element):
    """Return the whitespace-stripped text of ``element``.

    A plain string is stripped directly. Any other truthy object is expected
    to expose a ``.text`` attribute, which is stripped and returned. Falsy
    non-string values (``None``, ``0``, empty containers) yield ``"N/A"``.
    """
    # Strings are handled first so that an empty string comes back as ""
    # rather than falling through to the "N/A" placeholder.
    if isinstance(element, str):
        return element.strip()
    if not element:
        return "N/A"
    return element.text.strip()


def get_info(labels, results):
    """Look up a value in a player's info table by its label text.

    ``labels`` is a single label string or an iterable of alternative labels,
    tried in order. For each label, the ``span.info-table__content`` whose
    string equals the label is searched for inside ``results``; the text of
    its next ``span.info-table__content--bold`` sibling is returned.

    Returns ``None`` when ``results`` is falsy or no label yields a value.
    """
    if not results:
        return None

    candidates = [labels] if isinstance(labels, str) else labels
    for wanted in candidates:
        label_span = results.find('span', class_='info-table__content', string=wanted)
        if not label_span:
            continue
        value_span = label_span.find_next_sibling('span', class_='info-table__content--bold')
        if value_span:
            return value_span.get_text(strip=True)
    return None

def get_info_youth(labels, results):
    """Look up the content box that follows a given headline.

    ``labels`` is a single headline string or an iterable of alternatives,
    tried in order. For each one, the ``h2.content-box-headline`` whose string
    equals the label is searched for inside ``results``; the text of its next
    ``div.content`` sibling is returned.

    Returns ``None`` when ``results`` is falsy or no headline yields a value.
    """
    if not results:
        return None

    candidates = [labels] if isinstance(labels, str) else labels
    for wanted in candidates:
        headline = results.find('h2', class_='content-box-headline', string=wanted)
        if not headline:
            continue
        content_box = headline.find_next_sibling('div', class_='content')
        if content_box:
            return content_box.get_text(strip=True)
    return None


def extract_transfer_details(data):
    """Flatten raw transfer records into one flat dict per transfer.

    ``data`` is an iterable of mappings, each providing ``from`` and ``to``
    (mappings with a ``clubName`` entry) plus ``date``, ``dateUnformatted``,
    ``fee``, ``season`` and ``marketValue``. Any other keys are ignored.

    Returns a list of dicts whose keys are the output column names, in a
    fixed order. A missing key raises ``KeyError``.
    """
    return [
        {
            'From Club': record['from']['clubName'],
            'To Club': record['to']['clubName'],
            'Date of Transfer': record['date'],
            'Unformatted Date': record['dateUnformatted'],
            'Fee': record['fee'],
            'Season': record['season'],
            'Market Value': record['marketValue'],
        }
        for record in data
    ]


# Club URLs already visited, so a club is only scraped once even when
# several seasons are processed in the same run.
seen_urls = set()
# One row per (player, transfer) pair, accumulated across the whole crawl.
all_transfer_data = []

# Seconds to wait for a single HTTP response before giving up on that page.
REQUEST_TIMEOUT = 30


def _polite_get(url):
    """Pause 10-15 seconds, then GET ``url`` with the shared ``headers``.

    The random pause ensures the site is not flooded with requests.
    Returns the response, or ``None`` (after printing the reason) when the
    request times out, fails, or returns an HTTP error status, so that one
    bad page is skipped instead of aborting the whole crawl.
    """
    time.sleep(random.randint(10, 15))
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None
    return response


def _items_table_body(soup):
    """Return the ``<tbody>`` of the first ``table.items`` in ``soup``, or None."""
    table = soup.find("table", class_="items")
    return table.find("tbody") if table is not None else None


for num in range(23, 22, -1):  # Adjust to your actual range; right now it covers a single season (2023).
    # Zero-pad so that a single-digit year (e.g. 9) still gives a four-digit saison_id.
    URL = f"https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=20{num:02d}"

    season_page = _polite_get(URL)
    if season_page is None:
        continue
    season_body = _items_table_body(BeautifulSoup(season_page.content, "html.parser"))
    if season_body is None:
        print(f"No club table found at {URL}")
        continue

    for season_element in season_body.find_all("tr"):
        if season_element.find("td", class_="extrarow"):
            continue

        # The club link is the first anchor in the row's first cell; rows
        # without one are skipped.
        first_cell = season_element.find("td")
        club_anchor = first_cell.find("a") if first_cell is not None else None
        season_name_element = club_anchor.get("href") if club_anchor is not None else None
        if not season_name_element:
            continue

        if season_name_element in seen_urls:
            continue
        seen_urls.add(season_name_element)

        team_url = f'https://www.transfermarkt.com{season_name_element}'

        team_page = _polite_get(team_url)
        if team_page is None:
            continue
        team_body = _items_table_body(BeautifulSoup(team_page.content, "html.parser"))
        if team_body is None:
            print(f"No squad table found at {team_url}")
            continue

        for squad_cell in team_body.find_all("td", class_="posrela"):
            for team_cell in squad_cell.find_all("td", class_="hauptlink"):
                # Check for the anchor BEFORE touching its attributes, and skip
                # cells without a link instead of requesting ".../None".
                player_anchor = team_cell.a
                team_link = player_anchor.get('href') if player_anchor is not None else None
                if not team_link:
                    continue

                player_link = f'https://www.transfermarkt.com{team_link}'

                # URL slug between the domain and "/profil", used as a fallback name.
                name_in_link = player_link.split("transfermarkt.com/")[1].split("/profil")[0]

                player_page = _polite_get(player_link)
                if player_page is None:
                    continue
                soup = BeautifulSoup(player_page.content, "html.parser")

                results1 = soup.find("div", class_="box tm-player-additional-data")
                results = soup.find("div", class_="info-table")

                # Extract key data
                player_data = {
                    'Full name': get_info(['Full name:', 'Name in home country:'], results),
                    'Name Alternative' : name_in_link,
                    'Date of birth/Age': get_info('Date of birth/Age:', results),
                    'Place of birth': get_info('Place of birth:', results),
                    'Height': get_info('Height:', results),
                    'Citizenship': get_info('Citizenship:', results),
                    'Position': get_info('Position:', results),
                    'Foot': get_info('Foot:', results),
                    'Player agent': get_info('Player agent:', results),
                    'Current club': get_info('Current club:', results),
                    'Joined': get_info('Joined:', results),
                    'Contract expires': get_info('Contract expires:', results),
                    'Last contract extension': get_info('Last contract extension:', results),
                    'Outfitter': get_info('Outfitter:', results),
                    'Youth Club': get_info_youth('Youth clubs', results1)
                }

                # The last path segment of the profile URL is the player id.
                player_id = player_link.split("/")[-1]

                transfer_url = f'https://www.transfermarkt.com/ceapi/transferHistory/list/{player_id}'

                response = _polite_get(transfer_url)
                if response is None:
                    continue
                try:
                    payload = response.json()
                except ValueError:
                    # The body was not valid JSON; skip this player.
                    print(f"Skipping {transfer_url}: response is not valid JSON")
                    continue
                transfers = extract_transfer_details(payload.get('transfers') or [])

                # Build a fresh dict per transfer: updating player_data in place
                # would make every row share (and overwrite) the same object.
                for transfer in transfers:
                    combined_data = player_data.copy()
                    combined_data.update(transfer)
                    all_transfer_data.append(combined_data)






# Output file; rows are appended so repeated runs accumulate in one file.
file_name = 'Premier_League_Transfermarkt_scrape.csv'
# A header row is needed for a brand-new file and also for an existing but
# empty one (e.g. left behind by an earlier run that failed before writing).
write_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

if all_transfer_data:
    with open(file_name, mode='a', newline='', encoding='utf-8-sig') as file:
        # Column order is taken from the first scraped row.
        writer = csv.DictWriter(file, fieldnames=list(all_transfer_data[0].keys()))
        if write_header:
            writer.writeheader()
        writer.writerows(all_transfer_data)
else:
    # Nothing was scraped: do not index into the empty list or create an
    # empty file.
    print(f"No transfer data collected; {file_name} was not written.")