In [25]:
import os
import time
from tqdm import tqdm
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

def get_players_links(n_pages=12):
    """Collect the player links from the paginated result listing.

    Opens a Firefox instance configured by the module-level ``service`` and
    ``options`` objects, loads result pages 1..``n_pages`` and gathers the
    ``href`` of the first anchor found in every ``td`` cell whose class is
    ``rechts hauptlink``.

    Parameters
    ----------
    n_pages : int, optional
        Number of result pages to visit, starting at page 1 (default 12).

    Returns
    -------
    list of str
        The collected ``href`` values, in page order. They are used as
        site-relative paths by ``get_player_info``.
    """
    # Base URL of the result listing; the page number is appended to it.
    url = "https://www.transfermarkt.fr/spieler-statistik/wertvollstespieler/marktwertetop?ajax=yw1&page="

    # Create the Firefox browser instance.
    driver = webdriver.Firefox(service=service, options=options)
    links = []
    print('Téléchargement des liens des joueurs...')
    try:
        for page in tqdm(range(1, n_pages + 1)):
            driver.get(url + str(page))
            # Fixed pause before reading the DOM, so that the HTML is read
            # after the page's JavaScript has had time to run.
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for cell in soup.find_all('td', class_='rechts hauptlink'):
                anchor = cell.find('a')
                # Skip cells without a usable link instead of aborting the
                # whole crawl on a TypeError/KeyError.
                if anchor is not None and anchor.has_attr('href'):
                    links.append(anchor['href'])
    finally:
        # Always release the browser, even when a page load or parse fails.
        driver.quit()

    return links


def get_player_info(driver, player_link):
    """Scrape the header of one player page into a single-row DataFrame.

    Parameters
    ----------
    driver : selenium WebDriver
        An already started browser; it is navigated to the player page and
        left open for the caller to reuse or close.
    player_link : str
        Site-relative path of the player page (appended to the
        ``https://www.transfermarkt.fr`` domain).

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``name``, ``date_of_birth``, ``age``,
        ``nationality``, ``club``, ``league``, ``market_value`` and
        ``position``. ``age`` is a placeholder ``0``. ``position`` is an
        empty string when the page has no "Position:" entry.

    Raises
    ------
    AttributeError
        If one of the expected header elements is missing from the page.
    """
    url = "https://www.transfermarkt.fr" + player_link

    # Load the page and parse the HTML as rendered by the browser.
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # NOTE(review): the hard-coded slice offsets below presumably skip a
    # fixed run of leading whitespace in the site's markup -- confirm against
    # a live page before changing them; later cleaning relies on this format.
    name = soup.find('h1', class_='data-header__headline-wrapper').find('strong').text
    date_of_birth = soup.find('span', class_='data-header__content', itemprop="birthDate").text[33:]
    nationality = soup.find('span', class_='data-header__content', itemprop="nationality").text[33:]
    club = soup.find('span', class_='data-header__club', itemprop="affiliation").text[1:]
    if club == 'Sans club ':
        # Players without a club have no league link on the page.
        league = 'Pas de League'
    else:
        league = soup.find('a', class_='data-header__league-link').text[1:]
    market_value = soup.find('a', class_='data-header__market-value-wrapper').text[:13]

    # Position: take the first header item whose text contains "Position:".
    # Only a matching item is used; previously the last <li> was read when
    # nothing matched (or a NameError was raised when there were no items).
    position = ''
    for item in soup.find_all('li', class_='data-header__label'):
        if 'Position:' in item.text:
            position = item.find('span', class_='data-header__content').text[29:]
            break

    return pd.DataFrame({
        'name': [name],
        'date_of_birth': [date_of_birth],
        'age': [0],  # placeholder
        'nationality': [nationality],
        'club': [club],
        'league': [league],
        'market_value': [market_value],
        'position': [position],
    })


def get_all_players_info(player_links):
    """Scrape every player page and stack the results in one DataFrame.

    Opens a single Firefox instance (configured by the module-level
    ``service`` and ``options``), calls ``get_player_info`` for each link and
    concatenates the one-row frames.

    Parameters
    ----------
    player_links : iterable of str
        Site-relative player page paths.

    Returns
    -------
    pandas.DataFrame
        One row per player, every column cast to ``str``, with a fresh
        0..n-1 index. Empty when ``player_links`` is empty.
    """
    # Create the Firefox browser instance.
    driver = webdriver.Firefox(service=service, options=options)

    print('Téléchargement des données des joueurs...')
    try:
        # Collect the per-player frames and concatenate once at the end:
        # concatenating inside the loop copies the accumulated frame on
        # every iteration.
        frames = [get_player_info(driver, player) for player in tqdm(player_links)]
    finally:
        # Always close the browser, even when a page fails to scrape.
        driver.quit()

    if not frames:
        # pd.concat refuses an empty list; keep the empty-frame result.
        return pd.DataFrame().astype(str)

    # ignore_index gives each row its own label instead of every row
    # carrying the label 0 of its one-row source frame.
    return pd.concat(frames, axis=0, ignore_index=True).astype(str)


def clean_spaces(string):
    """Return the part of ``string`` that precedes the first double space.

    When ``string`` contains no double space it is returned unchanged
    (slicing with the -1 returned by ``str.find`` used to drop its last
    character).
    """
    # partition() yields the whole string as the head when the separator
    # is absent, which is exactly the fallback wanted here.
    return string.partition('  ')[0]

def seperate_birth_and_age(string):
    """Split a ``"<birth date> (<age>)"`` string into its two parts.

    Parameters
    ----------
    string : str
        Text containing a date followed by `` (`` + age + ``)``; anything
        after the closing parenthesis is ignored.

    Returns
    -------
    tuple of (str, int)
        The text before `` (`` and the age as an integer (any number of
        digits).

    Raises
    ------
    ValueError
        If the `` (<age>)`` suffix is missing or the age is not an integer.
    """
    start = string.find(' (')
    end = string.find(')', start + 2) if start != -1 else -1
    if start == -1 or end == -1:
        # Fail loudly: slicing with -1 would silently return a truncated
        # date and an unrelated number.
        raise ValueError("no ' (age)' suffix found in %r" % (string,))
    return string[:start], int(string[start + 2:end])

def get_market_value(string):
    """Convert a scraped market-value label into a number.

    The first number in ``string`` is read with a comma as decimal
    separator and scaled by the unit word that follows it.

    Parameters
    ----------
    string : str
        Label such as ``"12,50 mio. €"``.

    Returns
    -------
    float
        The value rounded to a whole number of currency units.

    Raises
    ------
    ValueError
        If ``string`` contains no number.
    """
    match = re.search(r'(\d+(?:,\d+)?)\s*([A-Za-z]*)', string)
    if match is None:
        raise ValueError("no numeric market value found in %r" % (string,))

    # Keep the decimals: "12,50" is 12.5, not 12.
    amount = float(match.group(1).replace(',', '.'))

    # NOTE(review): assumes the site abbreviates thousands as "K" and
    # billions as "mrd." -- confirm against live pages. Any other or missing
    # unit is treated as millions, which is what this function always did.
    unit = match.group(2).lower()
    if unit.startswith('mrd'):
        multiplier = 1000000000
    elif unit.startswith('k'):
        multiplier = 1000
    else:
        multiplier = 1000000

    # round() removes binary floating-point noise such as 1100000.0000000002.
    return float(round(amount * multiplier))

def cleaning_dataframe(df):
    """Normalise the scraped columns of ``df`` row by row.

    For every row: ``league`` is passed through ``clean_spaces``,
    ``date_of_birth`` is split into ``date_of_birth`` and ``age`` by
    ``seperate_birth_and_age``, and ``market_value`` is converted by
    ``get_market_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``league``, ``date_of_birth``, ``age`` and
        ``market_value`` columns and unique index labels. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.

    Notes
    -----
    Also sets the global pandas float display format (no decimals, thousands
    separators).
    """
    print('Nettoyage du dataframe...')

    # Display-only pandas option; set once instead of on every iteration.
    pd.options.display.float_format = '{:,.0f}'.format

    # Iterate over the frame's own labels so that .loc addresses existing
    # rows whatever the index is (not only a default 0..n-1 index).
    for i in tqdm(df.index):
        df.loc[i, 'league'] = clean_spaces(df.loc[i, 'league'])
        df.loc[i, 'date_of_birth'], df.loc[i, 'age'] = seperate_birth_and_age(df.loc[i, 'date_of_birth'])
        df.loc[i, 'market_value'] = get_market_value(df.loc[i, 'market_value'])

    return df

In [None]:
#-------MAIN

# Configure Firefox to run headless (no visible window).
options = Options()
options.add_argument("--headless")
# NOTE(review): the two flags below look like Chromium options -- confirm
# that Firefox needs them.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Path to geckodriver. Set the GECKODRIVER_PATH environment variable to
# override the machine-specific default below.
service = Service(executable_path=os.environ.get(
    "GECKODRIVER_PATH",
    r"C:\Users\poljo\OneDrive\Documents\geckodriver.exe",
))


player_links = get_players_links()

# Scrape every player page and save the raw result.
df_players_info = get_all_players_info(player_links)
df_players_info.to_csv("df_players_info.csv", index=False)


# Reload the raw file from disk (fresh default index), clean it and
# overwrite the file with the cleaned data.
df_players_info = cleaning_dataframe(pd.read_csv('df_players_info.csv'))
df_players_info.to_csv("df_players_info.csv", index=False)
df_players_info

Téléchargement des liens des joueurs...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:45<00:00,  3.77s/it]


Téléchargement des données des joueurs...


 91%|████████████████████████████████████████████████████████████████████████▌       | 272/300 [04:03<00:48,  1.74s/it]