## Extracción de datos

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

In [None]:

# Working directory of the running process, and the directory that contains it
# (the head component returned by os.path.split).
current_directory = os.getcwd()
parent_directory = os.path.split(current_directory)[0]

In [None]:
# List of URLs to scrape: one sofifa player listing per `tm` value, all of them
# filtered by `lg`=13 (presumably league and team ids -- confirm).
# NOTE(review): only the first URL carries `r=220069&set=true`; presumably it
# selects a roster version that sticks for the browser session -- confirm.
first_team_url = "https://sofifa.com/players?type=all&lg%5B0%5D=13&tm%5B0%5D=1&r=220069&set=true"
team_url_template = "https://sofifa.com/players?type=all&lg%5B%5D=13&tm%5B0%5D={}"
remaining_team_ids = [
    2, 1795, 1925, 1808, 5, 1799, 7, 1792, 8, 95,
    9, 10, 11, 13, 1796, 17, 18, 19, 110,
]
urls = [first_team_url] + [
    team_url_template.format(team_id) for team_id in remaining_team_ids
]

In [None]:
# Selenium configuration: run Chrome headless, without GPU and without sandbox.
# The user-agent override is optional; it makes requests look like a regular
# desktop browser.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36"
)
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    f"user-agent={user_agent}",
):
    chrome_options.add_argument(chrome_flag)

In [None]:
# Start the Chrome WebDriver with the options configured above (chromedriver
# must be installed and reachable through PATH), then fix the window size.
window_width, window_height = 1920, 1080
driver = webdriver.Chrome(options=chrome_options)
driver.set_window_size(window_width, window_height)


In [None]:
all_data = []  # Extracted rows; each row is a dict mapping header text -> cell text

# Visit every URL and extract the rows of the first <table> found on the page.
# The outer try/finally guarantees the browser is closed even when the loop is
# interrupted (e.g. KeyboardInterrupt, which `except Exception` does not catch);
# otherwise the headless Chrome process would be left running.
try:
    for url in urls:
        print(f"Procesando {url}")
        try:
            driver.get(url)
            # Wait until a <table> element is present (at most 10 seconds)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            # Parse the rendered page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table")

            if table:
                # Column names come from every <th> inside the table
                headers = [th.text.strip() for th in table.find_all("th")]
                skipped_rows = 0
                for row in table.find_all("tr"):
                    cells = row.find_all("td")
                    if not cells:
                        # Rows without <td> cells (e.g. header rows) carry no data
                        continue
                    cell_values = [cell.text.strip() for cell in cells]
                    # Only rows whose cell count matches the header count can be
                    # mapped to columns; the rest are counted and reported below
                    # instead of being dropped silently.
                    if len(headers) == len(cell_values):
                        row_dict = dict(zip(headers, cell_values))
                        all_data.append(row_dict)
                    else:
                        skipped_rows += 1
                if skipped_rows:
                    print(
                        f"Se omitieron {skipped_rows} filas con un número de celdas "
                        f"distinto al de cabeceras en {url}"
                    )
            else:
                print("No se encontró tabla en", url)
        except Exception as e:
            # Best effort: report the failure and continue with the next URL
            print(f"Error al procesar {url}: {e}")
        # Pause between requests to avoid overloading the server
        time.sleep(2)
finally:
    driver.quit()

In [None]:
# Build a DataFrame from the scraped rows (a list of dicts, one per table row).
df = pd.DataFrame(data=all_data)
# Display the result as the cell output.
df

In [None]:
# Remove the first column of the DataFrame in place.
# `columns=` already selects the column axis, so no `axis` argument is needed
# (pandas ignores `axis` when `columns` is given).
# NOTE: this cell is not idempotent -- every re-run drops one more column.
# An empty DataFrame (nothing scraped) raises IndexError here, which stops the
# notebook before an empty CSV is written.
df.drop(columns=df.columns[0], inplace=True)
df

In [None]:
# Output directory: <parent_directory>/data/raw
raw_subdirs = ("data", "raw")
dir_path = os.path.join(parent_directory, *raw_subdirs)

In [None]:
# Make sure the output directory exists first: DataFrame.to_csv does not create
# missing parent directories and fails if the folder is absent.
os.makedirs(dir_path, exist_ok=True)
# Write the scraped table as CSV, leaving out the DataFrame index.
df.to_csv(os.path.join(dir_path, 'premier_league_2022_wages_sofifa.csv'), index=False)