In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Wikipedia page that lists association football stadiums by capacity
url = "https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_capacity"

# Seconds to wait for the server before giving up. requests applies no timeout
# unless one is passed, so without it a stalled connection would block the
# kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Issue the GET request for the page
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Only parse the body when the server answered 200 OK
if response.status_code == 200:
    # Parse the HTML with BeautifulSoup's built-in parser
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect every table that carries the 'wikitable' class
    tables = soup.find_all('table', {'class': 'wikitable'})

    # Accumulates the text of each row that has exactly 7 <td> cells
    valid_rows = []

    # Walk every table
    for table in tables:
        # All rows (<tr>) of the current table
        rows = table.find_all('tr')

        # Walk every row
        for row in rows:
            # Data cells (<td>) of the current row
            cells = row.find_all('td')

            # Rows with any other number of <td> cells are skipped, because
            # the DataFrame below expects exactly 7 columns.
            if len(cells) == 7:
                # Cell text with surrounding whitespace/newlines stripped
                row_data = [cell.get_text(strip=True) for cell in cells]
                valid_rows.append(row_data)

    # Build the DataFrame from the collected rows
    df_valid_rows = pd.DataFrame(valid_rows, columns=['Stadium', 'Capacity', 'Region', 'Country', 'City', 'Image','Clubs'])
    # NOTE(review): .display() is not part of stock pandas; presumably it is
    # provided by the notebook runtime — confirm before running elsewhere.
    df_valid_rows.display()
    df_valid_rows.info()
else:
    # On a non-200 answer df_valid_rows is NOT defined; later cells that use
    # it will fail until this cell succeeds.
    print("Error al obtener la página:", response.status_code)


Stadium,Capacity,Region,Country,City,Image,Clubs
Rungrado 1st of May Stadium♦,"114,000[1]",East Asia,North Korea,Pyongyang,,"Korea DPR national football team,Korea DPR women's national football team,April 25"
Michigan Stadium,"107,601[2]",North America,United States,"Ann Arbor, Michigan",,Michigan Wolverines football
Ohio Stadium,"102,780[3]",North America,United States,"Columbus, Ohio",,Ohio State Buckeyes football
Melbourne Cricket Ground♦,"100,024[4]",Oceania,Australia,"Melbourne,Victoria",,"Australia national cricket team,Victoria cricket team,Melbourne Cricket Club,Melbourne Stars,Melbourne FC,Richmond FC,Collingwood FC,Hawthorn FC"
Camp Nou♦,"99,354[5]",Europe,Spain,"Barcelona, Catalonia",,FC Barcelona
Estadio Azteca♦,"95,500[6]",North America,Mexico,Mexico City,,"Club América,Cruz Azul,Mexico national football team"
FNB Stadium♦,"94,736[7]",Africa,South Africa,"Johannesburg,Gauteng",,"South Africa national football team,Kaizer Chiefs"
New Administrative Capital Stadium♦,"93,940[8]",Africa,Egypt,New Administrative Capital,,Egypt national football team
Rose Bowl Stadium,"92,800[9]",North America,United States,"Pasadena, California",,UCLA Bruins
Cotton Bowl Stadium,"92,100[10]",North America,United States,"Dallas,Texas",,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Stadium   379 non-null    object
 1   Capacity  379 non-null    object
 2   Region    379 non-null    object
 3   Country   379 non-null    object
 4   City      379 non-null    object
 5   Image     379 non-null    object
 6   Clubs     379 non-null    object
dtypes: object(7)
memory usage: 20.9+ KB


In [0]:
import re

# Clean the scraped table:
#   * remove the '♦' marker from the 'Stadium' column,
#   * remove bracketed numeric references such as "[12]" from 'Capacity',
#   * drop the 'Image' column.
# assign() returns a new frame and errors='ignore' lets the drop succeed when
# 'Image' has already been removed, so re-running this cell no longer raises
# KeyError.
df_valid_rows = (
    df_valid_rows
    .assign(
        Stadium=lambda frame: frame['Stadium'].str.replace('♦', '', regex=False),
        Capacity=lambda frame: frame['Capacity'].str.replace(r'\[\d+\]', '', regex=True),
    )
    .drop(columns=['Image'], errors='ignore')
)

# Show the cleaned frame
# NOTE(review): .display() is not part of stock pandas; presumably it is
# provided by the notebook runtime — confirm before running elsewhere.
df_valid_rows.display()


Stadium,Capacity,Region,Country,City,Clubs
Rungrado 1st of May Stadium,114000,East Asia,North Korea,Pyongyang,"Korea DPR national football team,Korea DPR women's national football team,April 25"
Michigan Stadium,107601,North America,United States,"Ann Arbor, Michigan",Michigan Wolverines football
Ohio Stadium,102780,North America,United States,"Columbus, Ohio",Ohio State Buckeyes football
Melbourne Cricket Ground,100024,Oceania,Australia,"Melbourne,Victoria","Australia national cricket team,Victoria cricket team,Melbourne Cricket Club,Melbourne Stars,Melbourne FC,Richmond FC,Collingwood FC,Hawthorn FC"
Camp Nou,99354,Europe,Spain,"Barcelona, Catalonia",FC Barcelona
Estadio Azteca,95500,North America,Mexico,Mexico City,"Club América,Cruz Azul,Mexico national football team"
FNB Stadium,94736,Africa,South Africa,"Johannesburg,Gauteng","South Africa national football team,Kaizer Chiefs"
New Administrative Capital Stadium,93940,Africa,Egypt,New Administrative Capital,Egypt national football team
Rose Bowl Stadium,92800,North America,United States,"Pasadena, California",UCLA Bruins
Cotton Bowl Stadium,92100,North America,United States,"Dallas,Texas",


In [0]:
import datetime
import io
import os

import pandas as pd
from azure.storage.filedatalake import DataLakeServiceClient

def connect_to_datalake(connection_string):
    """Create a Data Lake service client from a storage connection string.

    Args:
        connection_string: Connection string passed unchanged to
            ``DataLakeServiceClient.from_connection_string``.

    Returns:
        The client built by ``DataLakeServiceClient.from_connection_string``,
        or ``None`` when building it raised any exception (the error is
        printed, not re-raised).
    """
    try:
        client = DataLakeServiceClient.from_connection_string(connection_string)
    except Exception as exc:
        # Best-effort: report the failure and let the caller test for None.
        print(f"Error al conectarse a Azure Data Lake: {exc}")
        return None
    return client

def upload_dataframe_to_datalake(df, datalake_service_client, filesystem_name, file_path):
    """Serialize a DataFrame to CSV and upload it to a Data Lake file.

    Args:
        df: DataFrame to upload; written with ``to_csv(index=False)``.
        datalake_service_client: Object exposing ``get_file_system_client``.
        filesystem_name: Name handed to ``get_file_system_client``.
        file_path: Path handed to ``get_file_client``; also echoed in the
            success message.

    Returns:
        None. Any exception raised along the way is caught and printed
        instead of propagating.
    """
    try:
        # Resolve filesystem -> file client in one chain
        target_file = (
            datalake_service_client
            .get_file_system_client(filesystem_name)
            .get_file_client(file_path)
        )

        # CSV text without the index column
        payload = df.to_csv(index=False)

        # overwrite=True is passed so an existing file at file_path is replaced
        target_file.upload_data(payload, overwrite=True)
        print(f"El archivo ha sido cargado correctamente a {file_path}.")
    except Exception as exc:
        print(f"Error al cargar el DataFrame a Data Lake: {exc}")

# Connection parameters
# Today's date (local time) is used as a folder name in the target path.
current_date = datetime.datetime.now().strftime('%Y-%m-%d')

# The connection string embeds the storage account key, so it is read from the
# environment instead of being written into the notebook. Any key that was
# ever saved in a notebook should be considered compromised and rotated.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Falta la variable de entorno AZURE_STORAGE_CONNECTION_STRING "
        "con la connection string de Azure Storage."
    )
filesystem_name = "container"  # Data Lake container (filesystem) name
file_path = f"data/{current_date}/stadiums.csv"  # Destination path inside the container

# Connect to the Data Lake; connect_to_datalake returns None on failure
datalake_service_client = connect_to_datalake(connection_string)

if datalake_service_client:
    # Upload the DataFrame as CSV
    upload_dataframe_to_datalake(df_valid_rows, datalake_service_client, filesystem_name, file_path)


El archivo ha sido cargado correctamente a data/2024-12-04/stadiums.csv.
