### Setup

In [21]:
from typing import Tuple, Optional
import logging
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pyspark.pandas as ps
import concurrent.futures
from datetime import datetime

from delta.tables import DeltaTable

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 26, Finished, Available, Finished)

In [22]:
%run Helpers

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 28, Finished, Available, Finished)

In [23]:
# Notebook-wide logger used by the functions and cells below.
# NOTE(review): setup_logger is not defined in this notebook — presumably it is
# provided by the `%run Helpers` cell above; confirm.
logger = setup_logger()

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 29, Finished, Available, Finished)

In [24]:
# abfss:// locations of the Delta tables used by this notebook:
#   PLAYLISTS_TABLE - read to discover which contest years to process.
#   RESULTS_TABLE   - target of the final merge (upsert) of scraped results.
PLAYLISTS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/playlists"
RESULTS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/results"

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 30, Finished, Available, Finished)

### Load data

In [25]:
def has_expected_columns(table) -> bool:
    """
    Tell whether a results table carries every column the pipeline needs.

    Only the first <tr> of the given BeautifulSoup Tag is inspected: the
    stripped text of its <th> cells must include all of
    "R/O", "Country", "Artist", "Song", "Points" and "Place".
    Extra header cells are allowed; a table without any <tr> is rejected.
    """
    first_row = table.find("tr")
    if not first_row:
        return False
    header_texts = {cell.get_text(strip=True) for cell in first_row.find_all("th")}
    return {"R/O", "Country", "Artist", "Song", "Points", "Place"} <= header_texts

def fetch_year_page(year: int, timeout: float = 30.0) -> BeautifulSoup:
    """
    Fetch the Wikipedia page for the given Eurovision year
    and return it parsed as a BeautifulSoup object.

    Args:
        year: Contest year used to build the article URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The article HTML parsed with the "html.parser" backend.

    Raises:
        Exception: Any error raised while requesting the page (including a
            non-2xx status or a timeout) or parsing it is logged and re-raised.
    """
    url = f"https://en.wikipedia.org/wiki/Eurovision_Song_Contest_{year}#Contest_overview"
    headers_req = {"User-Agent": "Mozilla/5.0"}
    logger.info(f"Year {year}: Fetching URL {url}")
    try:
        # requests waits indefinitely when no timeout is given; this function is
        # run from worker threads, so a stalled connection would otherwise hang
        # the whole run instead of surfacing as an error for that year.
        response = requests.get(url, headers=headers_req, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        logger.error(f"Year {year}: Error fetching URL: {e}")
        raise

def extract_table_htmls(soup: BeautifulSoup, year: int) -> Tuple[str, str, str]:
    """
    Locate the three result tables of a contest year inside a parsed page.

    Every ``table.wikitable`` is examined. Tables without a caption or without
    the required header columns are skipped. The remaining ones are classified
    by their lower-cased caption as the final, the first semi-final or the
    second semi-final of ``year``. If several tables match the same stage,
    the last one wins.

    Returns:
        A tuple (final_table_html, semifinal1_table_html, semifinal2_table_html)
        holding the HTML source of each table.

    Raises:
        ValueError: If any of the three tables could not be found.
    """
    html_by_stage = {"final": None, "semi1": None, "semi2": None}

    for wikitable in soup.find_all("table", class_="wikitable"):
        caption_node = wikitable.find("caption")
        if not caption_node:
            continue

        caption_text = caption_node.get_text(strip=True).lower()
        if not has_expected_columns(wikitable):
            logger.debug(f"Year {year}: Skipping table with caption '{caption_text}' due to missing expected columns.")
            continue

        # Semi-final captions also contain "final of the ...", hence the extra "semi" guard.
        looks_like_final = (
            f"final of the eurovision song contest {year}" in caption_text
            and "semi" not in caption_text
        )
        if looks_like_final:
            html_by_stage["final"] = str(wikitable)
            logger.info(f"Year {year}: Final table found with caption: {caption_text}")
        elif f"first semi-final of the eurovision song contest {year}" in caption_text:
            html_by_stage["semi1"] = str(wikitable)
            logger.info(f"Year {year}: Semi-final 1 table found with caption: {caption_text}")
        elif f"second semi-final of the eurovision song contest {year}" in caption_text:
            html_by_stage["semi2"] = str(wikitable)
            logger.info(f"Year {year}: Semi-final 2 table found with caption: {caption_text}")

    # Fail on the first missing stage, checked in final -> semi 1 -> semi 2 order.
    if not html_by_stage["final"]:
        logger.error(f"Year {year}: Final table not found by caption filtering.")
        raise ValueError(f"Final table not found for year {year}.")
    if not html_by_stage["semi1"]:
        logger.error(f"Year {year}: Semi-final 1 table not found by caption filtering.")
        raise ValueError(f"Semi-final 1 table not found for year {year}.")
    if not html_by_stage["semi2"]:
        logger.error(f"Year {year}: Semi-final 2 table not found by caption filtering.")
        raise ValueError(f"Semi-final 2 table not found for year {year}.")

    return html_by_stage["final"], html_by_stage["semi1"], html_by_stage["semi2"]

def load_and_standardize_tables(final_table_html: str, semifinal1_table_html: str, semifinal2_table_html: str) -> Tuple[DataFrame, DataFrame, DataFrame]:
    """
    Turn the three HTML table strings into cached Spark DataFrames.

    Each string is parsed with pyspark.pandas.read_html (first table found),
    its Wikipedia column headers are renamed to the notebook's snake_case
    names, the index is reset, and the frame is converted to Spark. All three
    results are cached and counted before being returned.

    Returns:
        A tuple (final_df, semifinal1_df, semifinal2_df)

    Raises:
        Exception: Any parsing error is logged and re-raised.
    """
    html_sources = (final_table_html, semifinal1_table_html, semifinal2_table_html)
    try:
        final_psdf, semifinal1_psdf, semifinal2_psdf = [
            ps.read_html(StringIO(markup))[0] for markup in html_sources
        ]
    except Exception as e:
        logger.error(f"Error reading HTML tables: {e}")
        raise

    logger.info(f"Final table columns: {final_psdf.columns}")
    logger.info(f"Semi-final 1 table columns: {semifinal1_psdf.columns}")
    logger.info(f"Semi-final 2 table columns: {semifinal2_psdf.columns}")

    standard_names = {
        "R/O": "running_order",
        "Country": "country",
        "Artist": "artist",
        "Song": "song",
        "Points": "points",
        "Place": "place",
    }
    renamed_psdfs = [
        psdf.rename(columns=standard_names).reset_index(drop=True)
        for psdf in (final_psdf, semifinal1_psdf, semifinal2_psdf)
    ]
    spark_frames = tuple(psdf.to_spark() for psdf in renamed_psdfs)

    # Mark all three frames for caching first, then run a count() on each.
    for frame in spark_frames:
        frame.cache()
    for frame in spark_frames:
        frame.count()

    return spark_frames

def combine_final_and_semi(final_df: DataFrame, semifinal1_df: DataFrame, semifinal2_df: DataFrame, year: int) -> DataFrame:
    """
    Merge one year's final and semi-final tables into a single DataFrame.

    The two semi-finals are tagged with their number (1 or 2) and stacked;
    their points/place become ``sf_points``/``sf_place``. Finalists are
    left-joined to that stack on ``country`` so each keeps its semi-final
    figures (or nulls when it has none). Countries that appear only in a
    semi-final are appended with null ``place``/``points``. A ``year`` column
    is added last. ``running_order`` is not carried into the result.

    Returns:
         The combined Spark DataFrame for that year.
    """
    tagged_semi1 = semifinal1_df.withColumn("semi_final", F.lit(1))
    tagged_semi2 = semifinal2_df.withColumn("semi_final", F.lit(2))

    semi_df = (
        tagged_semi1.unionByName(tagged_semi2)
        .select(
            "country",
            "artist",
            "song",
            "semi_final",
            F.col("points").alias("sf_points"),
            F.col("place").alias("sf_place"),
        )
        .cache()
    )
    semi_df.count()

    # Artist and song exist on both sides of the join, so every column is
    # taken from an explicit parent frame.
    joined = final_df.join(semi_df, on="country", how="left")
    finalists = joined.select(
        final_df["country"],
        final_df["artist"],
        final_df["song"],
        final_df["place"],
        final_df["points"],
        semi_df["semi_final"],
        semi_df["sf_points"],
        semi_df["sf_place"],
    )

    output_columns = ["country", "artist", "song", "place", "points", "semi_final", "sf_points", "sf_place"]
    semi_only = (
        semi_df.join(final_df, on="country", how="left_anti")
        .withColumn("place", F.lit(None).cast(T.StringType()))
        .withColumn("points", F.lit(None).cast(T.StringType()))
        .select(*output_columns)
    )

    result_df = finalists.unionByName(semi_only).withColumn("year", F.lit(year))
    logger.info(f"Year {year}: Processed {result_df.count()} rows.")
    return result_df

def process_year(year: int, spark: SparkSession) -> DataFrame:
    """
    Run the full scrape-and-combine pipeline for one contest year.

    Steps, in order:
        1. Download and parse the Wikipedia page.
        2. Pull out the HTML of the final and both semi-final tables.
        3. Load those tables into standardized Spark DataFrames.
        4. Combine the final with the semi-final data.

    ``spark`` is accepted but not referenced in the body.

    Returns:
         A Spark DataFrame (with a "year" column) for the given year.
    """
    page = fetch_year_page(year)
    table_htmls = extract_table_htmls(page, year)
    stage_frames = load_and_standardize_tables(*table_htmls)
    return combine_final_and_semi(*stage_frames, year)


StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 31, Finished, Available, Finished)

In [26]:
def extract_participant_html(response_text: str, year: int, caption: str) -> str:
    """
    Extracts the HTML content of the participants table from the given response text.

    The response is parsed with BeautifulSoup and every table with the class
    "wikitable" is checked in document order. The first table whose caption
    contains ``caption`` (compared case-insensitively) is returned as an HTML
    string. Tables without a caption are ignored.

    Args:
        response_text (str): The HTML response text to search within.
        year (int): The year associated with the request (used in log and error messages only).
        caption (str): Text that must occur in the table caption, in any letter case.

    Returns:
        str: The HTML content of the participants table.

    Raises:
        ValueError: If no participants table is found in the response text.
    """
    soup = BeautifulSoup(response_text, "html.parser")
    # Case-fold both sides of the comparison. Previously the page caption was
    # lower()-ed while the requested caption was casefold()-ed, and the two
    # disagree for some characters (e.g. "ß"), which could hide a real match.
    wanted_caption = caption.casefold()
    participant_table_content: Optional[str] = None
    # Iterate over all wikitable tags to find the first one with a matching caption.
    for table in soup.find_all("table", class_="wikitable"):
        caption_tag = table.find("caption")
        if not caption_tag:
            continue
        raw_caption = caption_tag.get_text(strip=True)
        if wanted_caption in raw_caption.casefold():
            # The caption is logged lower-cased, as before.
            caption_text = raw_caption.lower()
            participant_table_content = str(table)
            logger.info(f"Year {year}: Participants table found with caption: {caption_text}")
            break
    if not participant_table_content:
        logger.warning(f"Year {year}: Participants table not found.")
        raise ValueError(
            f"Participants table not found for year {year}. Ensure the response contains a valid 'wikitable' with the appropriate caption."
        )
    return participant_table_content

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 32, Finished, Available, Finished)

In [27]:
def process_participants(year: int, spark: SparkSession, caption: str, timeout: float = 30.0) -> DataFrame:
    """
    Build a results-shaped DataFrame from a year's participants table on Wikipedia.

    Intended for years that have no results tables to scrape: the article
    for ``year`` is downloaded, the wikitable whose caption contains
    ``caption`` is extracted, and its rows are returned with the columns:
        country, artist, song,
    plus null placeholders for:
        place, points, semi_final, sf_points, sf_place,
    and a "year" column. Artist/song values starting with "TBA"
    (case-insensitive) are replaced by null.

    The table HTML is parsed with pyspark.pandas.read_html and then converted
    to a Spark DataFrame.

    Args:
        year: Contest year used to build the article URL and the "year" column.
        spark: Accepted for signature parity with process_year; not referenced in the body.
        caption: Caption text identifying the participants table.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Raises:
        ValueError: If the table has no Country/Artist/Song column, or
            (from extract_participant_html) if no matching table exists.
        Exception: Request and parsing errors are logged and re-raised.
    """
    # The whole article is downloaded; the participants table is located by its
    # caption afterwards, not by a URL fragment.
    url = f"https://en.wikipedia.org/wiki/Eurovision_Song_Contest_{year}"
    headers_req = {"User-Agent": "Mozilla/5.0"}
    logger.info(f"Year {year}: Fetching participants URL {url}")
    try:
        # requests waits indefinitely when no timeout is given, which would
        # stall the notebook on a hung connection.
        response = requests.get(url, headers=headers_req, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logger.error(f"Year {year}: Error fetching participants URL: {e}")
        raise

    try:
        participant_table_html = extract_participant_html(response.text, year, caption)
        participants_psdf = ps.read_html(StringIO(participant_table_html))[0]
    except Exception as e:
        logger.error(f"Year {year}: Error reading participants table: {e}")
        raise

    logger.info(f"Year {year}: Participants table columns: {participants_psdf.columns}")

    # Rename columns if necessary. We expect at least "Country", "Artist" and "Song".
    rename_mapping = {
        "Country": "country",
        "Artist": "artist",
        "Song": "song"
    }
    participants_psdf = participants_psdf.rename(columns=rename_mapping).reset_index(drop=True)

    for col_name in ["country", "artist", "song"]:
        if col_name not in participants_psdf.columns:
            logger.error(f"Year {year}: Expected column {col_name} not found in the participants table.")
            raise ValueError(f"Expected column {col_name} not found for year {year}.")

    participants_df = participants_psdf.to_spark(index_col=None)
    participants_df = participants_df.select("country", "artist", "song")

    # Add placeholder columns (set to null) for contest result fields, so the
    # frame can be unioned by name with the per-year results.
    placeholder_columns = [
        ("place", T.StringType()),
        ("points", T.StringType()),
        ("semi_final", T.IntegerType()),
        ("sf_points", T.StringType()),
        ("sf_place", T.StringType()),
    ]
    for placeholder_name, placeholder_type in placeholder_columns:
        participants_df = participants_df.withColumn(
            placeholder_name, F.lit(None).cast(placeholder_type)
        )

    # Replace "TBA" values in artist and song with null.
    participants_df = participants_df.withColumn(
        "artist",
        F.when(F.col("artist").rlike("(?i)^TBA.*"), None).otherwise(F.col("artist"))
    ).withColumn(
        "song",
        F.when(F.col("song").rlike("(?i)^TBA.*"), None).otherwise(F.col("song"))
    )

    # Add the year column.
    participants_df = participants_df.withColumn("year", F.lit(year))

    return participants_df

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 33, Finished, Available, Finished)

In [28]:
# Read distinct years from the 'playlists' table (which contains a column 'Year').
# Collect the contest years to scrape from the 'Year' column of the 'playlists' table.
# Any failure in this cell is logged and re-raised.
try:
    playlists_df = (
        spark.read.format("delta")
        .load(PLAYLISTS_TABLE)
        .filter("Year <> 2020")  # Removed as event did not happen
    )
    distinct_years = [
        int(year_row["Year"])
        for year_row in playlists_df.select("Year").distinct().collect()
    ]
    logger.info(f"Found years from 'playlists' table: {distinct_years}")

    # Up to May 17 of the running year, that year is dropped from the list of
    # years whose results are scraped.
    today = datetime.today()
    current_year = today.year
    if today <= datetime(current_year, 5, 17) and current_year in distinct_years:
        distinct_years.remove(current_year)
        logger.info(f"Current year {current_year} removed because today's date is before May 17.")
except Exception as e:
    logger.error(f"Error reading playlists table: {e}")
    raise
        

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 34, Finished, Available, Finished)

2025-04-28 21:00:47,190 - INFO - Found years from 'playlists' table: [2021, 2022, 2024, 2025, 2023]
2025-04-28 21:00:47,191 - INFO - Current year 2025 removed because today's date is before May 17.


In [29]:
results = []

# Process multiple years in parallel using ThreadPoolExecutor.
# max_workers must be at least 1: ThreadPoolExecutor raises ValueError for 0,
# which would hide the "No data processed" exit below when there are no years.
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(distinct_years))) as executor:
    future_to_year = {executor.submit(process_year, year, spark): year for year in distinct_years}
    for future in concurrent.futures.as_completed(future_to_year):
        yr = future_to_year[future]
        try:
            df_year = future.result()
            results.append(df_year)
            logger.info(f"Year {yr} processed successfully.")
        except Exception as e:
            # Best effort: a failing year is logged and skipped; the others still load.
            logger.error(f"Year {yr} generated an exception: {e}")

if not results:
    message = "No data processed for any year."
    logger.error(message)
    notebookutils.notebook.exit(message)

# Union all yearly DataFrames
union_df = results[0]
for df in results[1:]:
    union_df = union_df.unionByName(df)

# 2020 is excluded from the results scrape (event did not happen), so only its
# selected participants are loaded.
results_2020 = process_participants(2020, spark, "Eurovision Song Contest 2020 selected participants")
union_df = union_df.unionByName(results_2020)

# Up to May 17 the running year is not scraped for results, so load its
# participants instead. The year and caption follow current_year (the same
# value the date check uses) rather than a hard-coded 2025, which would fetch
# the wrong year's participants in any later year.
if today <= datetime(current_year, 5, 17):
    results_current_year = process_participants(
        current_year, spark, f"eurovision song contest {current_year} participants"
    )
    union_df = union_df.unionByName(results_current_year)

# Strip the quotation marks that surround song titles.
union_df = union_df.withColumn("song", F.regexp_replace(F.col("song"), '^"+|"+$', ''))

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 35, Finished, Available, Finished)

2025-04-28 21:00:48,887 - INFO - Year 2021: Fetching URL https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2021#Contest_overview
2025-04-28 21:00:48,889 - INFO - Year 2022: Fetching URL https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2022#Contest_overview
2025-04-28 21:00:48,890 - INFO - Year 2024: Fetching URL https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2024#Contest_overview
2025-04-28 21:00:48,891 - INFO - Year 2023: Fetching URL https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2023#Contest_overview
2025-04-28 21:00:51,514 - INFO - Year 2021: Semi-final 1 table found with caption: results of the first semi-final of the eurovision song contest 2021[109]
2025-04-28 21:00:51,552 - INFO - Year 2024: Semi-final 1 table found with caption: first semi-final of the eurovision song contest 2024[148]
2025-04-28 21:00:51,600 - INFO - Year 2021: Semi-final 2 table found with caption: results of the second semi-final of the eurovision song contest 2021[115]
2025-04-28 21:

### Merge data

In [33]:
def check_table_and_create_if_not_exists(table_path: str, schema: T.StructType) -> None:
    """
    Make sure a Delta table exists at ``table_path``, creating an empty one if needed.

    The path is split at its last "/" into a directory and a table name. The
    catalog database whose ``locationUri`` equals that directory is looked up,
    and the table is checked in that database. When it is missing, an empty
    DataFrame with ``schema`` is written to ``table_path`` in Delta format.

    Args:
        table_path: Full path of the table; the last path segment is the table name.
        schema: Schema used for the empty table when it has to be created.

    Raises:
        ValueError: If no catalog database is located at the table's parent directory.
    """
    directory, table_name = table_path.rsplit('/', 1)
    # First database located at the table's parent directory, or None.
    database = next(
        (db.name for db in spark.catalog.listDatabases() if db.locationUri == directory),
        None,
    )
    if database is None:
        # Previously this surfaced as a bare "list index out of range".
        message = f"No catalog database found with location '{directory}' for table path '{table_path}'."
        logger.error(message)
        raise ValueError(message)

    logger.debug(f"Checking table {database}.{table_name}")
    if not spark.catalog.tableExists(f"`{database}`.`{table_name}`"):
        logger.info("Table doesn't exists, starting to create")
        df = spark.createDataFrame([], schema)
        df.write.format("delta").save(table_path)
        logger.info("Table creation finished")
    else:
        logger.debug("Table already exists")

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 39, Finished, Available, Finished)

In [35]:
def merge_results_data(union_df: DataFrame, table_path: str) -> None:
    """
    Upsert the scraped contest results into the results Delta table.

    The target table is created first if it does not exist, using the schema
    of ``union_df`` plus the audit columns ``_modified_date`` and
    ``_created_date``. Rows are matched on (country, song, year):
      - matched rows whose artist, place, points, semi_final, sf_points or
        sf_place differ are updated and get a new ``_modified_date``;
      - unmatched rows are inserted with both audit timestamps set.
    All comparisons are null-safe, so a null placeholder later replaced by a
    real value counts as a change, and a row with a null song matches its
    earlier null-song counterpart instead of being inserted again.
    After the merge the row metrics of the last commit are logged and the
    table is compacted.

    Args:
        union_df: The DataFrame containing the combined results for all years.
        table_path: The Delta table ABFS path to merge into.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    try:
        audit_fields = [
            T.StructField("_modified_date", T.TimestampType(), True),
            T.StructField("_created_date", T.TimestampType(), True),
        ]
        # Build a fresh StructType: StructType.add() mutates the object it is
        # called on, which here would be union_df's own schema object.
        new_schema = T.StructType(list(union_df.schema.fields) + audit_fields)
        check_table_and_create_if_not_exists(table_path, new_schema)

        tracked_columns = ["artist", "place", "points", "semi_final", "sf_points", "sf_place"]
        # "<>" yields NULL when either side is NULL, so NULL -> value changes
        # were never detected; NOT (a <=> b) is the null-safe "is different".
        change_condition = " OR ".join(
            f"NOT (target.{column} <=> source.{column})" for column in tracked_columns
        )
        update_set = {column: f"source.{column}" for column in tracked_columns}
        update_set["_modified_date"] = "current_timestamp()"

        target_table = DeltaTable.forPath(spark, table_path)
        logger.info("Merging data started")
        target_table.alias("target").merge(
            union_df.alias("source"),
            # song can be NULL (e.g. not announced yet); "<=>" lets such rows
            # match themselves on re-runs instead of being inserted again.
            """
            target.country = source.country 
            AND target.song <=> source.song 
            AND target.year = source.year
            """
        ).whenMatchedUpdate(
            condition=change_condition,
            set=update_set
        ).whenNotMatchedInsert(
            values={
                "country": "source.country",
                "artist": "source.artist",
                "song": "source.song",
                "place": "source.place",
                "points": "source.points",
                "semi_final": "source.semi_final",
                "sf_points": "source.sf_points",
                "sf_place": "source.sf_place",
                "year": "source.year",
                "_created_date": "current_timestamp()",
                "_modified_date": "current_timestamp()"
            }
        ).execute()
        logger.info("Merging data finished")

        # The most recent history entry is the merge that just ran.
        lastCommit = target_table.history(1).collect()[0]
        metrics = lastCommit["operationMetrics"]

        numInserted = int(metrics.get("numTargetRowsInserted", 0))
        numUpdated = int(metrics.get("numTargetRowsUpdated", 0))
        numDeleted = int(metrics.get("numTargetRowsDeleted", 0))

        logger.info(f"Rows inserted: {numInserted}")
        logger.info(f"Rows updated: {numUpdated}")
        logger.info(f"Rows deleted: {numDeleted}")
        try:
            logger.info("Start optimize")
            target_table.optimize().executeCompaction()
            logger.info("Finished optimize")
        except Exception:
            logger.error("Failed to optimize")
            raise

    except Exception as e:
        logger.exception(f"Exception details: {str(e)}")
        raise
        


StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 41, Finished, Available, Finished)

In [36]:
# Upsert the combined per-year DataFrame into the results Delta table.
merge_results_data(union_df,RESULTS_TABLE)

StatementMeta(, 03e79284-5638-40ba-9a83-6ef57b8fc077, 42, Finished, Available, Finished)

2025-04-28 21:02:06,922 - INFO - Merging data started
2025-04-28 21:02:18,337 - INFO - Merging data finished
2025-04-28 21:02:19,009 - INFO - Rows inserted: 5
2025-04-28 21:02:19,009 - INFO - Rows updated: 1
2025-04-28 21:02:19,010 - INFO - Rows deleted: 0
2025-04-28 21:02:19,011 - INFO - Start optimize
2025-04-28 21:02:21,694 - INFO - Finished optimize
