### Setup

In [2]:
import logging

from delta import DeltaTable

from pyspark.sql import DataFrame
import pyspark.sql.functions as F

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 4, Finished, Available, Finished)

In [11]:
%run Helpers

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 15, Finished, Available, Finished)

In [12]:
# Notebook-wide logger used by the merge step further down.
# NOTE(review): setup_logger is not defined in this notebook's visible cells; presumably it is
# provided by the `%run Helpers` cell above — confirm.
logger = setup_logger()

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 16, Finished, Available, Finished)

In [4]:
# Both Delta tables sit under the same abfss:// container on OneLake; only the GUID segment
# that follows the host differs between the raw and the enhanced location.
_ONELAKE_ROOT = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com"

# Source table read by this notebook.
RAW_VIDEOS_TABLE = f"{_ONELAKE_ROOT}/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/videos"
# Target table the enriched rows are merged into.
ENHANCED_VIDEOS_TABLE = f"{_ONELAKE_ROOT}/1ead427c-19d4-417e-bb8f-a68d9adc0f38/Tables/videos"

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 7, Finished, Available, Finished)

### Load videos

In [7]:
def add_country_column(df: DataFrame, title_col: str = "title") -> DataFrame:
    """
    Adds a 'country' column to the DataFrame by extracting a country name from the title column.

    Each title is tested against a fixed list of case-insensitive, whole-word country
    patterns. The patterns are chained into a single CASE WHEN expression, so the first
    pattern in the list that matches decides the value. Titles matching none of the
    patterns get the country "Unknown".

    Args:
        df (DataFrame): Input DataFrame containing the video data.
        title_col (str): Name of the column containing the title.

    Returns:
        DataFrame: The input DataFrame with an additional column 'country'.
    """
    # List of tuples: (regex pattern, standardized country name).
    # ORDER MATTERS: evaluation is first-match-wins, so a pattern that requires several
    # words must be listed before any single-word pattern it contains.
    conditions = [
        (r"(?i).*\bAlbania\b.*", "Albania"),
        (r"(?i).*\bAndorra\b.*", "Andorra"),
        (r"(?i).*\bArmenia\b.*", "Armenia"),
        (r"(?i).*\bAustralia\b.*", "Australia"),
        (r"(?i).*\bAustria\b.*", "Austria"),
        (r"(?i).*\bAzerbaijan\b.*", "Azerbaijan"),
        (r"(?i).*\bBelarus\b.*", "Belarus"),
        (r"(?i).*\bBelgium\b.*", "Belgium"),
        (r"(?i).*\bBosnia\b.*\bHerzegovina\b.*", "Bosnia and Herzegovina"),
        (r"(?i).*\bBulgaria\b.*", "Bulgaria"),
        (r"(?i).*\bCroatia\b.*", "Croatia"),
        (r"(?i).*\bCyprus\b.*", "Cyprus"),
        (r"(?i).*\bCzech Republic\b.*", "Czechia"),
        (r"(?i).*\bCzechia\b.*", "Czechia"),
        (r"(?i).*\bDenmark\b.*", "Denmark"),
        (r"(?i).*\bEstonia\b.*", "Estonia"),
        (r"(?i).*\bFinland\b.*", "Finland"),
        (r"(?i).*\bFrance\b.*", "France"),
        (r"(?i).*\bGeorgia\b.*", "Georgia"),
        (r"(?i).*\bGermany\b.*", "Germany"),
        (r"(?i).*\bGreece\b.*", "Greece"),
        (r"(?i).*\bHungary\b.*", "Hungary"),
        (r"(?i).*\bIceland\b.*", "Iceland"),
        (r"(?i).*\bIreland\b.*", "Ireland"),
        (r"(?i).*\bIsrael\b.*", "Israel"),
        (r"(?i).*\bItaly\b.*", "Italy"),
        (r"(?i).*\bLatvia\b.*", "Latvia"),
        (r"(?i).*\bLithuania\b.*", "Lithuania"),
        (r"(?i).*\bLuxembourg\b.*", "Luxembourg"),
        (r"(?i).*\bMalta\b.*", "Malta"),
        (r"(?i).*\bMoldova\b.*", "Moldova"),
        (r"(?i).*\bMonaco\b.*", "Monaco"),
        # Must come before both "Montenegro" and "Serbia": every title that matches this
        # pattern also matches those two, so listing it later would make it unreachable.
        (r"(?i).*\bSerbia\b.*\bMontenegro\b.*", "Serbia and Montenegro"),
        (r"(?i).*\bMontenegro\b.*", "Montenegro"),
        (r"(?i).*\bMorocco\b.*", "Morocco"),
        (r"(?i).*\bNetherlands\b.*", "Netherlands"),
        (r"(?i).*\bNorth Macedonia\b.*", "North Macedonia"),
        (r"(?i).*\bNorway\b.*", "Norway"),
        (r"(?i).*\bPoland\b.*", "Poland"),
        (r"(?i).*\bPortugal\b.*", "Portugal"),
        (r"(?i).*\bRomania\b.*", "Romania"),
        (r"(?i).*\bRussia\b.*", "Russia"),
        (r"(?i).*\bSan Marino\b.*", "San Marino"),
        (r"(?i).*\bSerbia\b.*", "Serbia"),
        (r"(?i).*\bSlovakia\b.*", "Slovakia"),
        (r"(?i).*\bSlovenia\b.*", "Slovenia"),
        (r"(?i).*\bSpain\b.*", "Spain"),
        (r"(?i).*\bSweden\b.*", "Sweden"),
        (r"(?i).*\bSwitzerland\b.*", "Switzerland"),
        (r"(?i).*\bTurkey\b.*", "Turkey"),
        (r"(?i).*\bUkraine\b.*", "Ukraine"),
        (r"(?i).*\bUnited Kingdom\b.*", "United Kingdom"),
        (r"(?i).*\bYugoslavia\b.*", "Yugoslavia")
    ]

    country_expr = None
    # Fold the patterns into one chained when(...).when(...) expression, preserving list order.
    for pattern, country in conditions:
        condition = F.col(title_col).rlike(pattern)
        if country_expr is None:
            # The first branch has to start the chain with F.when.
            country_expr = F.when(condition, F.lit(country))
        else:
            country_expr = country_expr.when(condition, F.lit(country))

    # If no condition is met, set the country to "Unknown".
    country_expr = country_expr.otherwise(F.lit("Unknown"))

    return df.withColumn("country", country_expr)

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 10, Finished, Available, Finished)

In [8]:
# Read the raw videos Delta table, derive the country from each title, and keep only the
# columns that the merge step writes to the enhanced table.
videos_df = add_country_column(
    spark.read.format("delta").load(RAW_VIDEOS_TABLE)
).select(
    "id",
    "title",
    "playlistId",
    "viewCount",
    "likeCount",
    "commentCount",
    "country",
    "_created_date",
    "_modified_date",
)

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 11, Finished, Available, Finished)

### Write data

In [9]:
def merge_videos_data(videos_df: DataFrame, table_path: str) -> None:
    """
    Synchronise the Delta table at ``table_path`` with the enhanced video data.

    Rows are matched on ``id``. Matched rows are updated, source rows without a match are
    inserted, and target rows that have no counterpart in the source are deleted. After the
    merge, the insert/update/delete counts are read from the newest history entry of the
    table and logged.

    Args:
        videos_df: The DataFrame containing the new video data and details.
        table_path: The Delta ABFS path name to merge into.

    Raises:
        Exception: Any error raised while merging or reading the metrics is logged and
            re-raised unchanged.
    """
    # Columns overwritten on an existing row; "id" and "_created_date" are left as stored.
    update_columns = [
        "title",
        "playlistId",
        "viewCount",
        "likeCount",
        "commentCount",
        "country",
        "_modified_date",
    ]
    # Columns populated when a brand-new row is inserted.
    insert_columns = [
        "id",
        "title",
        "playlistId",
        "viewCount",
        "likeCount",
        "commentCount",
        "country",
        "_created_date",
        "_modified_date",
    ]
    update_assignments = {name: f"source.{name}" for name in update_columns}
    insert_assignments = {name: f"source.{name}" for name in insert_columns}

    try:
        delta_target = DeltaTable.forPath(spark, table_path)
        logger.info("Merging data started")

        merge_builder = delta_target.alias("target").merge(
            videos_df.alias("source"),
            "target.id = source.id"
        )
        merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
        merge_builder = merge_builder.whenNotMatchedInsert(values=insert_assignments)
        merge_builder = merge_builder.whenNotMatchedBySourceDelete()
        merge_builder.execute()

        logger.info("Merging data finished")

        # The newest history entry is taken to be the merge that just ran.
        latest_commit = delta_target.history(1).collect()[0]
        operation_metrics = latest_commit["operationMetrics"]

        inserted_count = int(operation_metrics.get("numTargetRowsInserted", 0))
        updated_count = int(operation_metrics.get("numTargetRowsUpdated", 0))
        deleted_count = int(operation_metrics.get("numTargetRowsDeleted", 0))

        logger.info(f"Rows inserted: {inserted_count}")
        logger.info(f"Rows updated: {updated_count}")
        logger.info(f"Rows deleted: {deleted_count}")
    except Exception as e:
        logger.exception("Exception details: %s", str(e))
        raise

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 12, Finished, Available, Finished)

In [13]:
# Push the enriched videos into the enhanced Delta table.
merge_videos_data(videos_df=videos_df, table_path=ENHANCED_VIDEOS_TABLE)

StatementMeta(, 1119c858-9fff-45c3-b1e6-0105764336fb, 17, Finished, Available, Finished)

2025-04-28 21:15:42,977 - INFO - Merging data started
2025-04-28 21:15:54,929 - INFO - Merging data finished
2025-04-28 21:15:55,727 - INFO - Rows inserted: 0
2025-04-28 21:15:55,727 - INFO - Rows updated: 231
2025-04-28 21:15:55,728 - INFO - Rows deleted: 0
