## Import and install

In [1]:
import googleapiclient.discovery
import logging

from typing import List, Any
from delta import DeltaTable
from pyspark.sql import DataFrame, Row
import pyspark.sql.functions as F
import pyspark.sql.types as T

StatementMeta(, , -1, SessionStarting, , SessionStarting)

## Setup and initialise

In [None]:
%run Google-Helpers

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# YouTube Data API settings; these three values are passed to
# build_youtube_client() when the client is created.
API_SERVICE_NAME = "youtube"
API_VERSION = "v3"
# NOTE(review): the key is blank in this file - presumably it is filled in
# (or should be read from a secret store) before the notebook runs; confirm.
DEVELOPER_KEY = ""

# Delta table locations (abfss paths): VIDEOS_TABLE is the merge target,
# PLAYLISTS_TABLE is the input read by load_playlists().
VIDEOS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/videos"
PLAYLISTS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/playlists" 

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# Shared logger and YouTube API client used by every function below.
# NOTE(review): setup_logger and build_youtube_client are not defined in this
# file - presumably they come from the `%run Google-Helpers` cell; confirm.
logger = setup_logger()
youtube_client = build_youtube_client(API_SERVICE_NAME,API_VERSION,DEVELOPER_KEY)

StatementMeta(, , -1, Waiting, , Waiting)

### Schemas

In [None]:
# Schema of the rows produced by get_videos(): all nullable strings.
video_schema = T.StructType([
    T.StructField(column, T.StringType(), True)
    for column in ("id", "title", "playlistId")
])

# Schema of the rows produced by fetch_video_details(): a string key followed
# by three nullable integer counters.
# NOTE(review): IntegerType is 32-bit; confirm no counter can exceed that range.
details_schema = T.StructType(
    [T.StructField("videoId", T.StringType(), True)]
    + [
        T.StructField(counter, T.IntegerType(), True)
        for counter in ("viewCount", "likeCount", "commentCount")
    ]
)

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
def load_playlists() -> DataFrame:
    """
    Read the playlists Delta table at PLAYLISTS_TABLE.

    Returns:
        A DataFrame containing playlists.

    Raises:
        Exception: Any error raised while reading is logged and re-raised unchanged.
    """
    try:
        logger.info("Loading playlists data")
        delta_reader = spark.read.format("delta")
        loaded = delta_reader.load(PLAYLISTS_TABLE)
        logger.debug("Finished loading playlists")
        return loaded
    except Exception:
        logger.exception("Failed to load playlists data")
        raise

# Playlists DataFrame; its rows are the input of the video retrieval step.
playlists = load_playlists()

StatementMeta(, , -1, Waiting, , Waiting)

# API Requests

## Get videos from playlist

In [None]:
def get_videos(playlist: Row) -> List[tuple]:
    """
    Fetch all public videos of a playlist using the YouTube API.

    The playlistItems endpoint returns at most 50 items per call, so every
    result page is followed until the API reports that there are no more.

    Args:
        playlist: A Row object expected to have attributes `PlaylistId` and `Year`.

    Returns:
        A list of tuples containing (videoId, title, playlistId).

    Raises:
        RuntimeError: If any request or response handling fails; the original
            exception is attached as the cause.
    """
    playlist_id = playlist.PlaylistId
    try:
        logger.debug(f"Loading playlist items for playlist {playlist_id} ({playlist.Year})")
        playlist_items = youtube_client.playlistItems()
        request = playlist_items.list(
            part="snippet,contentDetails,status",
            maxResults=50,
            playlistId=playlist_id
        )
        data = []
        # list_next() returns None once the last page has been consumed.
        while request is not None:
            response = request.execute()
            data.extend(
                (
                    d['snippet']['resourceId']['videoId'],
                    d['snippet']['title'],
                    playlist_id
                )
                for d in response.get('items', [])
                if d["status"]["privacyStatus"] == 'public'
            )
            request = playlist_items.list_next(request, response)
        return data
    except Exception as e:
        error_message = f"Failed to load videos for playlist {playlist_id} ({playlist.Year}): {str(e)}"
        logger.exception(error_message)
        # Chain the cause so the underlying API error stays in the traceback.
        raise RuntimeError(error_message) from e

def parallel_process(playlists_rdd) -> DataFrame:
    """
    Build the videos DataFrame by applying get_videos to every playlist row.

    Args:
        playlists_rdd: RDD of playlist rows.

    Returns:
        A DataFrame containing video data, shaped by video_schema.

    Raises:
        RuntimeError: If building the RDD or the DataFrame fails.
    """
    try:
        # get_videos returns a list per playlist; flatMap flattens them into rows.
        flattened = playlists_rdd.flatMap(get_videos)
        videos = spark.createDataFrame(flattened, schema=video_schema)
    except Exception as exc:
        logger.exception(f"Error during parallel processing of playlists: {str(exc)}")
        raise RuntimeError("Parallel processing failed.") from exc
    return videos


StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# The DataFrame is lazy and backed by live YouTube API calls. Cache it so the
# count below materialises it once and later actions reuse the stored rows
# instead of calling the API again for every playlist.
video_df = parallel_process(playlists.rdd).cache()
logger.info(f"Total videos retrieved: {video_df.count()}")

StatementMeta(, , -1, Waiting, , Waiting)

## Get video details

In [None]:
def fetch_video_details(video_ids: List[str], batch_size: int = 50) -> List[tuple]:
    """
    Fetch video statistics from the YouTube API for the provided video IDs.

    The IDs are sent in consecutive batches of `batch_size`. Counters missing
    from a response item (or a missing `statistics` section) are reported as 0.

    Args:
        video_ids: List of video IDs.
        batch_size: Number of IDs to process per API call; must be at least 1.
            NOTE(review): the API is believed to cap the number of IDs per
            call - confirm before raising this above the default.

    Returns:
        List of tuples: (videoId, viewCount, likeCount, commentCount)

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        RuntimeError: If a batch request fails; the message names only the
            failing batch and the original exception is attached as the cause.
    """
    if batch_size < 1:
        # A zero step makes range() fail and a negative one silently yields
        # no batches, so reject both explicitly.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []
    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        try:
            logger.debug("Fetching video details for a batch")
            request = youtube_client.videos().list(
                part="statistics",
                id=",".join(batch)
            )
            response = request.execute()
            for item in response.get('items', []):
                stats = item.get('statistics', {})
                results.append(
                    (
                        item['id'],
                        int(stats.get('viewCount', 0)),
                        int(stats.get('likeCount', 0)),
                        int(stats.get('commentCount', 0))
                    )
                )
        except Exception as e:
            # Report only the batch that failed, not the whole ID list.
            error_message = f"Failed to fetch video details for batch {batch}. Error: {str(e)}"
            logger.exception(error_message)
            raise RuntimeError(error_message) from e
    return results

def process_partition(video_ids_partition) -> List[tuple]:
    """
    Materialise one partition of video IDs and fetch their details.

    Args:
        video_ids_partition: A partition (iterator) of video IDs.

    Returns:
        A list of tuples containing video details; empty for an empty partition.

    Raises:
        RuntimeError: If anything fails while processing the partition.
    """
    try:
        ids = [video_id for video_id in video_ids_partition]
        logger.debug(f"Processing a partition of {len(ids)} video IDs")
        # Skip the API round trip entirely when the partition holds no IDs.
        return fetch_video_details(ids) if ids else []
    except Exception as exc:
        logger.exception(f"Error processing video IDs partition: {str(exc)}")
        raise RuntimeError("Parallel processing of video details failed.") from exc

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:

try:
    # Process video details in parallel using RDD
    video_ids_rdd = video_df.select("id").rdd.map(lambda row: row.id)
    video_details_rdd = video_ids_rdd.mapPartitions(process_partition)
    # A video listed in several playlists has its details fetched once per
    # occurrence; keep one details row per video so the join below does not
    # multiply rows.
    video_details_df = spark.createDataFrame(
        video_details_rdd, schema=details_schema
    ).dropDuplicates(["videoId"])

    # Join video data with details.
    # The result is cached because it is backed by live API calls: without
    # the cache, the count below and every later action on joined_df would
    # re-run the requests and could see different data.
    joined_df = video_df.join(
        video_details_df, video_df.id == video_details_df.videoId, "leftouter"
    ).drop("videoId").cache()
    logger.info(f"Total videos processed with details: {joined_df.count()}")

except Exception as e:
    logger.exception(f"Error during parallel processing: {str(e)}")
    # Bare raise keeps the original traceback intact.
    raise


StatementMeta(, , -1, Waiting, , Waiting)

# Write data

In [None]:
def merge_videos_data(joined_df: DataFrame, table_path: str) -> None:
    """
    Synchronise the Delta table at `table_path` with the joined video data.

    Rows are matched on `id`: matches are updated, unmatched source rows are
    inserted, and target rows with no source match are deleted. The row
    counts of the resulting commit are logged afterwards.

    Args:
        joined_df: The DataFrame containing the new video data and details.
        table_path: The Delta table path to merge into.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    # Columns copied from the source on both update and insert.
    copied_columns = {
        "title": "source.title",
        "playlistId": "source.playlistId",
        "viewCount": "source.viewCount",
        "likeCount": "source.likeCount",
        "commentCount": "source.commentCount",
    }
    update_assignments = {
        **copied_columns,
        "_modified_date": "current_timestamp()",
    }
    insert_assignments = {
        "id": "source.id",
        **copied_columns,
        "_created_date": "current_timestamp()",
        "_modified_date": "current_timestamp()",
    }

    try:
        target_table = DeltaTable.forPath(spark, table_path)
        logger.info("Merging data started")
        merge_builder = target_table.alias("target").merge(
            joined_df.alias("source"),
            "target.id = source.id"
        )
        merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
        merge_builder = merge_builder.whenNotMatchedInsert(values=insert_assignments)
        merge_builder = merge_builder.whenNotMatchedBySourceDelete()
        merge_builder.execute()
        logger.info("Merging data finished")

        # The most recent history entry is the commit written by the merge above.
        latest_commit = target_table.history(1).collect()[0]
        operation_metrics = latest_commit["operationMetrics"]

        inserted_count = int(operation_metrics.get("numTargetRowsInserted", 0))
        updated_count = int(operation_metrics.get("numTargetRowsUpdated", 0))
        deleted_count = int(operation_metrics.get("numTargetRowsDeleted", 0))

        for label, count in (
            ("inserted", inserted_count),
            ("updated", updated_count),
            ("deleted", deleted_count),
        ):
            logger.info(f"Rows {label}: {count}")
    except Exception as exc:
        logger.exception("Exception details: %s", str(exc))
        raise

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# Write the joined video data into the videos Delta table.
merge_videos_data(joined_df, VIDEOS_TABLE)

StatementMeta(, , -1, Waiting, , Waiting)