# Import and install

In [None]:
#%pip install google-api-python-client


In [None]:
import googleapiclient.discovery
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import lit
import math

# Setup and initialise

In [None]:
# Google API to talk to: service name and version.
api_service_name, api_version = "youtube", "v3"
# Placeholder value - replace with a real API key before running.
DEVELOPER_KEY = "YOUR API KEY HERE"


In [None]:
# Build the API client used by every request below.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=DEVELOPER_KEY,
)


In [None]:
# Load the playlist ids and their years from the Raw.playlists table.
playlists = spark.sql("SELECT PlaylistId, Year FROM Raw.playlists")

# API Requests

## Get videos from playlist

In [None]:
# Column layout for the playlist-video rows: three nullable string columns.
schema = StructType([
    StructField(columnName, StringType(), True)
    for columnName in ("id", "title", "playlistId")
])

In [None]:
def getVideos(playlistId):
    """Fetch every video of a YouTube playlist as a Spark DataFrame.

    Queries ``playlistItems.list`` through the module-level ``youtube``
    client, one page of up to 50 items per request, and keeps following
    the pagination until the API reports no further page. Reading only the
    first page would silently drop everything after the 50th entry.

    Args:
        playlistId: Id of the playlist to read.

    Returns:
        DataFrame with the string columns ``id`` (video id), ``title`` and
        ``playlistId`` (the argument, repeated on every row).
    """
    schema = StructType([
            StructField("id", StringType(), True),
            StructField("title", StringType(), True)
        ])

    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        maxResults=50,
        playlistId=playlistId
    )

    rows = []
    # list_next() builds the request for the following page and returns
    # None once the last page has been consumed.
    while request is not None:
        response = request.execute()
        rows.extend(
            (d['snippet']['resourceId']['videoId'], d['snippet']['title'])
            for d in response.get('items', [])
        )
        request = youtube.playlistItems().list_next(request, response)

    df = spark.createDataFrame(rows, schema=schema)

    return df.withColumn("playlistId", lit(playlistId))

In [None]:
# Empty accumulator for the per-playlist results. `schema` must still be the
# three-column playlist-video schema here; the name is rebound further down.
df = spark.createDataFrame(data=[], schema=schema)

In [None]:
# Fetch the videos of every playlist and append them to `df`.
# DataFrame.collect() already returns the rows as a list; going through
# `.rdd` first adds nothing and fails where the RDD API is unavailable
# (e.g. Spark Connect sessions).
for playlist in playlists.collect():
    print(f"Getting videos for {playlist.Year}")
    newDf = getVideos(playlist.PlaylistId)
    # count() runs a Spark job; it is used here only for the progress message.
    print(f"Obtained {newDf.count()} videos for {playlist.Year}")
    df = df.union(newDf)

## Get video details

In [None]:
# Bring the video ids to the driver as a plain Python list of strings.
# The column is selected by its exact schema name ("id") so the lookup does
# not depend on case-insensitive column resolution, and the rows are
# collected directly instead of via `.rdd`, which is unavailable where the
# RDD API is disabled (e.g. Spark Connect sessions).
videoIds = [row[0] for row in df.select('id').collect()]

In [None]:
# Column layout for the video statistics: four nullable string columns
# (the counts are converted to numbers in a later step).
schema = StructType([
    StructField(columnName, StringType(), True)
    for columnName in ("videoId", "viewCount", "likeCount", "commentCount")
])

In [None]:
def getVideoDetails(videoIds):
    """Fetch view, like and comment counts for the given videos.

    Queries ``videos.list`` (part ``statistics``) through the module-level
    ``youtube`` client in batches of 50 ids and gathers all results into a
    single DataFrame.

    Args:
        videoIds: List of video id strings. An empty list produces an empty
            DataFrame without issuing any request.

    Returns:
        DataFrame with the string columns ``videoId``, ``viewCount``,
        ``likeCount`` and ``commentCount``. A count is null when the API
        response does not contain it for that video (for example when
        likes are hidden or comments are disabled).
    """
    batchSize = 50
    # Defined locally so the result does not depend on whichever value the
    # notebook-level name `schema` happens to hold when this runs.
    detailsSchema = StructType([
            StructField("videoId", StringType(), True),
            StructField("viewCount", StringType(), True),
            StructField("likeCount", StringType(), True),
            StructField("commentCount", StringType(), True)
        ])

    rows = []
    for batchMin in range(0, len(videoIds), batchSize):
        batch = videoIds[batchMin:batchMin + batchSize]
        # Running total of ids requested so far (the last batch may be short).
        print(f"Getting the details for {batchMin + len(batch)} items")
        request = youtube.videos().list(
                part="statistics",
                # The API documents `id` as a comma-separated list of ids.
                id=",".join(batch)
            )
        response = request.execute()

        for d in response.get('items', []):
            stats = d.get('statistics', {})
            # .get() keeps the video when an individual statistic is absent
            # instead of raising KeyError and aborting the whole run.
            rows.append((
                d['id'],
                stats.get('viewCount'),
                stats.get('likeCount'),
                stats.get('commentCount'),
            ))

    # One DataFrame built from all rows replaces a union per batch.
    return spark.createDataFrame(rows, schema=detailsSchema)

In [None]:
# Fetch the statistics for every video id collected above.
videoDetailsDf = getVideoDetails(videoIds)

### Convert data type

In [None]:
# The counts are held as strings; convert them to 64-bit integers.
# A 32-bit IntegerType tops out at 2,147,483,647, so a larger view count
# would be turned into null (or raise under ANSI mode) by the cast.
videoDetailsDf = videoDetailsDf.withColumn('viewCount', videoDetailsDf.viewCount.cast("long"))
videoDetailsDf = videoDetailsDf.withColumn('likeCount', videoDetailsDf.likeCount.cast("long"))
videoDetailsDf = videoDetailsDf.withColumn('commentCount', videoDetailsDf.commentCount.cast("long"))

## Join dataframes

In [None]:
# Attach the statistics to each playlist video. The left outer join keeps
# videos without statistics; the duplicate key column is dropped afterwards.
joinedDf = (
    df.join(
        videoDetailsDf,
        on=df.id == videoDetailsDf.videoId,
        how="leftouter",
    )
    .drop("videoId")
)

# Write data

In [None]:
# Replace the Raw.videos Delta table, including its schema, with the result.
(
    joinedDf.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .saveAsTable("Raw.videos")
)