### **Import Required Packages**

In [1]:
# !pip install isodate

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, to_timestamp, udf, lit, when,
    current_timestamp
)
from pyspark.sql.types import *
import isodate
import json
print("Packages imported successfully!")

Packages imported successfully!


In [3]:
# UDF to convert ISO8601 duration to seconds
def parse_duration(iso_duration):
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Args:
        iso_duration: Duration text such as "PT3M45S", or None.

    Returns:
        The duration in seconds as an int, or None when the input is None
        or cannot be parsed.
    """
    # Null column values reach the UDF as None; there is nothing to parse.
    if iso_duration is None:
        return None
    try:
        duration = isodate.parse_duration(iso_duration)
        return int(duration.total_seconds())
    except Exception:
        # Best-effort parsing: a malformed value becomes a null cell rather
        # than failing the job. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

# NOTE: this name is rebound later in the notebook to a regex-based parser.
parse_duration_udf = udf(parse_duration, IntegerType())

In [4]:
# Create (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("TransformYouTubeData")
    .getOrCreate()
)

# Read the raw JSON dump with the "multiline" reader option enabled
raw_json_path = "/home/george/data_engineering/youtube-analytics-pipeline/data/raw/raw_youtube_data.json"
raw_df = spark.read.option("multiline", True).json(raw_json_path)

25/04/22 06:34:05 WARN Utils: Your hostname, DESKTOP-0MIQQS8 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/22 06:34:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/22 06:34:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Show the DataFrame repr to inspect the inferred column names and types
raw_df

DataFrame[contentDetails: struct<caption:string,definition:string,dimension:string,duration:string,licensedContent:boolean,projection:string>, etag: string, id: string, kind: string, snippet: struct<categoryId:string,channelId:string,channelTitle:string,defaultLanguage:string,description:string,liveBroadcastContent:string,localized:struct<description:string,title:string>,publishedAt:string,tags:array<string>,thumbnails:struct<default:struct<height:bigint,url:string,width:bigint>,high:struct<height:bigint,url:string,width:bigint>,maxres:struct<height:bigint,url:string,width:bigint>,medium:struct<height:bigint,url:string,width:bigint>,standard:struct<height:bigint,url:string,width:bigint>>,title:string>, statistics: struct<commentCount:string,favoriteCount:string,likeCount:string,viewCount:string>]

In [6]:
# Transform the data: flatten the nested payload into typed, snake_case columns
transformed_df = raw_df.select(
    col("id").alias("video_id"),
    col("snippet.title").alias("title"),
    col("snippet.description").alias("description"),
    to_timestamp(col("snippet.publishedAt")).alias("published_at"),
    # The statistics fields are strings in the raw schema. Cast them to 64-bit
    # "long": a 32-bit "int" cannot represent counts above 2,147,483,647.
    col("statistics.viewCount").cast("long").alias("view_count"),
    col("statistics.likeCount").cast("long").alias("like_count"),
    col("statistics.commentCount").cast("long").alias("comment_count"),
    # Kept as the raw ISO 8601 string here; converted to seconds in a later cell
    col("contentDetails.duration").alias("raw_duration"),
    col("snippet.tags").alias("tags"),
    col("snippet.categoryId").alias("category_id"),
    col("snippet.channelTitle").alias("channel_title"),
    # Calculate engagement rate later after checking if view_count is non-zero
    current_timestamp().alias("fetched_at")
)

transformed_df.show(truncate=False)

+-----------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Calculate engagement rate: (likes + comments) / views
from pyspark.sql.functions import coalesce, col, lit, when

# A null like_count or comment_count (field absent from the raw JSON) would
# make the whole sum null, so treat a missing count as 0. Rows whose
# view_count is null or not positive get 0.0 via the otherwise() branch.
transformed_df = transformed_df.withColumn(
    "engagement_rate",
    when(col("view_count") > 0,
         (coalesce(col("like_count"), lit(0)) + coalesce(col("comment_count"), lit(0)))
         / col("view_count")
    ).otherwise(0.0)
)

In [8]:
# Convert YouTube's raw_duration (an ISO 8601 duration string such as
# "PT3M45S" or "PT1H2M10S") into something usable in Spark (such as an INTERVAL or total seconds)
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Parse ISO 8601 duration to seconds
# Compiled once rather than on every call. Optional week and day designators
# may precede the "T" time part, e.g. "P1DT2H3M4S".
_ISO_DURATION_PATTERN = re.compile(
    r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)


def parse_duration_to_seconds(duration_str):
    """Convert an ISO 8601 duration string to a total number of seconds.

    Supports week, day, hour, minute and second designators with integer
    values (e.g. "PT3M45S", "PT1H2M10S", "P1DT2H"). Year and month
    designators are not supported and contribute nothing to the total.

    Args:
        duration_str: The duration text, or None.

    Returns:
        Total seconds as an int; 0 when the text does not start with a
        recognised duration; None when ``duration_str`` is None.
    """
    # Null column values reach the UDF as None; return null instead of raising.
    if duration_str is None:
        return None
    match = _ISO_DURATION_PATTERN.match(duration_str)
    if not match:
        return 0
    # Designators that are absent leave their group as None, which counts as 0.
    weeks, days, hours, minutes, seconds = (
        int(value) if value else 0 for value in match.groups()
    )
    return (
        weeks * 604800
        + days * 86400
        + hours * 3600
        + minutes * 60
        + seconds
    )

# Expose the parser to Spark as a UDF that returns integer seconds
parse_duration_udf = udf(parse_duration_to_seconds, returnType=IntegerType())

# Derive the numeric "duration" column, then discard the raw ISO 8601 string
transformed_df = (
    transformed_df
    .withColumn("duration", parse_duration_udf(col("raw_duration")))
    .drop("raw_duration")
)

In [9]:
import pandas as pd

In [10]:
# Lift pandas' display limits so wide frames and long strings are not truncated
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 1000

# Collect the Spark DataFrame into pandas and preview its first ten rows
pandas_df = transformed_df.toPandas()
display(pandas_df.head(10))

Unnamed: 0,video_id,title,description,published_at,view_count,like_count,comment_count,tags,category_id,channel_title,fetched_at,engagement_rate,duration
0,0tucSj6lJyE,Twanga Pepeta - Prince Indah ft. Phina & Cedo (Official Video) sms SKIZA 9845145 to 811,Track - Twanga Pepeta\nArtist - Prince Indah ft Phina\nAudio - Cedo\n\nStream/Download : https://tr.ee/Twanga_Pepeta_Prince_Indah_ft_Phina\n\n©EMC 2025,2025-04-02 10:06:18,525315,15778,1643,"[prince indah songs, Phina, prince Indah twanga pepeta, twanga pepeta, twwanga, pepeta, Phina songs, latest kenyan songs, prince indah new, Phina ft prince Indah, Prince Indah and Phina, luo songs latest, EMC music, simu ya nini prince indah, rembo by prince indah]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.033163,226
1,R0EKZbz6yro,Prince Indah ~ Simu Ya Nini(sms SKIZA 9844758 to 811),"Simu Ya Nini? is all about a lady who boasts to the man's friends and family that she has found a better partner who is responsible and caring. The man decides to move on, but later, the lady reaches out and sends messages hoping for a reunion. The man, however, now declines her advances\nTrack - Simu Ya Nini \nArtist - Prince Indah \nAudio - Wuod Fibi \nArtworks - Mainstream Image\n©EMC 2024",2024-12-10 15:44:21,1535226,9821,1331,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah.]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.007264,603
2,9PiJkT6jyH0,Prince Indah ~ Rembo(sms SKIZA 9844757 to 811),“Rembo” a Swahili name for beauty is a love song about a beautiful lady. She enters the scene and a man asks her if she shares his feelings for her after chatting. He promises to love her deeply if she commits as long as he lives.\nTrack - Rembo \nArtist - Prince Indah \nAudio - Wuod Fibi \nArtworks - Mainstream Image\n©EMC 2024,2024-12-10 14:30:06,2466582,16716,2291,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah.]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.007706,518
3,KFs2Un7JzvU,Prince Indah ~ Ka Manene(sms SKIZA 9844756 to 811),"A vibrant lady who desires to have everything for herself showers her partner with love before disappearing to socialize with other men. When she returns at the last moment, she is ready to settle down for marriage, but her partner imposes conditions and ultimatums, insisting that the marriage should be traditional. He seeks faithfulness and commitment.\n\nTrack - Ka Manene \nArtist - Prince Indah \nAudio - Wuod Fibi\nArtworks - Mainstream Image\n©EMC 2024",2024-12-10 10:00:06,699743,6870,1258,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah., up]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.011616,556
4,_-wxZEiS6H8,Malaika Musicals Festival 4th Edition,This is a recap of Malaika Festival Events Nairobi Edition @ UHURU GARDENS\n#princeindah #malaikafestival,2024-10-23 13:10:57,14434,335,21,"[prince indah songs, prince indah, Malaika Musicals, malaika festival, luo rhumba, Prince, Indah, prince indah live performance, Prince Indah today, Prince Indah latest]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.024664,1251
5,bMECgLgBvEg,Prince Indah - Nyar Jaduong (Official Video),Enjoy Prince Indah's first release of 2024 'Nyar Jaduong'.\nStream/Download - https://ziiki.media/NyarJaduong-PrinceIndah\nStream Puonj Mag Dak album - https://smartklix.com/PuonjMagDak\nListen to Prince Indah on Digital Platforms:\nSpotify - https://open.spotify.com/artist/72UZHvETWq3aV97cVxC5VS?si=rZWrJ-ukRxCu7SPjoX6U8w\nApple Music - https://beta.music.apple.com/us/artist/prince-indah/1526224571\nBoomplay - https://www.boomplay.com/artists/2873527?from=search\nAudiomack - https://audiomack.com/Indah\nDeezer - https://www.deezer.com/en/artist/104538912\nTidal - https://tidal.com/browse/artist/20953836\n\nChannel Administered by Ziiki Media. All Rights Reserved\n\n#PrinceIndah #NyarJaduong #Ohangla,2024-03-03 15:00:09,13061422,77338,8671,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah.]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.006585,518
6,c7gdO8V4VDE,Prince Indah - Nyar Jaduong (Official Trailer),Now Out! Stream Here - https://ziiki.media/NyarJaduong-PrinceIndah\nStream Puonj Mag Dak album - https://smartklix.com/PuonjMagDak\nListen to Prince Indah on Digital Platforms:\nSpotify - https://open.spotify.com/artist/72UZHvETWq3aV97cVxC5VS?si=rZWrJ-ukRxCu7SPjoX6U8w\nApple Music - https://beta.music.apple.com/us/artist/prince-indah/1526224571\nBoomplay - https://www.boomplay.com/artists/2873527?from=search\nAudiomack - https://audiomack.com/Indah\nDeezer - https://www.deezer.com/en/artist/104538912\nTidal - https://tidal.com/browse/artist/20953836\n\nChannel Administered by Ziiki Media. All Rights Reserved\n\n#PrinceIndah #NyarJaduong #Ohangla,2024-02-28 12:01:23,311803,3476,201,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah., Nyar Jaduong Prince Indah, Nyar Jaduong]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.011793,26
7,_NqNg4iSE2M,Prince Indah - Puonj Mag Dak Jukebox,"Just as the name suggests, PUONJ MAG DAK meaning 'living lessons' was adopted from 90% of the lyrics of about ¾ of the songs released on this album. \nThe bracket name 'FORMULA 5' comes out loudly with an incorporated collaboration song of 5 celebrated artists who came up with a composition of a NYAR MSEE track 10.\nOther than OSIEPE, JOGI and OGADA MEMBA songs; the remaining 8 songs are associated with love birds(couples/lovers) and the same nature of relationships. \nThe words used to convey the messages are lyrically lethal, soothing and ear friendly, enjoy listening! This stands to be @princeindah's 5th Album. \n\nListen to Prince Indah on Digital Platforms:\nSpotify - https://open.spotify.com/artist/72UZHvETWq3aV97cVxC5VS?si=rZWrJ-ukRxCu7SPjoX6U8w\nApple Music - https://beta.music.apple.com/us/artist/prince-indah/1526224571\nBoomplay - https://www.boomplay.com/artists/2873527?from=search\nAudiomack - https://audiomack.com/Indah\nDeezer - https://www.deezer.com/en/artist/104538912\nTidal - https://tidal.com/browse/artist/20953836\n\nChannel Administered by Ziiki Media. All Rights Reserved\n#PrinceIndah #Osiepe #PuonjMagDak",2024-02-19 20:20:07,540861,2749,219,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah., osiepe by prince indah, kenyan food]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.005488,5336
8,rOeMGVqtu6I,"MALAIKA MUSICALS FESTIVAL AT JOMO KENYATTA STADIUM, MAMBOLEO KISUMU.",,2023-09-22 15:09:31,550410,3707,410,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah.]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.00748,11712
9,sBdpjpgRWLw,Prince Indah - Mummy Chulo (Official Lyric Video),"Just as the name suggests, PUONJ MAG DAK meaning 'living lessons' was adopted from 90% of the lyrics of about ¾ of the songs released on this album. \nThe bracket name 'FORMULA 5' comes out loudly with an incorporated collaboration song of 5 celebrated artists who came up with a composition of a NYAR MSEE track 10.\nOther than OSIEPE, JOGI and OGADA MEMBA songs; the remaining 8 songs are associated with love birds(couples/lovers) and the same nature of relationships. \nThe words used to convey the messages are lyrically lethal, soothing and ear friendly, enjoy listening! This stands to be @princeindah's 5th Album. \n\nListen to Prince Indah on Digital Platforms:\nSpotify - https://open.spotify.com/artist/72UZHvETWq3aV97cVxC5VS?si=rZWrJ-ukRxCu7SPjoX6U8w\nApple Music - https://beta.music.apple.com/us/artist/prince-indah/1526224571\nBoomplay - https://www.boomplay.com/artists/2873527?from=search\nAudiomack - https://audiomack.com/Indah\nDeezer - https://www.deezer.com/en/artist/104538912\nTidal - https://tidal.com/browse/artist/20953836\n\nChannel Administered by Ziiki Media. All Rights Reserved",2023-05-10 20:21:49,1815454,7169,638,"[Kenya, Prince Indah, Emma Jalamo, Ohangla, Luo Rhumba, Luo Music, Kisumu, Prince Indah latest songs, Freddy Mopao Prince Indah, Weche Hera Prince Indah, Weche Singo, Maria Prince Indah, Zena Prince Indah, Opija Father, Zainabu, Sigand Luo Prince Indah, Washington Jakadel, Raila Odinga, Kenyan politics, NASA, ODM, Nyakisumo Part 2, Adeka Engineer Prince Indah, Angeli, Uchumi Prince Indah, Ken Soldier, Cynderella Prince Indah.]",10,"Prince Indah, OGW",2025-04-22 06:35:43.363852,0.0043,609
