In [1]:
#%pip install --upgrade google-api-python-client -q
#%pip install pymongo cassandra-driver -q

In [2]:
import os
import pyspark
import googleapiclient.discovery
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StringType
from pymongo import MongoClient
from cassandra.cluster import Cluster

# MINIO CONFIGURATION
s3_host = "minio"
s3_url = f"http://{s3_host}:9000"
s3_key = "minio"
s3_secret = "SU2orange!"
s3_bucket = "labe"

# CASSANDRA CONFIGURATION
cassandra_host = "cassandra"

# MONGO CONFIGURATION
mongo_uri = "mongodb://admin:mongopw@mongo:27017/demo.feedback?authSource=admin"

# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.1.2")\
    .config("spark.hadoop.fs.s3a.endpoint", s3_url) \
    .config("spark.hadoop.fs.s3a.access.key", s3_key) \
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret) \
    .config("spark.hadoop.fs.s3a.fast.upload", True) \
    .config("spark.hadoop.fs.s3a.path.style.access", True) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.mongodb.input.uri", mongo_uri) \
    .config("spark.mongodb.output.uri", mongo_uri) \
    .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")\
    .config("spark.cassandra.connection.host", cassandra_host) \
    .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")\
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")
print("Done")



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
com.datastax.spark#spark-cassandra-connector-assembly_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-60e6f61f-2b44-4697-adeb-53bb46c19688;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector-assembly_2.12;3.1.0 in central
:: resolution report :: resolve 2163ms :: artifacts dl 22ms
	:: modules in use:
	com.datastax.spark#spark-cassandra-connector-assembly_2.12;3.1.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:

Done


In [3]:
#MONGO

API_KEY = "AIzaSyDsMwmQeItUE4T4Stzq6mYTxelrdOaUL_8"

youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

# Use Youtube API to get video title from video id
def get_video_title(video_id):
    try:

        response = youtube.videos().list(
            part='snippet',
            id=video_id
        ).execute()

        # Get video title from response
        video_title = response['items'][0]['snippet']['title']

        return video_title

    except Exception as e:
        print(f"Error fetching video title for {video_id}: {str(e)}")
        return None


# API call to Youtube to get 100 of the most recent comments and related data from a video
def main(video_id):
    
    max_results = 100
    comment_list = []

    try:
        # Get video title
        video_title = get_video_title(video_id)

        # Loop through pages to get 100 comments (only does about 20 at a time)
        while len(comment_list) < max_results:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                maxResults=min(100, max_results - len(comment_list)),
                textFormat="plainText",
                videoId=video_id,
                pageToken=None if not comment_list else comment_list[-1].get("nextPageToken"),
                prettyPrint=True
            )

            # Send request
            response = request.execute()

            # Iterate through comments and extract relevant information
            comment_list.extend(
                {
                    "_id": item["id"],
                    "video_id": item["snippet"]["videoId"],
                    "video_title": video_title,
                    "author_display_name": item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"],
                    "text_original": item["snippet"]["topLevelComment"]["snippet"]["textOriginal"],
                    "like_count": item["snippet"]["topLevelComment"]["snippet"]["likeCount"],
                    "repliesCount": item["snippet"]["totalReplyCount"],
                    "datetime_posted": item["snippet"]["topLevelComment"]["snippet"]["publishedAt"],
                }
                for item in response.get("items", [])
            )

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    return comment_list



# Youtube ids of videos we want to get comments from
video_list = ["gir8BEqAutk", "mvVBuG4IOW4", "lUvBk4owRNU"]

# Define a lambda function to process each video_id, NO FOR LOOP!
process_video = lambda video_id: spark.createDataFrame(main(video_id)) \
    .write.format("mongo") \
    .mode("append") \
    .option("replaceDocument", "false") \
    .option("database", "youtube_comments") \
    .option("collection", "video_comments") \
    .save()

# Apply function to each video_id
list(map(process_video, video_list))

print("Done")

                                                                                

Done


                                                                                

In [4]:
df = spark.read \
    .format("mongo") \
    .option("uri", "mongodb://admin:mongopw@mongo:27017/") \
    .option("database", "youtube_comments") \
    .option("collection", "video_comments") \
    .option("authSource", "admin") \
    .load()

print(f"{df.count()} comments accross {len(video_list)} videos.")

                                                                                

350 comments accross 3 videos.


In [5]:
# CASSANDRA

# CQL statements
drop_table_cassandra_sql = "DROP TABLE IF EXISTS youtube_comments.video_comments;"

create_table_cassandra_sql = '''
    CREATE TABLE youtube_comments.video_comments
    (
        id text,
        author_display_name text,
        datetime_posted timestamp,
        like_count bigint,
        repliescount bigint,
        text_original text,
        video_id text,
        video_title text,
        PRIMARY KEY (video_title, datetime_posted)
    );
'''

# Cassandra connection setup
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()

    # Use keyspace
    session.execute("USE youtube_comments;")   

    # Drop the table
    session.execute(drop_table_cassandra_sql)

    # Create the table
    session.execute(create_table_cassandra_sql)

# Had to rename cols because Cassandra will not have a field starting with an underscore
df_cassandra = df.toDF(
    "id",
    "author_display_name",
    "datetime_posted",
    "like_count",
    "repliescount",
    "text_original",
    "video_id",
    "video_title"
)
cassandra_options = {
    "table": "video_comments",
    "keyspace": "youtube_comments",
    "cluster": cassandra_host
}

# Write data from Spark DataFrame to Cassandra table
df_cassandra.write \
    .format("org.apache.spark.sql.cassandra") \
    .mode("append") \
    .options(**cassandra_options) \
    .save()

print("Done")

[Stage 6:>                                                          (0 + 1) / 1]

Done


                                                                                

In [6]:
cassandra_comments = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(**cassandra_options) \
    .load()

cassandra_comments.createOrReplaceTempView("blank_space_comments")
query1 = """
    SELECT 
        video_title,
        datetime_posted,
        text_original
    FROM blank_space_comments
    WHERE video_title = "Taylor Swift - Blank Space (Taylor's Version) (Lyric Video)"
    ORDER BY datetime_posted DESC;
    """

query2 = """
    SELECT 
        video_title,
        like_count,
        text_original
    FROM blank_space_comments
    WHERE video_title = "Taylor Swift - Blank Space (Taylor's Version) (Lyric Video)"
    ORDER BY like_count DESC;
    """
blank_space1 = spark.sql(query1)

blank_space2 = spark.sql(query2)

# The query executes on cassandra, not spark.
spark.sql(query1).explain()

== Physical Plan ==
*(1) Sort [datetime_posted#193 DESC NULLS LAST], true, 0
+- *(1) Project [video_title#192, datetime_posted#193, text_original#198]
   +- BatchScan[video_title#192, datetime_posted#193, text_original#198] Cassandra Scan: youtube_comments.video_comments
 - Cassandra Filters: [["video_title" = ?, Taylor Swift - Blank Space (Taylor's Version) (Lyric Video)]]
 - Requested Columns: [video_title,datetime_posted,text_original]




In [7]:
blank_space1.toPandas()

                                                                                

Unnamed: 0,video_title,datetime_posted,text_original
0,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-21 23:50:58,Ya me acostumbré al antigue al escuchar esto s...
1,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-21 23:06:38,While you ask for 100 bottles of expensive cha...
2,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-21 23:01:53,wow lo mejor
3,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-21 21:48:45,The only
4,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-21 21:13:10,Only real swiftys did the Sidney chant
...,...,...,...
117,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-15 01:46:37,R.I.P “Starbucks lovers” 2014-2023
118,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-14 22:44:36,"te amamos lindota, MI GENTE LATINO"
119,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-14 22:44:22,and you love the gameeee
120,Taylor Swift - Blank Space (Taylor's Version) ...,2023-11-14 22:44:09,so it's gonna be forever


In [8]:
blank_space2.toPandas()

Unnamed: 0,video_title,like_count,text_original
0,Taylor Swift - Blank Space (Taylor's Version) ...,3,"Haha she changed the "" Starbucks lovers"" 😂❤"
1,Taylor Swift - Blank Space (Taylor's Version) ...,3,I love the spareness of this version. Taylor's...
2,Taylor Swift - Blank Space (Taylor's Version) ...,3,🙌🏽🙌🏽🙌🏽🙌🏽🙌🏽🙌🏽✨✨✨✨
3,Taylor Swift - Blank Space (Taylor's Version) ...,2,"te amamos lindota, MI GENTE LATINO"
4,Taylor Swift - Blank Space (Taylor's Version) ...,2,R.I.P “Starbucks lovers” 2014-2023
...,...,...,...
117,Taylor Swift - Blank Space (Taylor's Version) ...,0,Only real swiftys did the Sidney chant
118,Taylor Swift - Blank Space (Taylor's Version) ...,0,The only
119,Taylor Swift - Blank Space (Taylor's Version) ...,0,wow lo mejor
120,Taylor Swift - Blank Space (Taylor's Version) ...,0,While you ask for 100 bottles of expensive cha...
