In [None]:
# Import needed libraries
import pandas as pd

### This is for loading the data

# The crawl files do not have a header row, so the column names are supplied here.
# prepare_data() applies these names after it has merged the related-ID fields into
# the single 'relatedIDs' column.
column_names = ['videoID', 'uploader', 'age', 'category', 'length', 'views', 'rate', 'ratings', 'comments', 'relatedIDs']

# Load YouTube data for each depth file(we have 4 depths in total starting from 0)
def load_depth(path, crawl_date, depth):
    """Read one tab-separated depth file into a DataFrame.

    Args:
        path: Location of the depth file.
        crawl_date: Value written to the 'crawl_date' column of every row.
        depth: Value written to the 'depth' column of every row.

    Returns:
        A DataFrame with one row per kept line (positional columns holding the
        split fields) plus the 'crawl_date' and 'depth' columns.
    """
    # The file is parsed by hand instead of with a pandas reader: according to the
    # original authors, pandas sized the table from a short first row and then
    # failed on the wider rows that followed.
    with open(path, 'r') as handle:
        # A line that splits into fewer than 2 fields carries nothing useful, so it is dropped.
        kept_rows = [
            fields
            for fields in (raw_line.strip().split('\t') for raw_line in handle)
            if len(fields) >= 2
        ]
    # Rows may have different lengths; pandas pads the shorter ones with missing values.
    frame = pd.DataFrame(kept_rows)
    frame['crawl_date'] = crawl_date
    frame['depth'] = depth
    return frame

# Load all depth files for a single crawl
def load_crawl(path, crawl_date):
    """Load the four depth files (0.txt .. 3.txt) of one crawl into a single DataFrame.

    Args:
        path: Directory that holds the depth files of the crawl.
        crawl_date: Crawl date forwarded to load_depth for every file.

    Returns:
        The row-wise concatenation of the four depth frames with a fresh 0..n-1 index.
    """
    # The file number doubles as the depth level passed to load_depth.
    per_depth_frames = [
        load_depth(f"{path}/{level}.txt", crawl_date, level)
        for level in range(4)
    ]
    stacked = pd.concat(per_depth_frames, axis=0)
    return stacked.reset_index(drop = True)

In [None]:
### This is for cleaning and transformation the data (Data Preparation)
def prepare_data(df):
    """Clean a raw crawl frame and give it the final, named column layout.

    The raw frame holds positional field columns followed (in any position) by the
    'crawl_date' and 'depth' bookkeeping columns. The first 9 raw fields become the
    named video attributes; raw fields 10..29 (up to 20 related video IDs, per the
    dataset description) are merged into one comma-separated 'relatedIDs' string.

    Args:
        df: Raw frame as produced by load_depth / load_crawl. It is not modified.

    Returns:
        A new DataFrame with the columns listed in `column_names` followed by
        'crawl_date' and 'depth'. Numeric columns are converted with pd.to_numeric
        and missing 'rate' values are replaced by the column mean.

    Raises:
        ValueError: If a numeric column holds a value pd.to_numeric cannot parse, or
            if the frame does not provide the 9 leading raw fields.
    """
    meta_cols = ['crawl_date', 'depth']
    # Raw fields are found by excluding the bookkeeping columns instead of slicing by
    # absolute position. A positional slice picks up 'crawl_date'/'depth' as "related
    # IDs" whenever the frame has fewer than 29 raw fields.
    raw_cols = [label for label in df.columns if label not in meta_cols]
    base_cols = raw_cols[:9]
    related_cols = raw_cols[9:29]

    # Short rows are padded with missing values. Those (and blank fields) are skipped so
    # that the joined string never contains placeholder text such as "None" or "nan".
    related_values = df[related_cols].to_numpy(dtype=object)
    combined_related_ids = [
        ','.join(str(value) for value in row if pd.notna(value) and str(value).strip())
        for row in related_values
    ]

    # Work on a copy so the caller's frame is left untouched.
    depth_data = df[base_cols].copy()
    depth_data['relatedIDs'] = combined_related_ids
    for meta in meta_cols:
        depth_data[meta] = df[meta]
    # Add the column names to each column
    depth_data.columns = column_names + meta_cols

    # Remove the leading/trailing whitespace from string columns
    depth_data['uploader'] = depth_data['uploader'].str.strip()
    depth_data['category'] = depth_data['category'].str.strip()

    # Convert these columns to numeric
    numeric_columns = ['age', 'length', 'views', 'rate', 'ratings', 'comments']
    for col in numeric_columns:
        depth_data[col] = pd.to_numeric(depth_data[col])

    # Fill in the missing 'rate' values with the mean of the column
    depth_data['rate'] = depth_data['rate'].fillna(depth_data['rate'].mean())
    return depth_data

In [None]:
# Optional smoke test on a single depth file (kept commented out).
# NOTE(review): the earlier version of this snippet called `clean_data`, which is not
# defined in this notebook; `prepare_data` is the cleaning function defined above.
# testing = load_depth("data/080331/2.txt", '2008-03-31', 2)
# clean = prepare_data(testing)
# print(clean)

# Load all the crawls in the given dataset (one directory per crawl date)
crawl1 = load_crawl('data/080327', '2008-03-27')
crawl2 = load_crawl('data/080329', '2008-03-29')
crawl3 = load_crawl('data/080331', '2008-03-31')
# Combine all crawls into a single dataframe with a fresh 0..n-1 index
combined_data = pd.concat([crawl1, crawl2, crawl3], axis=0).reset_index(drop=True)
# Prepare the data (clean & transform); the name is rebound to the cleaned frame
combined_data = prepare_data(combined_data)
print(combined_data.head())

In [None]:
# Data ingestion and connection with MongoDB
from pymongo import MongoClient
# Connect to the local MongoDB database that we have created
uri = 'mongodb://localhost:27017/'
client = MongoClient(uri)
db = client["youtubedb"]
collection = db["youtube_vids"]
# Empty the collection first so that running this cell more than once does not
# store the same videos twice
collection.delete_many({})
# Convert the DataFrame into a list of dictionaries (one dictionary per row)
insert_data = combined_data.to_dict('records')
# Insert the data into the collection
# NOTE(review): `insert_data` is reused by the timed ingestion cell further down
collection.insert_many(insert_data)

In [None]:
# Validation Queries
# Sanity checks that the ingested collection looks as expected.

# Query 1: Validate Document Count
document_count = collection.count_documents({})
print("Total documents:", document_count)

# Query 2: Check Category Distribution (number of documents per category value)
category_counts = collection.aggregate([
    { "$group": { "_id": "$category", "count": { "$sum": 1 } } }
])
print("Category distribution:")
for category in category_counts:
    print(category)

# Query 3: Check Average and Max Views (a single group over the whole collection)
view_stats = collection.aggregate([
    { "$group": { "_id": None, "avgViews": { "$avg": "$views" }, "maxViews": { "$max": "$views" } } }
])
print("View statistics:")
for stats in view_stats:
    print(stats)

# Query 4: Validate Related IDs Format (show up to 5 documents whose relatedIDs is a string)
related_id_check = collection.find({"relatedIDs": {"$exists": True, "$type": "string"}}).limit(5)
print("Sample related IDs:")
for doc in related_id_check:
    print(doc['relatedIDs'])


In [None]:
import time

# Clear the collection to avoid duplicate entries
collection.delete_many({})
print("Collection cleared.")

# Start timer. perf_counter() is monotonic and high-resolution, so the measured
# duration cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Ingest data (insert many records)
collection.insert_many(insert_data)

# End timer
end_time = time.perf_counter()

# Calculate and print ingestion time
ingestion_time = end_time - start_time
print(f"Data ingestion took {ingestion_time:.2f} seconds")


In [None]:
import time

# Measure execution time for document count.
# perf_counter() is the monotonic, high-resolution clock intended for short durations.
start_time = time.perf_counter()
document_count = collection.count_documents({})
end_time = time.perf_counter()

# Print results
print("Total documents:", document_count)
print(f"Document count query took {end_time - start_time:.4f} seconds")


In [None]:
# Measure execution time for category distribution query.
# perf_counter() is the monotonic, high-resolution clock intended for short durations.
start_time = time.perf_counter()
category_counts = collection.aggregate([
    { "$group": { "_id": "$category", "count": { "$sum": 1 } } }
])

# Collect results inside the timed section to ensure the query executes fully
categories = list(category_counts)
end_time = time.perf_counter()

# Print results and time
print("Category distribution:")
for category in categories:
    print(category)
print(f"Category distribution query took {end_time - start_time:.4f} seconds")


In [None]:
# Measure execution time for view statistics query.
# perf_counter() is the monotonic, high-resolution clock intended for short durations.
start_time = time.perf_counter()
view_stats = collection.aggregate([
    { "$group": { "_id": None, "avgViews": { "$avg": "$views" }, "maxViews": { "$max": "$views" } } }
])

# Collect results inside the timed section to ensure the query executes fully
view_stats_result = list(view_stats)
end_time = time.perf_counter()

# Print results and time
print("View statistics:", view_stats_result)
print(f"View statistics query took {end_time - start_time:.4f} seconds")


In [None]:
# Measure execution time for related IDs format check.
# perf_counter() is the monotonic, high-resolution clock intended for short durations.
start_time = time.perf_counter()
related_id_check = collection.find({"relatedIDs": {"$exists": True, "$type": "string"}}).limit(5)

# Collect results inside the timed section to ensure the query executes fully
related_ids = list(related_id_check)
end_time = time.perf_counter()

# Print results and time
print("Sample related IDs:")
for doc in related_ids:
    print(doc['relatedIDs'])
print(f"Related IDs check query took {end_time - start_time:.4f} seconds")


In [None]:
# Running aggregation functions (from aggregations.py) and displaying results

from pymongo import MongoClient
from aggregations import (
    calculate_view_statistics,
    calculate_avg_rating_by_category,
    calculate_total_comments_and_ratings_by_category,
    calculate_avg_length_by_category,
    avg_views_per_video_by_category,
    most_viewed_videos,
    most_viewed_video_in_each_category,
    top_uploader_by_views,
    top_commented_videos
)

# MongoDB connection setup
uri = 'mongodb://localhost:27017/'
client = MongoClient(uri)
db = client["youtubedb"]
collection = db["youtube_vids"]

# NOTE(review): every helper below comes from aggregations.py, which is not shown here.
# The field names read from their results (avgViews, totalViews, ...) and the number and
# order of columns assumed when renaming are taken on trust -- confirm against that module.
# `pd` and `display` are expected to be available from earlier cells / the notebook runtime.

# 1. Platform-wide View Statistics
# Only the first result document is used; an empty result would raise IndexError here.
view_stats = list(calculate_view_statistics(collection))[0]
print("\nPlatform-Wide View Statistics:")
print(f"  - Average Views: {view_stats['avgViews']:.2f}")
print(f"  - Maximum Views: {view_stats['maxViews']}")
print(f"  - Minimum Views: {view_stats['minViews']}")

# 2. Average Rating by Category
avg_rating_by_category = list(calculate_avg_rating_by_category(collection))
rating_df = pd.DataFrame(avg_rating_by_category)
# Renaming by position assumes exactly two result columns, category first -- TODO confirm
rating_df.columns = ["Category", "Average Rating"]
rating_df["Average Rating"] = rating_df["Average Rating"].round(2)
print("\nAverage Rating by Category:")
display(rating_df)

# 3. Total Comments and Ratings by Category
comments_ratings_by_category = list(calculate_total_comments_and_ratings_by_category(collection))
comments_ratings_df = pd.DataFrame(comments_ratings_by_category)
# Renaming by position assumes exactly three result columns in this order -- TODO confirm
comments_ratings_df.columns = ["Category", "Total Comments", "Total Ratings"]
print("\nTotal Comments and Ratings by Category:")
display(comments_ratings_df)

# 4. Average Video Length by Category
avg_length_by_category = list(calculate_avg_length_by_category(collection))
length_df = pd.DataFrame(avg_length_by_category)
length_df.columns = ["Category", "Average Length"]
print("\nAverage Video Length by Category:")
display(length_df)

# 5. Average Views per Video by Category
avg_views_category = list(avg_views_per_video_by_category(collection))
avg_views_df = pd.DataFrame(avg_views_category)
avg_views_df.columns = ["Category", "Average Views"]
print("\nAverage Views per Video by Category:")
display(avg_views_df)

# 6. Most Viewed Videos on the Platform
most_viewed = list(most_viewed_videos(collection, top_n=5))
most_viewed_df = pd.DataFrame(most_viewed)
print("\nTop 5 Most Viewed Videos on the Platform:")
display(most_viewed_df)

# 7. Most Viewed Video in Each Category
most_viewed_per_category = list(most_viewed_video_in_each_category(collection))
most_viewed_category_df = pd.DataFrame(most_viewed_per_category)
print("\nMost Viewed Videos in Each Category:")
display(most_viewed_category_df)

# 8. Top Uploader by Total Views
# Only the first result document is used; an empty result would raise IndexError here.
top_uploader = list(top_uploader_by_views(collection))[0]
print("\nTop Uploader by Total Views:")
print(f"  - Uploader: {top_uploader['_id']}")
print(f"  - Total Views: {top_uploader['totalViews']}")

# 9. Top 5 Most Commented Videos
# NOTE(review): the heading says "Top 5" but no count is passed here -- presumably the
# helper defaults to 5; verify.
top_commented = list(top_commented_videos(collection))
top_commented_df = pd.DataFrame(top_commented)
print("\nTop 5 Most Commented Videos:")
display(top_commented_df)


In [None]:
import time

from graphframes import GraphFrame
from pyspark.sql.functions import col, explode, split
from pyspark.sql import SparkSession
# PageRank for influence analysis
spark = (SparkSession.builder
         .appName("YouTubeAnalyzer")
         .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0,"
                 "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
         # Connector 10.x (the version requested above) takes its URIs from the
         # read/write "connection.uri" keys. The input.uri/output.uri keys used before
         # belong to the previous connector generation.
         .config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/youtubedb.youtube_vids")
         .config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/youtubedb.pagerank_results")
         # The configs below prevent the out of memory error that our team ran into and also increase efficiency
         .config("spark.driver.memory", "8g")
         .config("spark.executor.memory", "8g")
         .config("spark.executor.instances", "3")
         .config("spark.executor.cores", "7")
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .getOrCreate())
# Read data from Mongodb
yt_data = (spark.read.format("mongodb")
           .option("database", "youtubedb")
           .option("collection", "youtube_vids")
           .load())

# Check to see if the pagerank results already exist in the database
pagerank_results = (spark.read.format("mongodb")
                    .option("database", "youtubedb")
                    .option("collection", "pagerank_results")
                    .load())
# Just load the results and display without having to rerun the whole PageRank algorithm if it already exists in database
if pagerank_results.count() > 0:
    print("PageRank results already exist in Database. Loading results...")
    sample_results = pagerank_results.select("id", "uploader", "category", "pagerank", "comments", "ratings")

else:
    # Repartition the DF into 128 partitions (for parallel processing)
    # Our team decided to store repartitioned result in memory to prevent re-computation or reloading
    # (we believe this can improve performance)
    yt_data_full = yt_data.repartition(128).cache()
    # Create dataframe for vertices (videos)
    # NOTE(review): a video that appears in more than one crawl yields several rows with
    # the same id -- confirm whether the vertices should be de-duplicated first.
    vertices = yt_data_full.select(
        col("videoID").alias("id"),
        col("uploader"),
        col("category"),
        col("views"),
        col("rate"),
        col("ratings"),
        col("comments"),
        col("depth")
    )
    # Create dataframe for edges (related connections) and have a new column named dst
    # Explode will create separate row for each related videoID in dst
    edges = (yt_data_full.select(
        col("videoID").alias("src"),
        split(col("relatedIDs"), ",").alias("dst")
    ).withColumn("dst", explode(col("dst")))
        # Blank entries and stringified missing values are not video IDs, so they must
        # not become edge destinations.
        .filter(~col("dst").isin("", "None", "nan")))
    # Create GraphFrame
    vid_graph = GraphFrame(vertices, edges)
    # PageRank algorithm
    # resetProbability controls how likely a random surfer will jump to a random page or node
    # Since link structure is important, our team decided to use a low resetProbability
    # perf_counter() is the monotonic clock meant for measuring elapsed time.
    start_time = time.perf_counter()
    influences = vid_graph.pageRank(resetProbability = 0.15, tol=0.05)
    end_time = time.perf_counter()
    # Show total time it takes to apply pagerank algorithm on the whole dataset
    execution_time = end_time - start_time
    print(f"Execution time for applying PageRank on the full dataset: {execution_time}s")
    # Save the PageRank results back to MongoDB
    (influences.vertices.write.format("mongodb").mode("overwrite")
        .option("database", "youtubedb")
        .option("collection", "pagerank_results").save())
    # The following cells query `pagerank_results`. The DataFrame bound to that name
    # above was loaded while the collection was still empty, so it cannot be relied on
    # to expose the result columns; point the name at the fresh scores instead.
    pagerank_results = influences.vertices
    # Display sample of the results
    sample_results = pagerank_results.select("id", "uploader", "category", "pagerank", "comments", "ratings")

sample_results.show(1000)

In [None]:
# Ten videos with the lowest PageRank scores: an ascending sort puts the smallest
# scores first, then only the first ten rows are kept.
lowest_pagerank_videos = pagerank_results.select(
    "id", "uploader", "category", "pagerank", "comments", "ratings", "views"
).orderBy("pagerank", ascending=True).limit(10)
print("Top 10 videos with the lowest PageRank scores:")
lowest_pagerank_videos.show()

As the output above shows, the 10 YouTube videos with the lowest PageRank scores all have the same or nearly the same score (around 0.34). This is expected: these videos lack quality inbound links, and they also have few comments, ratings and views (PageRank itself is computed from the link structure only, so the low engagement accompanies the low score rather than causing it). Moreover, our team uses tol = 0.05, which means that the algorithm stops iterating once the changes in PageRank values fall below the threshold of 0.05, causing the low-scoring videos to converge to similar scores.

In [None]:
# Ten videos with the highest PageRank scores: a descending sort puts the largest
# scores first, then only the first ten rows are kept.
highest_pagerank_videos = pagerank_results.select(
    "id", "uploader", "category", "pagerank", "comments", "ratings", "views"
).orderBy("pagerank", ascending=False).limit(10)
print("Top 10 videos with the highest PageRank scores:")
highest_pagerank_videos.show()
# spark.stop()

In [None]:
from pymongo import MongoClient
from collections import Counter

# Alternative connection on port 27018, kept for reference:
# client = MongoClient("mongodb://localhost:27018/")
client = MongoClient("mongodb://localhost:27017/")

db = client["youtubedb"]
collection = db["youtube_vids"]

# Number of results returned by the "top k" queries in the cells below
k = 10

def top_k_categories(collection, k):
    """Print the k categories that contain the most videos.

    Args:
        collection: MongoDB collection whose documents carry a 'category' field.
        k: Maximum number of categories to print.
    """
    # Count the documents per category, order by that count (largest first), keep k.
    count_per_category = {"$group": {"_id": "$category", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}
    keep_k = {"$limit": k}
    ranked = list(collection.aggregate([count_per_category, largest_first, keep_k]))

    print("Top categories:")
    for doc in ranked:
        print(f"{doc['_id']}: {doc['count']}")
    print("\n")

# Print the top-k categories for the collection and k configured above
top_k_categories(collection, k)

# The lines below appear to be output recorded from an earlier run (k = 10):
# Top categories:
# Entertainment 162044
# Music 156208
# Comedy 61113
# People & Blogs 56036
# Film & Animation 51459
# Sports 48265
# News & Politics 30913
# Autos & Vehicles 19237
# Howto & Style 18031
# Pets & Animals 11325

def top_k_categories_gui(collection, k):
    """Return the k categories that contain the most videos (no printing).

    Args:
        collection: MongoDB collection whose documents carry a 'category' field.
        k: Maximum number of categories to return.

    Returns:
        A list of result documents with '_id' (the category) and 'count', ordered
        by count from largest to smallest.
    """
    stages = []
    stages.append({"$group": {"_id": "$category", "count": {"$sum": 1}}})
    stages.append({"$sort": {"count": -1}})
    stages.append({"$limit": k})
    return list(collection.aggregate(stages))

In [None]:
#Top k rated videos
def top_k_rated_videos():
    """Print the ID and rate of the k highest-rated videos.

    Reads the module-level `collection` and `k`.
    """
    print("Top rated videos:")
    # Highest 'rate' first, capped at k documents.
    cursor = collection.find()
    cursor = cursor.sort("rate", -1)
    cursor = cursor.limit(k)
    for doc in cursor:
        print(doc["videoID"], doc["rate"])
    print("\n")

# Print the top-k rated videos using the module-level `collection` and `k`
top_k_rated_videos()

def top_k_rated_videos_gui(collection=None, k=None):
    """Return a cursor over the k highest-rated videos.

    The GUI calls this helper as `top_k_rated_videos_gui(collection, k)`, like the other
    `*_gui` helpers, so both values are accepted as parameters. A call without
    arguments keeps working and falls back to the module-level `collection` and `k`.

    Args:
        collection: MongoDB collection to query; defaults to the module-level `collection`.
        k: Maximum number of videos; defaults to the module-level `k`.

    Returns:
        The cursor sorted by 'rate' (highest first) and limited to k documents.
    """
    # The parameters shadow the globals of the same name, so the fallback values are
    # looked up explicitly at call time.
    if collection is None:
        collection = globals()["collection"]
    if k is None:
        k = globals()["k"]
    top_rated_videos = collection.find().sort("rate", -1).limit(k)
    return top_rated_videos

In [None]:
#Top k viewed videos
def top_k_viewed_videos(collection, k):
    """Print the ID and view count of the k most viewed videos.

    Args:
        collection: MongoDB collection holding the video documents.
        k: Maximum number of videos to print.
    """
    print("Top viewed videos:")
    # Largest 'views' first, capped at k documents.
    cursor = collection.find()
    cursor = cursor.sort("views", -1)
    cursor = cursor.limit(k)
    for doc in cursor:
        print(doc["videoID"], doc["views"])
    print("\n")

# Print the top-k viewed videos for the collection and k configured earlier
top_k_viewed_videos(collection, k)

def top_k_viewed_videos_gui(collection, k):
    """Return a cursor over the k most viewed videos (no printing).

    Args:
        collection: MongoDB collection holding the video documents.
        k: Maximum number of videos.

    Returns:
        The cursor sorted by 'views' (largest first) and limited to k documents.
    """
    by_views_desc = collection.find().sort("views", -1)
    return by_views_desc.limit(k)

In [None]:
#Range queries

# Find videos in a specific category with a duration between t1 and t2 seconds.
# These three values are the inputs of the example call below; the bounds are inclusive
# because the query uses $gte / $lte.
category = "Comedy" 
t1 = 700 
t2 = 1000 

def videos_in_length_range_by_category(category, t1, t2):
    """Print every video of one category whose length lies in [t1, t2] seconds.

    Reads the module-level `collection`.

    Args:
        category: Exact category value to match.
        t1: Lower length bound in seconds (inclusive).
        t2: Upper length bound in seconds (inclusive).
    """
    length_bounds = {"$gte": t1, "$lte": t2}  # Range query for length
    category_query = {"category": category, "length": length_bounds}

    print(f"Videos in category {category} with duration between {t1} and {t2} seconds:")
    for match in collection.find(category_query):
        print(f"Video ID: {match['videoID']}, Category: {match['category']}, Length: {match['length']}")
    print("\n")

# Run the category + length range query with the example values set above
videos_in_length_range_by_category(category, t1, t2)

def videos_in_length_range_by_category_gui(collection, category, t1, t2):
    """Return every video of one category whose length lies in [t1, t2] seconds.

    Args:
        collection: MongoDB collection holding the video documents.
        category: Exact category value to match.
        t1: Lower length bound in seconds (inclusive).
        t2: Upper length bound in seconds (inclusive).

    Returns:
        A list with all matching documents.
    """
    matches = collection.find(
        {"category": category, "length": {"$gte": t1, "$lte": t2}}
    )
    return [video for video in matches]

In [None]:
# Length bounds (seconds, inclusive) for the category-independent range query below
t1 = 500
t2 = 1000 
def get_videos_in_length_range(t1, t2):
    """Print every video whose length lies in [t1, t2] seconds, whatever its category.

    Reads the module-level `collection`.

    Args:
        t1: Lower length bound in seconds (inclusive).
        t2: Upper length bound in seconds (inclusive).
    """
    length_bounds = {"$gte": t1, "$lte": t2}
    print(f"All videos with duration between {t1} and {t2} seconds:")
    for match in collection.find({"length": length_bounds}):
        print(f"Video ID: {match['videoID']}, Category: {match['category']}, Length: {match['length']}")

# Run the length range query with the bounds set above
get_videos_in_length_range(t1, t2)

def get_videos_in_length_range_gui(collection, t1, t2):
    """Return every video whose length lies in [t1, t2] seconds, whatever its category.

    Args:
        collection: MongoDB collection holding the video documents.
        t1: Lower length bound in seconds (inclusive).
        t2: Upper length bound in seconds (inclusive).

    Returns:
        A list with all matching documents.
    """
    matches = collection.find({"length": {"$gte": t1, "$lte": t2}})
    return [video for video in matches]




### GUI and Visualization

In [None]:
# Run tkinter GUI by using 'python GUI.py' command in terminal
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd
from matplotlib import pyplot as plt
from pymongo import MongoClient
import time
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

# MongoDB connection setup
uri = 'mongodb://localhost:27017/'
client = MongoClient(uri)
db = client["youtubedb"]
# Video documents used by the aggregation and top-k analyses
collection = db["youtube_vids"]

# PageRank scores per video, used by the PageRank analyses and plots
pagerank_collection = db["pagerank_results"]

# Helper function to measure execution time and display results
def execute_and_time(func, *args, description=""):
    """Call `func(*args)`, materialise its result as a list and time the whole step.

    Args:
        func: Callable that returns an iterable (for example a cursor or a list).
        *args: Positional arguments forwarded to `func`.
        description: Accepted for backward compatibility; it is not used.

    Returns:
        A `(result, execution_time)` tuple: the list built from the iterable and the
        elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the wall
    # clock and can jump when the system time is adjusted.
    start_time = time.perf_counter()
    # list() is part of the timed section so that lazy cursors are fully consumed.
    result = list(func(*args))
    execution_time = time.perf_counter() - start_time
    return result, execution_time

# Get page rank data
def get_pagerank(collection):
    """Return the 20 documents with the highest 'pagerank' value.

    Args:
        collection: MongoDB collection holding the PageRank results.

    Returns:
        A list of at most 20 documents, highest PageRank first.
    """
    ranked_cursor = collection.find().sort("pagerank", -1).limit(20)
    return [vid for vid in ranked_cursor]

# Clear existing plot
def clear_plot_area(plot_frame):
    """Destroy every child widget of `plot_frame` so that a new plot can be placed in it.

    Args:
        plot_frame: Container widget whose children are removed.
    """
    stale_widgets = plot_frame.winfo_children()
    for stale in stale_widgets:
        stale.destroy()

# Plot PageRank & category
def plot_pagerank(top_vids, plot_frame):
    """Draw a bar chart of PageRank score per category inside `plot_frame`.

    Args:
        top_vids: Documents that each provide a 'category' and a 'pagerank' value.
        plot_frame: Tk container that receives the chart; its previous content is removed.
    """
    # Local import: the figure class is only needed by the embedded plots.
    from matplotlib.figure import Figure

    clear_plot_area(plot_frame)
    # Get data for visualization
    top_categories = [vid["category"] for vid in top_vids]
    top_scores = [vid["pagerank"] for vid in top_vids]

    # A standalone Figure is not registered with pyplot. Figures created through
    # plt.subplots() stay alive until they are explicitly closed, so every analysis run
    # used to leave one more figure in memory.
    fig = Figure(figsize=(8, 4))
    axes = fig.add_subplot(111)

    # Top 20 videos
    # NOTE(review): videos that share a category share one x position, so their bars are
    # drawn on top of each other -- confirm that this is the intended chart.
    axes.bar(top_categories, top_scores, color='green')
    axes.set_title("Top 20 Videos Category by PageRank")
    axes.set_ylabel("PageRank Score")
    axes.set_xlabel("Category")

    fig.tight_layout()

    # Display the plot in the Tkinter GUI
    canvas = FigureCanvasTkAgg(fig, master=plot_frame)
    canvas.draw()
    canvas.get_tk_widget().pack(pady=10)

# Get all PageRank data
def get_pagerank_all(collection):
    """Return every document of the PageRank results collection.

    Args:
        collection: MongoDB collection holding the PageRank results.

    Returns:
        A list with all documents of the collection.
    """
    return [vid for vid in collection.find()]

# Plot PageRank vs Other attributes
def scatter_plot_pagerank(all_vids, plot_frame, x_key, x_label, plot_title):
    """Draw a scatter plot of PageRank score against another attribute inside `plot_frame`.

    Args:
        all_vids: Documents that each provide 'pagerank' and the `x_key` field.
        plot_frame: Tk container that receives the chart; its previous content is removed.
        x_key: Document field plotted on the x axis.
        x_label: Label of the x axis.
        plot_title: Title of the chart.
    """
    # Local import: the figure class is only needed by the embedded plots.
    from matplotlib.figure import Figure

    clear_plot_area(plot_frame)
    pagerank_scores = [vid["pagerank"] for vid in all_vids]
    x_data = [vid[x_key] for vid in all_vids]

    # A standalone Figure is not registered with pyplot. Figures created through
    # plt.subplots() stay alive until they are explicitly closed, so every analysis run
    # used to leave one more figure in memory.
    fig = Figure(figsize=(11, 4))
    axes = fig.add_subplot(111)

    # Scatter plot
    axes.scatter(x_data, pagerank_scores, color='blue')
    axes.set_title(plot_title)
    axes.set_xlabel(x_label)
    axes.set_ylabel("PageRank Score")
    # Depth only takes the values 0..3, so those are used as the x ticks
    if x_key == "depth":
        axes.set_xticks([0, 1, 2, 3])

    fig.tight_layout()

    # Display the plot in the Tkinter GUI
    canvas = FigureCanvasTkAgg(fig, master=plot_frame)
    canvas.draw()
    canvas.get_tk_widget().pack(pady=10)


# Main GUI window
class YouTubeAnalyzerApp:
    def __init__(self, root):
        """Build the main window.

        Widgets are packed top to bottom: title, analysis selector, input for k,
        inputs for the length range, category selector, run button, result text
        area and the frame that hosts the plots.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("YouTube Analyzer")
        self.root.geometry("1000x800")

        # Title label
        tk.Label(root, text="YouTube Analyzer", font=("Arial", 20)).pack(pady=10)

        # Dropdown for selecting analysis option
        self.analysis_var = tk.StringVar()
        self.analysis_options = [
            "Total View Statistics",
            "Average Rating by Category",
            "Total Comments and Ratings by Category",
            "Average Video Length by Category",
            "Average Views per Video by Category",
            "Most Viewed Videos",
            "Most Viewed Video in Each Category",
            "Top Uploader by Views",
            "Top Commented Videos",
            "Top Video Category by PageRank",
            "PageRank vs Views analysis",
            "PageRank vs Rating analysis",
            "PageRank vs Comments analysis",
            "PageRank vs Depth analysis",
            "Top k Categories",
            "Top k Rated Videos",
            "Top k Viewed Videos",
            "All Videos with Length in Range",
            "Videos in Length Range by Category"
        ]
        tk.Label(root, text="Select Analysis:").pack()
        self.analysis_menu = ttk.Combobox(
            root,
            textvariable=self.analysis_var,
            values=self.analysis_options,
            width=50
        )
        self.analysis_menu.pack(pady=5)

        # Entry for 'k' value
        self.k_label = tk.Label(root, text="Enter value for k:")
        self.k_label.pack()
        self.k_entry = tk.Entry(root)
        self.k_entry.pack()

        # Entries for start and end lengths
        self.min_length_label = tk.Label(root, text="Enter start length (seconds):")
        self.min_length_label.pack()
        self.min_length_entry = tk.Entry(root)
        self.min_length_entry.pack()

        self.max_length_label = tk.Label(root, text="Enter end length (seconds):")
        self.max_length_label.pack()
        self.max_length_entry = tk.Entry(root)
        self.max_length_entry.pack()

        # Dropdown for selecting the category
        self.category_var = tk.StringVar()
        self.category_options = [
            "Autos & Vehicles",
            "Comedy",
            "Education",
            "Entertainment",
            "Film & Animation",
            "Howto & Style",
            "Music",
            "News & Politics",
            "Nonprofits & Activism",
            "People & Blogs",
            "Pets & Animals",
            "Science & Technology",
            "Sports",
            "Travel & Events",
            "UNA"
        ]
        tk.Label(root, text="Select Category:").pack()
        # The category dropdown gets its own attribute. It used to be stored in
        # self.analysis_menu as well, which discarded the reference to the analysis dropdown.
        self.category_menu = ttk.Combobox(
            root,
            textvariable=self.category_var,
            values=self.category_options,
            width=50
        )
        self.category_menu.pack(pady=5)
        # Button to execute analysis
        self.run_button = tk.Button(root, text="Run Analysis", command=self.run_analysis)
        self.run_button.pack(pady=10)

        # Text area for displaying results
        self.result_area = tk.Text(root, wrap=tk.WORD, height=20, width=70)
        self.result_area.pack(pady=10)

        # Frame that hosts the Matplotlib plots
        self.plot_frame = tk.Frame(root)
        self.plot_frame.pack(fill=tk.BOTH, expand=True)

    def run_analysis(self):
        analysis = self.analysis_var.get()
        if not analysis:
            messagebox.showerror("Error", "Please select an analysis.")
            return

        self.result_area.delete("1.0", tk.END)

        try:
            if analysis == "Total View Statistics":
                stats, exec_time = execute_and_time(calculate_view_statistics, collection, description="Platform-Wide View Statistics")
                stats = stats[0]
                self.result_area.insert(tk.END, f"Average Views: {stats['avgViews']:.2f}\n")
                self.result_area.insert(tk.END, f"Maximum Views: {stats['maxViews']}\n")
                self.result_area.insert(tk.END, f"Minimum Views: {stats['minViews']}\n")
            elif analysis == "Average Rating by Category":
                results, exec_time = execute_and_time(calculate_avg_rating_by_category, collection)
                for res in results:
                    self.result_area.insert(tk.END, f"Category: {res['_id']}, Avg Rating: {res['avgRating']:.2f}\n")
            elif analysis == "Total Comments and Ratings by Category":
                results, exec_time = execute_and_time(calculate_total_comments_and_ratings_by_category, collection)
                for res in results:
                    self.result_area.insert(
                        tk.END,
                        f"Category: {res['_id']}, Total Comments: {res['totalComments']}, Total Ratings: {res['totalRatings']}\n"
                    )
            elif analysis == "Average Video Length by Category":
                results, exec_time = execute_and_time(calculate_avg_length_by_category, collection)
                for res in results:
                    self.result_area.insert(tk.END, f"Category: {res['_id']}, Avg Length: {res['avgLength']:.2f}\n")
            elif analysis == "Average Views per Video by Category":
                results, exec_time = execute_and_time(avg_views_per_video_by_category, collection)
                for res in results:
                    self.result_area.insert(tk.END, f"Category: {res['_id']}, Avg Views: {res['avgViews']:.2f}\n")
            elif analysis == "Most Viewed Videos":
                results, exec_time = execute_and_time(most_viewed_videos, collection, 5)
                for res in results:
                    self.result_area.insert(tk.END, f"Video ID: {res['videoID']}, Views: {res['views']}\n")
            elif analysis == "Most Viewed Video in Each Category":
                results, exec_time = execute_and_time(most_viewed_video_in_each_category, collection)
                for res in results:
                    self.result_area.insert(
                        tk.END,
                        f"Category: {res['_id']}, Video ID: {res['videoID']}, Views: {res['views']}\n"
                    )
            elif analysis == "Top Uploader by Views":
                result, exec_time = execute_and_time(top_uploader_by_views, collection)
                result = result[0]
                self.result_area.insert(tk.END, f"Uploader: {result['_id']}, Total Views: {result['totalViews']}\n")
            elif analysis == "Top Commented Videos":
                results, exec_time = execute_and_time(top_commented_videos, collection)
                for res in results:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {res['videoID']}, Comments: {res['comments']}, Uploader: {res['uploader']}\n"
                    )
            elif analysis == "Top Video Category by PageRank":
                top_videos, exec_time = execute_and_time(get_pagerank, pagerank_collection)

                # Show results in text area
                self.result_area.insert(tk.END, "Top Videos Category by PageRank:\n")
                for video in top_videos:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {video['id']}, PageRank: {video['pagerank']:.4f}, Category: {video['category']}\n"
                    )
                plot_pagerank(top_videos, self.plot_frame)

            elif analysis == "PageRank vs Views analysis":
                top_videos, exec_time = execute_and_time(get_pagerank, pagerank_collection)

                # Show only the results for the top 20 videos in text area
                self.result_area.insert(tk.END, "PageRank vs. Views:\n")
                for video in top_videos:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {video['id']}, PageRank: {video['pagerank']:.4f}, Views: {video['views']}\n"
                    )
                all_videos = get_pagerank_all(pagerank_collection)
                # Plot the visualization for all 647k videos (pagerank score & number of views)
                scatter_plot_pagerank(all_videos, self.plot_frame, "views", "Number of views", "PageRank vs Views")

            elif analysis == "PageRank vs Rating analysis":
                top_videos, exec_time = execute_and_time(get_pagerank, pagerank_collection)

                # Show only the results for the top 20 videos in text area
                self.result_area.insert(tk.END, "PageRank vs. Rate:\n")
                for video in top_videos:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {video['id']}, PageRank: {video['pagerank']:.4f}, Rate: {video['rate']}\n"
                    )
                all_videos = get_pagerank_all(pagerank_collection)
                # Plot the visualization for all 647k videos (pagerank & rating score)
                scatter_plot_pagerank(all_videos, self.plot_frame, "rate", "Rating", "PageRank vs Video Rating")

            elif analysis == "PageRank vs Comments analysis":
                top_videos, exec_time = execute_and_time(get_pagerank, pagerank_collection)

                # Show only the results for the top 20 videos in text area
                self.result_area.insert(tk.END, "PageRank vs. Comments:\n")
                for video in top_videos:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {video['id']}, PageRank: {video['pagerank']:.4f}, Comment: {video['comments']}\n"
                    )
                all_videos = get_pagerank_all(pagerank_collection)
                # Plot the visualization for all 647k videos (pagerank score & number of comments)
                scatter_plot_pagerank(all_videos, self.plot_frame, "comments", "Number of comments", "PageRank vs Comments")

            elif analysis == "PageRank vs Depth analysis":
                top_videos, exec_time = execute_and_time(get_pagerank, pagerank_collection)

                # Show only the results for the top 20 videos in text area
                self.result_area.insert(tk.END, "PageRank vs. Depths:\n")
                for video in top_videos:
                    self.result_area.insert(
                        tk.END,
                        f"Video ID: {video['id']}, PageRank: {video['pagerank']:.4f}, Depth: {video['depth']}\n"
                    )
                all_videos = get_pagerank_all(pagerank_collection)
                # Plot the visualization for all 647k videos (pagerank score & depth level)
                scatter_plot_pagerank(all_videos, self.plot_frame, "depth", "Depth Level", "PageRank vs Depth")
            elif analysis == "Top k Categories":
                if self.k_entry.get():
                    k = int(self.k_entry.get())
                else:
                    messagebox.showerror("Error", "Please enter a value for k.")
                    return
                results, exec_time = execute_and_time(top_k_categories_gui, collection, k)
                self.result_area.insert(tk.END, "Top k Categories:\n")
                for res in results:
                    self.result_area.insert(tk.END, f"Category: {res['_id']}, Video Count: {res['count']:.2f}\n")
            elif analysis == "Top k Rated Videos":
                if self.k_entry.get():
                    k = int(self.k_entry.get())
                else:
                    messagebox.showerror("Error", "Please enter a value for k.")
                    return
                results, exec_time = execute_and_time(top_k_rated_videos_gui, collection, k)
                self.result_area.insert(tk.END, f"Top {k} Rated Videos:\n")
                for res in results:
                    self.result_area.insert(tk.END, f"Video: {res['_id']}, Rating: {res['rate']:.2f}\n")
            elif analysis == "Top k Viewed Videos":
                if self.k_entry.get():
                    k = int(self.k_entry.get())
                else:
                    messagebox.showerror("Error", "Please enter a value for k.")
                    return
                results, exec_time = execute_and_time(top_k_viewed_videos_gui, collection, k)
                self.result_area.insert(tk.END, f"Top {k} Viewed Videos\n")
                for res in results:
                    self.result_area.insert(tk.END, f"Category: {res['_id']}, View Count: {res['views']:.2f}\n")
            elif analysis == "All Videos with Length in Range":
                if self.min_length_entry.get() and self.max_length_entry.get():
                    start = int(self.min_length_entry.get())
                    end = int(self.max_length_entry.get())
                else:
                    messagebox.showerror("Error", "Please enter a min and max length.")
                    return
                results, exec_time = execute_and_time(get_videos_in_length_range_gui, collection, start, end)
                self.result_area.insert(tk.END, f"Videos with Length in Range {start} to {end} seconds:\n")
                for res in results:
                    self.result_area.insert(tk.END, f"Video: {res['_id']}, Length: {res['length']:.2f}\n")
            elif analysis == "Videos in Length Range by Category":
                if self.min_length_entry.get() and self.max_length_entry.get():
                    start = int(self.min_length_entry.get())
                    end = int(self.max_length_entry.get())
                else:
                    messagebox.showerror("Error", "Please enter a min and max length.")
                    return                
                if self.category_var.get():
                    category = self.category_var.get()
                else:
                    messagebox.showerror("Error", "Please select a category.")
                    return              
                results, exec_time = execute_and_time(videos_in_length_range_by_category_gui, collection, category, start, end)
                self.result_area.insert(tk.END, f"Videos in category {category} with Length in Range {start} to {end} seconds:\n")
                for res in results:
                    self.result_area.insert(tk.END, f"Video: {res['_id']}, Length: {res['length']:.2f}\n")
            # Show execution time
            self.result_area.insert(tk.END, f"\nExecution Time: {exec_time:.4f} seconds\n")

        except Exception as e:
            messagebox.showerror("Error", f"An error occurred: {e}")


# Run the application
# Entry point: the GUI is started only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    # Create the top-level window (assumes `tk` is the tkinter module — the
    # import is outside this part of the file).
    root = tk.Tk()
    # Instantiate the analyzer app, passing the root window to its constructor.
    # The instance is bound to the module-level name `app`.
    app = YouTubeAnalyzerApp(root)
    # Hand control to the Tk event loop; this call does not return until the
    # loop is stopped (normally when the window is closed).
    root.mainloop()
