# **Execution Comparison Across Various Platforms**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Benchmark timings, one list per platform. Each list is aligned by position
# with the "Query" list. "-" is a placeholder string that a later cell
# replaces with a missing value.
data = dict(
    Query=[
        "Top 10 Most Viewed Videos",
        "Average Rating by Category",
        "Videos with Ratings Greater Than 4.5",
        "Average Views per Category",
        "Top 5 Uploaders by Number of Videos",
        "Top 5 Longest Videos",
        "Total Comments Count by Category",
        "Total Number of Videos by Each Uploader",
        "Average Video Length by Category",
        "Videos with High Views but Low Ratings",
        "Relationship Between Video Length and Views",
    ],
    ApacheSpark=[0.02, 0.04, 0.013, 0.04, 0.05, 0.01, 0.03, 0.04, 0.24, 0.15, 0.02],
    Hive=[5.42, 4.26, 6.45, 4.65, 13.14, 4.43, 4.98, 11.36, 4.09, 4.85, 4.12],
    Pig=["-", 22, 18, 18, "-", "-", 22, 22, 12, 12, 8],
    HBase=[86.58, 91.15, 84.53, 80.96, 89.64, 86.74, 88.84, 79.06, 79.73, 83.96, 82.77],
    MongoDB=[9.54, 10.02, 291.56, 15.21, 26.89, 9.67, 8.86, 21.3, 19.07, 15.51, 21.77],
)

In [None]:
df = pd.DataFrame(data)

In [None]:
# Convert every platform column to numbers, leaving the "Query" labels alone.
# errors="coerce" turns the "-" placeholders (and anything else that is not a
# number) into NaN. The previous errors="ignore" is deprecated in pandas and
# silently left a whole column as text whenever one value failed to parse,
# which would only surface later as a plotting error.
for platform in df.columns.drop("Query"):
    df[platform] = pd.to_numeric(df[platform], errors="coerce")

In [None]:
# Line chart: one series per platform, drawn across the benchmark queries.
fig, ax = plt.subplots(figsize=(14, 8))
platform_names = df.columns[1:]  # every column after "Query"
for platform in platform_names:
    ax.plot(df["Query"], df[platform], marker="o", label=platform)

ax.set_xlabel("Query")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title("Execution Time Comparison Across Different Platforms")
plt.xticks(rotation=45, ha="right")
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the timings: transposing puts platforms on the rows and
# queries on the columns.
plt.figure(figsize=(12, 6))
timings_by_platform = df.set_index("Query").T
sns.heatmap(
    timings_by_platform,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    annot_kws={"size": 8},
    cbar_kws={"label": "Execution Time (seconds)"},
)
plt.yticks(rotation=0, ha="right", fontsize=9)
plt.xlabel("Query")
plt.ylabel("Platform")
plt.title("Execution Time Comparison Across Different Platforms (Heatmap)")
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: one cluster of bars per query, one bar per platform.
ax = df.plot.bar(x="Query", figsize=(14, 8))
ax.set_xlabel("Query")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title("Execution Time Comparison Across Different Platforms (Grouped Bar Chart)")
plt.xticks(rotation=45, ha="right")
ax.legend(title="Platform")
plt.tight_layout()
plt.show()

# **Query Results Visualizations**

#### Replace the HDFS paths with the location of your query results before executing the spark.read.csv() calls
#### Here, the results generated by Spark are used

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
import seaborn as sns
import pandas as pd
import numpy as np
import dask.dataframe as dd

In [None]:
# Start the Spark session used by every result-loading cell below
# (getOrCreate reuses a session if one is already active).
spark = (
    SparkSession.builder
    .appName("YT-Analysis")
    .getOrCreate()
)

#### **Average Rating By Category**

In [None]:
df_1 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/avg_rating_by_category/part-.csv", header=True)

In [None]:
df_1 = df_1.withColumn("average_rating", round(df_1["average_rating"], 2))

In [None]:
pandas_df = df_1.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Bar chart of the average rating for each category.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(pandas_df['category'], pandas_df['average_rating'], color='orange')
ax.set_title('Average Rating by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### **Average Video Length By Category**

In [None]:
df_2 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/avg_video_length_by_category/part-.csv", header=True)

In [None]:
df_2 = df_2.withColumn("average_length", round(df_2["average_length"], 2))

In [None]:
pandas_df = df_2.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Bar chart of the average video length for each category.
plt.figure(figsize=(10, 6))
plt.bar(pandas_df['category'], pandas_df['average_length'], color='orange')
plt.title('Average Video Length by Category')
plt.xlabel('Category')
# Axis label spelling corrected ("Lenght" -> "Length").
plt.ylabel('Average Video Length')
# Rotate the category names so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### **Average Views By Category**

In [None]:
df_3 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/avg_views_by_category/part-.csv", header=True)

In [None]:
df_3 = df_3.withColumn("average_views", round(df_3["average_views"], 2))

In [None]:
pandas_df = df_3.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Bar chart of the average view count for each category.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(pandas_df['category'], pandas_df['average_views'], color='orange')
ax.set_title('Average Views by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average Views')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### **High Views , Low Ratings**

In [None]:
df_4= spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/high_views_low_ratings/*.csv", header=True)

In [None]:
df_4.show()

In [None]:
pandas_df = df_4.toPandas()

In [None]:
# Parse both columns as numbers; values that cannot be parsed become NaN.
for numeric_column in ('views', 'rating'):
    pandas_df[numeric_column] = pd.to_numeric(pandas_df[numeric_column], errors='coerce')

In [None]:
# Keep rows that have both a view count and a rating...
rows_with_both = pandas_df.dropna(subset=['views', 'rating'])
# ...and drop the videos with very low view counts.
pandas_df = rows_with_both[rows_with_both['views'] > 100]

In [None]:
# Scatter of rating against views on a log x-axis. Marker colour follows
# log(views + 1) and marker size grows with sqrt(views).
view_counts = pandas_df['views']

plt.figure(figsize=(12, 6))
sc = plt.scatter(
    view_counts,
    pandas_df['rating'],
    c=np.log(view_counts + 1),
    s=np.sqrt(view_counts) / 5 + 10,
    cmap='coolwarm',
    alpha=0.6,
)
plt.xscale('log')

plt.title(' High Views vs Low Ratings')
plt.xlabel('Views (Log Scale)')
plt.ylabel('Rating')
plt.colorbar(sc, label='Log of Views')
plt.grid(True)

plt.show()

In [None]:
# Look up the row with the largest view count and print it in full.
most_viewed_label = pandas_df['views'].idxmax()
top_video = pandas_df.loc[most_viewed_label]
print(top_video)

#### **Correlation between the Video Length and Views**

In [None]:
df_5 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/length_vs_views/*.csv", header=True)

In [None]:
pandas_df = df_5.toPandas()

In [None]:
print(pandas_df)

In [None]:
print(pandas_df.dtypes)

In [None]:
# Parse both columns as numbers; values that cannot be parsed become NaN.
for numeric_column in ('length', 'avg_views'):
    pandas_df[numeric_column] = pd.to_numeric(pandas_df[numeric_column], errors='coerce')

In [None]:
# Drop extreme 'length' outliers: only rows below the cut-off are kept
# (adjust the threshold as needed).
is_below_cutoff = pandas_df['length'] < 6000
filtered_df = pandas_df[is_below_cutoff]

In [None]:
# Log-log scatter of average views against video length. Marker colour
# follows log(avg_views + 1) and marker size grows with sqrt(avg_views).
video_lengths = filtered_df['length']
average_views = filtered_df['avg_views']

plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    video_lengths,
    average_views,
    c=np.log(average_views + 1),
    s=np.sqrt(average_views) + 20,
    cmap='viridis',
    alpha=0.7,
)
plt.xscale('log')
plt.yscale('log')

plt.title('Length vs Average Views', fontsize=16)
plt.xlabel('Length (Log Scale)', fontsize=14)
plt.ylabel('Average Views (Log Scale)', fontsize=14)
plt.colorbar(scatter, label='Log of Average Views')

# Draw grid lines for both major and minor ticks.
plt.grid(True, which="both", ls="--", lw=0.5)
plt.tight_layout()

plt.show()

#### **Top 10 Most Viewed Videos**

In [None]:
df_6 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/top_10_most_viewed_videos/*.csv", header=True)

In [None]:
pandas_df = df_6.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Convert 'views' to numbers BEFORE sorting. The CSV is read without schema
# inference, so the column presumably arrives as text; sorting it first
# orders the rows lexicographically (e.g. "9" ahead of "10") and the chart
# is then not ordered by view count.
# Work on a copy so `pandas_df` itself is left unchanged.
df_top10_sorted = pandas_df.copy()
df_top10_sorted['views'] = pd.to_numeric(df_top10_sorted['views'])
df_top10_sorted = df_top10_sorted.sort_values(by='views', ascending=False)

In [None]:
# Horizontal bars for the most-viewed videos, each labelled with its
# view count using thousands separators.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(df_top10_sorted['video_id'], df_top10_sorted['views'], color='orange')

ax.set_title('Top 10 Most Viewed Videos', fontsize=16)
ax.set_xlabel('Views', fontsize=14)
ax.set_ylabel('Video ID', fontsize=14)

# Bars sit at y = 0, 1, 2, ... in row order, so the row position is the label's y.
for bar_position, view_count in enumerate(df_top10_sorted['views']):
    ax.text(view_count, bar_position, f'{int(view_count):,}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

#### **Top 5 Longest Videos**

In [None]:
df_7 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/top_5_longest_videos/*.csv", header=True)

In [None]:
pandas_df = df_7.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Convert 'length' to numbers BEFORE sorting. The CSV is read without schema
# inference, so the column presumably arrives as text; sorting it first
# orders the rows lexicographically (e.g. "9" ahead of "10") rather than
# by duration.
# Work on a copy so `pandas_df` itself is left unchanged.
df_top5_longest = pandas_df.copy()
df_top5_longest['length'] = pd.to_numeric(df_top5_longest['length'])
df_top5_longest = df_top5_longest.sort_values(by='length', ascending=False)

In [None]:
# Horizontal bars for the longest videos, each labelled with its length.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_top5_longest['video_id'], df_top5_longest['length'], color='orange')

ax.set_title('Top 5 Longest Videos by Length', fontsize=16)
ax.set_xlabel('Length (seconds)', fontsize=14)
ax.set_ylabel('Video ID', fontsize=14)

# Bars sit at y = 0, 1, 2, ... in row order, so the row position is the label's y.
for bar_position, video_length in enumerate(df_top5_longest['length']):
    ax.text(video_length, bar_position, f'{int(video_length):,}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

#### **Top 5 Uploaders by number of videos**

In [None]:
df_8 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/top_5_uploaders_by_number_of_videos/*.csv",header=True)

In [None]:
pandas_df = df_8.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Donut chart of each uploader's share of videos: a pie chart with a white
# disc drawn over its centre.
slice_colors = ['orange', 'green', 'violet', 'purple', '#ff6666']
slice_borders = {'linewidth': 3, 'edgecolor': 'white'}

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    pandas_df['total_videos'],
    labels=pandas_df['uploader'],
    autopct='%1.1f%%',
    startangle=90,
    colors=slice_colors,
    wedgeprops=slice_borders,
)

centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

# Equal axis scaling keeps the pie circular.
ax.axis('equal')

ax.set_title('Total Videos by Uploader (Donut Chart)', fontsize=16)

plt.tight_layout()
plt.show()

#### **Total Comments by Category**

In [None]:
df_9 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/total_comments_by_category/*.csv", header=True)

In [None]:
pandas_df = df_9.toPandas()

In [None]:
print(pandas_df)

In [None]:
# Donut chart of each category's share of comments: an exploded pie chart
# with a white disc drawn over its centre.
colors = ['#ff9999', '#1abc9c', '#ffcc99', '#66ff66', '#ff6666', '#c2c2f0', '#ffb3e6', '#c4e17f', '#76D7C4', '#F7DC6F', '#D98880', '#A569BD', '#3498DB']

# Push every slice slightly outwards, one offset per category row.
explode = [0.05 for _ in pandas_df['category']]

fig, ax = plt.subplots(figsize=(9, 9))
wedges, texts, autotexts = ax.pie(
    pandas_df['total_comments'],
    labels=pandas_df['category'],
    autopct='%1.1f%%',
    pctdistance=0.85,
    startangle=90,
    explode=explode,
    colors=colors,
    wedgeprops={'linewidth': 3, 'edgecolor': 'white'},
)

centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

# Equal axis scaling keeps the pie circular.
ax.axis('equal')

# Make the category labels and the percentage labels easier to read.
for category_label in texts:
    category_label.set_fontsize(10)
    category_label.set_horizontalalignment('left')

for percent_label in autotexts:
    percent_label.set_fontsize(10)
    percent_label.set_color('black')

ax.set_title('Total Comments by Category (Donut Chart)', fontsize=16)

plt.tight_layout()
plt.show()

#### **Total Videos By Uploader**

In [None]:
df_10 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/total_videos_by_uploader/*.csv", header=True)

In [None]:
pandas_df = df_10.toPandas()

In [None]:
print(pandas_df)

In [None]:
pandas_df['total_videos'] = pd.to_numeric(pandas_df['total_videos'])

In [None]:
# Histogram of videos per uploader in 50 bins, with a log-scaled count axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    pandas_df['total_videos'],
    bins=50,
    log=True,
    color='orange',
    edgecolor='black',
)

ax.set_title('Distribution of Total Videos Uploaded', fontsize=16)
ax.set_xlabel('Total Videos', fontsize=14)
ax.set_ylabel('Number of Uploaders (Log Scale)', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Take the 20 uploaders with the most videos (sorted in descending order).
uploaders_by_volume = pandas_df.sort_values(by='total_videos', ascending=False)
df_top20 = uploaders_by_volume.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_top20['uploader'], df_top20['total_videos'], color='skyblue')

ax.set_title('Top 20 Uploaders by Total Videos', fontsize=16)
ax.set_xlabel('Total Videos', fontsize=14)
ax.set_ylabel('Uploader', fontsize=14)

# Write each uploader's video count at the end of its bar.
for bar_position, video_count in enumerate(df_top20['total_videos']):
    ax.text(video_count, bar_position, str(video_count), va='center', fontsize=10)

plt.tight_layout()
plt.show()

#### **Videos with Ratings above 4.5**

In [None]:
df_11 = spark.read.csv("hdfs://localhost:9000/hdfs/path/Spark/videos_with_ratings_above_4_5/*.csv", header=True)

In [None]:
df_11 = df_11.repartition(100)

In [None]:
# Pull the result to the driver one partition at a time and stitch the pieces
# into a single pandas frame. Rebinding `pandas_df` inside the loop kept only
# the LAST partition, so the cells below analysed a fraction of the rows.
partition_frames = []
for partition in df_11.rdd.glom().toLocalIterator():
    if not partition:
        # An empty partition has nothing to preview or keep (and Spark cannot
        # infer a schema from an empty list, so it must be skipped).
        continue
    # Build the frame straight from the collected rows; no extra Spark
    # round-trip is needed per partition.
    partition_df = pd.DataFrame.from_records(partition, columns=df_11.columns)
    print(partition_df.head())
    partition_frames.append(partition_df)

if partition_frames:
    pandas_df = pd.concat(partition_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; fall back to an empty frame that still
    # has the expected columns.
    pandas_df = pd.DataFrame(columns=df_11.columns)

In [None]:
pandas_df['views'] = pd.to_numeric(pandas_df['views'], errors='coerce')

In [None]:
# Total views per distinct rating value, with 'rating' kept as a column.
grouped_df = pandas_df.groupby('rating', as_index=False)['views'].sum()

# log10(views + 1) compresses the large totals so they fit on one axis.
grouped_df['log_views'] = np.log10(grouped_df['views'] + 1)

In [None]:
# Scatter of log10 total views against rating; each point is annotated with
# its raw view total.
plt.figure(figsize=(12, 8))

plt.scatter(
    grouped_df['rating'],
    grouped_df['log_views'],
    c='orange',
    s=200,
    alpha=0.7,
    edgecolor='black',
    linewidth=1.5,
)

point_data = zip(grouped_df['rating'], grouped_df['log_views'], grouped_df['views'])
for rating_value, log_total, raw_total in point_data:
    plt.annotate(f'{int(raw_total):,}', (rating_value, log_total), fontsize=10, ha='right')

plt.title('Ratings vs Log of Total Views (Rating > 4.5)', fontsize=18)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Log of Total Views', fontsize=14)

plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

In [None]:
spark.stop()