In [787]:
from pyspark.sql.session import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql import functions as F

In [788]:
spark = SparkSession.builder.appName("Project_3").getOrCreate()
#was having problem with converting string to date hence used older version of spark
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [789]:
#video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count comments_disabled ratings_disabled

schema = StructType(
    [
        StructField("video_id", StringType(), True),
        StructField("trending_date", StringType(), True),
        StructField("title", StringType(), True),
        StructField("channel_title", StringType(), True),
        StructField("category_id", IntegerType(), True),
        StructField("publish_time", StringType(), True),
        StructField("tags", StringType(), True),
        StructField("views", IntegerType(), True),
        StructField("likes", IntegerType(), True),
        StructField("dislikes", IntegerType(), True),
        StructField("comment_count", IntegerType(), True),
        StructField("comments_disabled", StringType(), True),
        StructField("ratings_disabled", StringType(), True)
    ]
)

In [790]:
#use the path to your file 
vdf = spark.read.csv("C:/Users/16072/Downloads/USvideos.csv", header=True, schema=schema)

In [791]:
vdf.show(10)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|            FALSE|           FALSE|
|1ZAPwfrtAFY|     17.14.11|The Trump Preside...|     LastWeekTonight|         24|2017-11-13T07:30:...|"last week tonigh...|2418783| 97185|    6146|        12703|            FALSE|           FALSE|
|5qpjK5DgCt4|  

In [792]:
vdf.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)



In [793]:
schema1 = StructType(
    [
        StructField("category_id", IntegerType(), True),
        StructField("categor_name", StringType(), True)
    ]
)

In [794]:
cdf = spark.read.csv("C:/Users/16072/Downloads/category_title.csv", header=False, schema=schema1)

In [795]:
cdf.show(4)

+-----------+----------------+
|category_id|    categor_name|
+-----------+----------------+
|          1|Film & Animation|
|          2|Autos & Vehicles|
|         10|           Music|
|         15|  Pets & Animals|
+-----------+----------------+
only showing top 4 rows



In [796]:
vdf = (vdf.withColumn("total_user_interaction", col("views")+col("likes")+col("dislikes")+col("comment_count")))
#will be used for likes dislikes ratio question
vdf = (vdf.withColumn("likes_dislikes_ratio", col("likes")/col("dislikes")))
#will be used for views comments ratio question
vdf = (vdf.withColumn("views_comment_ratio", col("views")/col("comment_count")))
#will be used for likes views ratio question
vdf = (vdf.withColumn("likes_view_ratio", col("views")/col("likes")))
#Top 3 videos for which user interaction (views + likes + dislikes + comments) is the highest

top_3__videos_user_interaction= (vdf.groupBy("video_id").agg(sum("total_user_interaction").alias("total_user_interaction")).orderBy(desc("total_user_interaction")).limit(3))

In [797]:
top_3__videos_user_interaction.show()

+-----------+----------------------+
|   video_id|total_user_interaction|
+-----------+----------------------+
|VYOjWnS4cMY|            3871395306|
|ffxKSjUwKdU|            1584764275|
|7C2z4GqqS5E|            1372981678|
+-----------+----------------------+



In [798]:
#Bottom 3 videos for which user interaction (views + likes + dislikes + comments) is lowest
bottom_3__videos_user_interaction= (vdf1.groupBy("video_id").agg(sum("total_user_interaction").alias("total_user_interaction")).orderBy(asc("total_user_interaction")).limit(3))

In [799]:
bottom_3__videos_user_interaction.show()

+-----------+----------------------+
|   video_id|total_user_interaction|
+-----------+----------------------+
|zeQaJGkFyqQ|                   757|
|qg0GdM60syI|                   775|
|OmM425PFd3Y|                  1422|
+-----------+----------------------+



In [800]:
jdf = vdf.join(cdf,vdf.category_id ==  cdf.category_id,"left")
jdf.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|        categor_name|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        Casey

In [801]:
from pyspark.sql.functions import unix_timestamp, from_unixtime

In [802]:

jdf = jdf.withColumn("trending_date",regexp_replace(col("trending_date"), "\\." ,"-"))
jdf = jdf.withColumn("trending_date_new", date_format(to_utc_timestamp(to_timestamp("trending_date", "yy-dd-MM"), "America/New_York"), "yy-dd-MM"))
jdf = jdf.withColumn("trending_date_new",to_date(jdf.trending_date, "yy-dd-MM"))
jdf = jdf.withColumn("publish_time_new", date_format(to_utc_timestamp(to_timestamp("publish_time", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"), "America/New_York"), "yyyy-MM-dd HH:mm:ss"))
jdf = jdf.withColumn("publish_time_new",to_date(jdf.publish_time_new, "yyyy-MM-dd HH:mm:ss"))
jdf.show(5)
jdf.printSchema()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------+-----------------+----------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|  categor_name|trending_date_new|publish_time_new|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------+----------

In [803]:
jdf = jdf.withColumn("year_trending",year(jdf.trending_date_new))
jdf.show(5)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------+-----------------+----------------+-------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|  categor_name|trending_date_new|publish_time_new|year_trending|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+---------

In [804]:
#Top 3 videos of each category in each year by views
top_three_videos_by_views_year_wise = (jdf.groupBy("video_id",vdf.category_id,"year_trending").agg(sum(vdf.views).alias("total_views")).orderBy(desc("total_views")))

In [805]:
top_three_videos_by_views_year_wise.show()

+-----------+-----------+-------------+-----------+
|   video_id|category_id|year_trending|total_views|
+-----------+-----------+-------------+-----------+
|VYOjWnS4cMY|         10|         2018| 3758488765|
|ffxKSjUwKdU|         10|         2018| 1529291326|
|7C2z4GqqS5E|         10|         2018| 1283188291|
|zEf423kYfqk|         10|         2018| 1182971286|
|u9Mv98Gr5pY|         24|         2018| 1003151226|
|xTlNMmZKwpA|         10|         2018|  896558471|
|     #NAME?|         10|         2018|  823920309|
|ulNswX3If6U|          1|         2018|  818792483|
|vjI4Alon-3Q|         10|         2018|  803455479|
|DkeiKbqa02g|         10|         2018|  792843004|
|FlsCjmMhFmw|         24|         2017|  780801040|
|M4ZoCHID9GI|         10|         2018|  778810304|
|i0p1bmr0EmE|         10|         2018|  774320575|
|rRr1qiJRsXk|         24|         2018|  747820836|
|n_W54baizX8|         10|         2018|  735203270|
|tCXGJQYZ9JA|         10|         2018|  728576946|
|6ZfuNTqbHE8

In [806]:
#Top 3 videos of each category ineach year by comments
top_three_videos_by_comments_year_wise = (jdf.groupBy("video_id",vdf.category_id,"year_trending").agg(sum(jdf.comment_count).alias("total_comments")).orderBy(desc("total_comments")))
top_three_videos_by_comments_year_wise.show()

+-----------+-----------+-------------+--------------+
|   video_id|category_id|year_trending|total_comments|
+-----------+-----------+-------------+--------------+
|7C2z4GqqS5E|         10|         2018|      15568561|
|VYOjWnS4cMY|         10|         2018|      10151289|
|QwZT7T-TXT0|         24|         2018|       8190045|
|FlsCjmMhFmw|         24|         2017|       5898529|
|oWjxSkJpxFU|         29|         2018|       4710338|
|ffxKSjUwKdU|         10|         2018|       4143608|
|ooyjaVdt-jA|         20|         2018|       3590220|
|kTlv5_Bs8aw|         10|         2017|       3509630|
|i0p1bmr0EmE|         10|         2018|       3172060|
|p8npDG2ulKQ|         10|         2018|       3114441|
|6ZfuNTqbHE8|         24|         2017|       2906944|
|J2HytHu5VBI|         22|         2018|       2673859|
|JyUKqUTp9rc|         26|         2018|       2561173|
|FhllRWReNEE|         22|         2018|       2539351|
|u9Mv98Gr5pY|         24|         2018|       2533352|
|sS0LCjOiI

In [807]:
#Top 3 videos of each category ineach year by likes
top_three_videos_by_likes_year_wise = (jdf.groupBy("video_id",vdf.category_id,"year_trending").agg(sum(jdf.likes).alias("total_likes")).orderBy(desc("total_likes")))
top_three_videos_by_likes_year_wise.show()

+-----------+-----------+-------------+-----------+
|   video_id|category_id|year_trending|total_likes|
+-----------+-----------+-------------+-----------+
|VYOjWnS4cMY|         10|         2018|   96700818|
|7C2z4GqqS5E|         10|         2018|   71835050|
|ffxKSjUwKdU|         10|         2018|   49451353|
|tCXGJQYZ9JA|         10|         2018|   24868764|
|u9Mv98Gr5pY|         24|         2018|   23339807|
|xTlNMmZKwpA|         10|         2018|   23078293|
|D_6QmL6rExk|         10|         2018|   22622742|
|kX0vO4vlJuU|         10|         2018|   22470914|
|p8npDG2ulKQ|         10|         2018|   22189964|
|8O_MwlZ2dEg|         10|         2018|   21994730|
|6ZfuNTqbHE8|         24|         2017|   21555430|
|ulNswX3If6U|          1|         2018|   20165850|
|iWZmdoY1aTE|         10|         2018|   19836382|
|FlsCjmMhFmw|         24|         2017|   19781372|
|J-dv_DcDD_A|         10|         2018|   19546890|
|nQySbNGu4g0|         10|         2018|   19369126|
|zEf423kYfqk

In [808]:
#Top 3 videos of each category ineach year by user interaction
top_three_videos_by_user_interaction_year_wise = (jdf.groupBy("video_id",vdf.category_id,"year_trending").agg(sum(jdf.total_user_interaction).alias("total_user_interaction")).orderBy(desc("total_user_interaction")))
top_three_videos_by_user_interaction_year_wise.show()

+-----------+-----------+-------------+----------------------+
|   video_id|category_id|year_trending|total_user_interaction|
+-----------+-----------+-------------+----------------------+
|VYOjWnS4cMY|         10|         2018|            3871395306|
|ffxKSjUwKdU|         10|         2018|            1584764275|
|7C2z4GqqS5E|         10|         2018|            1372981678|
|zEf423kYfqk|         10|         2018|            1204950051|
|u9Mv98Gr5pY|         24|         2018|            1029856169|
|xTlNMmZKwpA|         10|         2018|             922240931|
|ulNswX3If6U|          1|         2018|             840306778|
|     #NAME?|         10|         2018|             836175559|
|vjI4Alon-3Q|         10|         2018|             820699240|
|FlsCjmMhFmw|         24|         2017|             816592094|
|DkeiKbqa02g|         10|         2018|             806420962|
|M4ZoCHID9GI|         10|         2018|             796950637|
|i0p1bmr0EmE|         10|         2018|             796

In [809]:
jdf = jdf.withColumn("month_trending",month(jdf.trending_date_new))
jdf.show(5)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------+-----------------+----------------+-------------+--------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|  categor_name|trending_date_new|publish_time_new|year_trending|month_trending|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+------------------

In [810]:
jdf.groupBy("month_trending").agg(count(jdf.month_trending).alias("total_months")).orderBy(desc("total_months")).show(5)

+--------------+------------+
|month_trending|total_months|
+--------------+------------+
|            12|        6200|
|             5|        6199|
|             3|        6183|
|             1|        5796|
|             2|        5571|
+--------------+------------+
only showing top 5 rows



In [811]:
#Top 3 videos in each month by likes dislikes ratio
likes_dislikes_ratio_by_month=jdf.groupBy("year_trending","month_trending","video_id").agg(avg("likes_dislikes_ratio").alias("likes_dislikes_ratio")).orderBy(desc("likes_dislikes_ratio"))
likes_dislikes_ratio_by_month.show()

+-------------+--------------+-----------+--------------------+
|year_trending|month_trending|   video_id|likes_dislikes_ratio|
+-------------+--------------+-----------+--------------------+
|         2017|            12|p1af9PKM8Eo|   1484.472222222222|
|         2018|             1|ONI_06wGbsQ|    726.046418128655|
|         2017|            12|8Jmd7-1quDM|   619.4206349206349|
|         2018|             1|CFwXUarN-wg|               587.0|
|         2017|            12|w1zwGLBpULs|               552.5|
|         2017|            11|6ixU_vdE0Es|    533.921052631579|
|         2017|            12|exGGotfSuH0|            494.4375|
|         2017|            12|Q48VduIflPk|  480.14790685649456|
|         2017|            12|4SvJgnhnUag|               458.4|
|         2017|            12|IMzf3C34NNM|   429.3666666666667|
|         2018|             2|X5YJU6_Mfpg|   419.5860082771847|
|         2017|            12|wwX-RSqxEXc|   410.9533333333334|
|         2018|             1|Q48VduIflP

In [812]:
#Top 3 videos of each category in each month by views
top_three_videos_by_views_month_wise =  (jdf.groupBy("video_id",vdf.category_id,"month_trending").agg(sum(jdf.views).alias("total_views")).orderBy(desc("total_views")))
top_three_videos_by_views_month_wise.show()

+-----------+-----------+--------------+-----------+
|   video_id|category_id|month_trending|total_views|
+-----------+-----------+--------------+-----------+
|VYOjWnS4cMY|         10|             5| 3312786299|
|7C2z4GqqS5E|         10|             5| 1160177371|
|ffxKSjUwKdU|         10|             5| 1057426403|
|zEf423kYfqk|         10|             5|  878139569|
|xTlNMmZKwpA|         10|             6|  867607474|
|vjI4Alon-3Q|         10|             5|  803455479|
|DkeiKbqa02g|         10|             5|  792843004|
|FlsCjmMhFmw|         24|            12|  780801040|
|i0p1bmr0EmE|         10|             4|  774320575|
|     #NAME?|         10|             3|  759834314|
|u9Mv98Gr5pY|         24|             5|  736596294|
|tCXGJQYZ9JA|         10|             3|  728576946|
|6ZfuNTqbHE8|         24|            12|  631749120|
|M4ZoCHID9GI|         10|             4|  586022448|
|U9BwWKXjVaI|         10|             4|  583521598|
|QwievZ1Tx-8|         24|             3|  5634

In [813]:
#Top 3 videos of each category in each month by likes
top_three_videos_by_likes_month_wise =  (jdf.groupBy("video_id",vdf.category_id,"month_trending").agg(sum(jdf.likes).alias("total_likes")).orderBy(desc("total_likes")))
top_three_videos_by_likes_month_wise.show()

+-----------+-----------+--------------+-----------+
|   video_id|category_id|month_trending|total_likes|
+-----------+-----------+--------------+-----------+
|VYOjWnS4cMY|         10|             5|   86714965|
|7C2z4GqqS5E|         10|             5|   66221223|
|ffxKSjUwKdU|         10|             5|   28100134|
|tCXGJQYZ9JA|         10|             3|   24868764|
|D_6QmL6rExk|         10|             6|   22622742|
|kX0vO4vlJuU|         10|             4|   22470914|
|p8npDG2ulKQ|         10|             5|   22189964|
|8O_MwlZ2dEg|         10|             3|   21994730|
|xTlNMmZKwpA|         10|             6|   21428110|
|ffxKSjUwKdU|         10|             4|   21351219|
|6ZfuNTqbHE8|         24|            12|   19819535|
|FlsCjmMhFmw|         24|            12|   19781372|
|J-dv_DcDD_A|         10|             4|   19546890|
|QwievZ1Tx-8|         24|             3|   18763390|
|iWZmdoY1aTE|         10|             5|   17527049|
|i0p1bmr0EmE|         10|             4|   173

In [814]:
#Top 3 videos of each category in each month by dislikes
top_three_videos_by_dislikes_month_wise =  (jdf.groupBy("video_id",vdf.category_id,"month_trending").agg(sum(jdf.dislikes).alias("total_dislikes")).orderBy(desc("total_dislikes")))
top_three_videos_by_dislikes_month_wise.show(3)

+-----------+-----------+--------------+--------------+
|   video_id|category_id|month_trending|total_dislikes|
+-----------+-----------+--------------+--------------+
|FlsCjmMhFmw|         24|            12|      10111153|
|QwZT7T-TXT0|         24|             1|       9192539|
|VYOjWnS4cMY|         10|             5|       5372788|
+-----------+-----------+--------------+--------------+
only showing top 3 rows



In [815]:
#Top 3 channels by views 
top_three_channels_by_view =  (jdf.groupBy("channel_title").agg(sum(jdf.views).alias("total_channel_views")).orderBy(desc("total_channel_views")))
top_three_channels_by_view.show(3)

+-------------------+-------------------+
|      channel_title|total_channel_views|
+-------------------+-------------------+
|ChildishGambinoVEVO|         3758488765|
|            ibighit|         2235906679|
|       Dude Perfect|         1870085178|
+-------------------+-------------------+
only showing top 3 rows



In [816]:
#Top 3 channels by likes
top_three_channels_by_likes_dislikes_ratio =  (jdf.groupBy("channel_title").agg(avg("likes_dislikes_ratio").alias("likes_dislikes_ratio")).orderBy(desc("likes_dislikes_ratio")))
top_three_channels_by_likes_dislikes_ratio.show(3)

+--------------+--------------------+
| channel_title|likes_dislikes_ratio|
+--------------+--------------------+
|  Smyang Piano|   1484.472222222222|
|     Amber Liu|    726.046418128655|
|Duan Mackenzie|   619.4206349206349|
+--------------+--------------------+
only showing top 3 rows



In [817]:
#Top 3 channels by comments
top_three_channels_by_comment_count =  (jdf.groupBy("channel_title").agg(sum(jdf.comment_count).alias("total_comment_count")).orderBy(desc("total_comment_count")))
top_three_channels_by_comment_count.show(3)

+-------------------+-------------------+
|      channel_title|total_comment_count|
+-------------------+-------------------+
|            ibighit|           31817464|
|   Logan Paul Vlogs|           14870370|
|ChildishGambinoVEVO|           10151289|
+-------------------+-------------------+
only showing top 3 rows



In [818]:
#Top 3 categories by views
top_three_category_by_views =  (jdf.groupBy(vdf.category_id,cdf.categor_name).agg(sum(jdf.views).alias("total_views")).orderBy(desc("total_views")))
top_three_category_by_views.show(3)

+-----------+----------------+-----------+
|category_id|    categor_name|total_views|
+-----------+----------------+-----------+
|         10|           Music|40132892190|
|         24|   Entertainment|20604388195|
|          1|Film & Animation| 7284156721|
+-----------+----------------+-----------+
only showing top 3 rows



In [819]:
#Top 3 categories by likes dislikes ratio
top_three_category_by_likes_dislikes_ratio =  (jdf.groupBy(vdf.category_id,cdf.categor_name).agg(avg("likes_dislikes_ratio").alias("likes_dislikes_ratio")).orderBy(desc("likes_dislikes_ratio")))
top_three_category_by_likes_dislikes_ratio.show(3)

+-----------+--------------+--------------------+
|category_id|  categor_name|likes_dislikes_ratio|
+-----------+--------------+--------------------+
|         15|Pets & Animals|   62.55712745078967|
|         10|         Music|   60.11733871888926|
|         22|People & Blogs|  57.403180936078684|
+-----------+--------------+--------------------+
only showing top 3 rows



In [820]:
#Top 3 categories by comments
top_three_category_by_comments =  (jdf.groupBy(vdf.category_id,cdf.categor_name).agg(sum("comment_count").alias("comment_count")).orderBy(desc("comment_count")))
top_three_category_by_comments.show(3)

+-----------+--------------+-------------+
|category_id|  categor_name|comment_count|
+-----------+--------------+-------------+
|         10|         Music|    125296396|
|         24| Entertainment|     73566498|
|         22|People & Blogs|     24778032|
+-----------+--------------+-------------+
only showing top 3 rows



In [821]:
#Calculate any 3 videos which got at least 5 comments on every 1000 views
three_videos_above_five_comment_per_thousand = jdf.where(col("views_comment_ratio")>1000/5)
three_videos_above_five_comment_per_thousand.show(3)

+-----------+-------------+--------------------+-------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------------+-----------------+----------------+-------------+--------------+
|   video_id|trending_date|               title|      channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|        categor_name|trending_date_new|publish_time_new|year_trending|month_trending|
+-----------+-------------+--------------------+-------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+---------

In [822]:
#Calculate any 3 videos which got at least 4 likes on every 100 views
three_videos_above_four_likes_per_hundred_views = jdf.where(col("likes_view_ratio")>100/4)
three_videos_above_four_likes_per_hundred_views.show(3)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+-----+--------+-------------+-----------------+----------------+----------------------+--------------------+-------------------+------------------+-----------+--------------------+-----------------+----------------+-------------+--------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views|likes|dislikes|comment_count|comments_disabled|ratings_disabled|total_user_interaction|likes_dislikes_ratio|views_comment_ratio|  likes_view_ratio|category_id|        categor_name|trending_date_new|publish_time_new|year_trending|month_trending|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+-----+--------+-------------+-----------------+----------------+----------------------+--------------------+---------

In [823]:
#Number of videos published in each category
total_videos_by_category= jdf.groupBy(vdf.category_id,cdf.categor_name).agg(count("video_id").alias("number_of_videos_published")).orderBy(desc("number_of_videos_published"))
total_videos_by_category.show()

+-----------+--------------------+--------------------------+
|category_id|        categor_name|number_of_videos_published|
+-----------+--------------------+--------------------------+
|         24|       Entertainment|                      9964|
|         10|               Music|                      6472|
|         26|       Howto & Style|                      4146|
|         23|              Comedy|                      3457|
|         22|      People & Blogs|                      3210|
|         25|     News & Politics|                      2487|
|         28|Science & Technology|                      2401|
|          1|    Film & Animation|                      2345|
|         17|              Sports|                      2174|
|         27|           Education|                      1656|
|         15|      Pets & Animals|                       920|
|         20|              Gaming|                       817|
|         19|     Travel & Events|                       402|
|       