In [1]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running on a local master.
# The auto-broadcast threshold is set to 0 and adaptive query execution is
# switched off -- presumably so that join strategies are chosen explicitly in
# the cells below rather than by the optimizer (TODO confirm intent).
spark = (
    SparkSession.builder
    .master("local")
    .config('spark.sql.autoBroadcastJoinThreshold', 0)
    .config('spark.sql.adaptive.enabled', 'false')
    .getOrCreate()
)

# Bare expression: the session object is rendered as the cell output.
spark

In [2]:
# Load the videos CSV: the first line is treated as the header and column
# types are inferred from the data.
videos = (
    spark.read
    .option('header', 'true')
    .option("inferSchema", "true")
    .csv('../datasets/USvideos.csv')
)

# Preview the rows ordered by video id.
videos.orderBy('video_id').show()

+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|   video_id|               title|      channel_title|category_id|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|--JinobXWPk|DANGEROUS Jungle ...|   Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|
|-1fzGnFwz9M|9 Things You Need...|        Simon's Cat|         15|cartoon|simons ca...| 189414| 7070|     112|          288|https://i.ytimg.c...|13.09|
|-3AGlBYyLjo|Best Tom Petty In...|   CrazyLaughAction|         24|tom|petty|tom pet...|   2143|   16|       2|            4|https://i.ytimg.c...|06.10|
|-3lMEZ6k5NA|170912 BTS singin...|        Kpop Plus01|         10|          170912 BTS| 

In [3]:
# Explicit schema for the comments CSV (all four fields are nullable).
comments_schema = StructType([
    StructField("video_id", StringType(), True),
    StructField("comment_text", StringType(), True),
    StructField("likes", IntegerType(), True),
    StructField("replies", IntegerType(), True),
])

# Read the comments with that schema; mode=DROPMALFORMED makes the reader
# discard rows it cannot parse instead of failing or emitting nulls.
comments = (
    spark.read
    .option('header', 'true')
    .option("mode", "DROPMALFORMED")
    .schema(comments_schema)
    .csv('../datasets/UScomments.csv')
)
comments.show()

+-----------+--------------------+-----+-------+
|   video_id|        comment_text|likes|replies|
+-----------+--------------------+-----+-------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|
|XpVt6Z1Gjjo|         trending üòâ|    3|      0|
|XpVt6Z1Gjjo|#1 on trending AY...|    3|      0|
|XpVt6Z1Gjjo|The end though üò≠...|    4|      0|
|XpVt6Z1Gjjo|#1 trending!!!!!!!!!|    3|      0|
|XpVt6Z1Gjjo|Happy one year vl...|    3|      0|
|XpVt6Z1Gjjo|You and your shit...|    0|      0|
|XpVt6Z1Gjjo|There should be a...|    0|      0|
|XpVt6Z1Gjjo|Dear Logan, I rea...|    0|      0|
|XpVt6Z1Gjjo|Honestly Evan is ...|    0|      0|
|XpVt6Z1Gjjo|Casey is still be...|    0|      0|
|XpVt6Z1Gjjo|aw geez rick this...|    0|      0|
|XpVt6Z1Gjjo|He happy cause he...|    0|      0|
|XpVt6Z1Gjjo|Ayyyyoooo Logang ...|    1|      0|
|XpVt6Z1Gjjo|Bro

## Задание 1

In [99]:
# Collapse the video rows into one row per video (grouped by its descriptive
# columns) and total the views, likes, dislikes and comment counts.
# The view-based score is computed in the same pass:
#   score_view = (total likes - total dislikes) / total views, rounded to 5 places.
# Note: `sum` and `round` here are the pyspark.sql.functions versions brought
# in by the star import, not the Python builtins.

videos_gr = (
    videos
    .groupBy('video_id', 'title', 'channel_title', 'category_id', 'tags', 'thumbnail_link')
    .agg(
        sum(col('views')).alias('views'),
        sum(col('likes')).alias('likes'),
        sum(col('dislikes')).alias('dislikes'),
        sum(col('comment_total')).alias('comment_total'),
        round((sum(col('likes')) - sum(col('dislikes'))) / sum(col('views')), 5).alias('score_view'),
    )
)

In [5]:
# Preview the per-video aggregates together with the view-based score.
videos_gr.show()

+-----------+--------------------+--------------------+-----------+--------------------+--------------------+--------+------+--------+-------------+----------+
|   video_id|               title|       channel_title|category_id|                tags|      thumbnail_link|   views| likes|dislikes|comment_total|score_view|
+-----------+--------------------+--------------------+-----------+--------------------+--------------------+--------+------+--------+-------------+----------+
|fQxLlq7lxVM|FENTY BEAUTY | FU...|      Shameless Maya|         26|fenty beauty|riha...|https://i.ytimg.c...|  820416| 41338|    1037|        12956|   0.04912|
|5eSSL8hRU_E|Kelly Clarkson Ta...|    Z100 is New York|         10|kelly clarkson|lo...|https://i.ytimg.c...|  247255|  1327|      22|          156|   0.00528|
|2y7rk7eHHAM|Jennifer Lawrence...|Late Night with S...|         23|Late Night|Seth M...|https://i.ytimg.c...| 4205914| 34210|   12283|         8289|   0.00521|
|Q07Cp6tswnQ|The Road to 5,000...|    Ma

In [100]:
# For every video, total the likes and the replies over all of its comments.

comments_gr = (
    comments
    .groupBy('video_id')
    .agg(
        sum(col('likes')).alias('comment_likes'),
        sum(col('replies')).alias('comment_replies'),
    )
)

In [None]:
# Both source datasets were aggregated before the join to reduce the number of rows on each side; bucketing could additionally have been applied as an optimization.

In [101]:
# Left-join the per-video aggregates with the per-video comment aggregates;
# fillna(0) then replaces nulls in numeric columns with 0 (e.g. for videos
# that have no rows on the comments side of the join).
#   score_comment = comment_likes / (comment_total + comment_replies), rounded to 5 places
#   score         = score_view * score_comment, scaled by 100000 for readability
# The product is rounded AFTER scaling. Rounding to 5 places first and then
# truncating with cast('int') can lose a unit to floating-point error
# (a scaled value such as 56.99999999999999 would become 56 instead of 57).
# NOTE(review): a zero denominator in score_comment presumably yields NULL
# (Spark division with ANSI mode off) and therefore a NULL score -- confirm
# that this is the intended treatment of videos without comments.

videos_all = (
    videos_gr.join(comments_gr, 'video_id', 'left')
    .fillna(0)
    .withColumn(
        'score_comment',
        round(col('comment_likes') / (col('comment_total') + col('comment_replies')), 5),
    )
    .withColumn(
        'score',
        round(col('score_view') * col('score_comment') * 100000).cast('int'),
    )
)

In [8]:
# Keep only the descriptive video columns plus the final score.
scored_videos = videos_all.select([
    'video_id',
    'title',
    'channel_title',
    'category_id',
    'tags',
    'thumbnail_link',
    'score',
])

scored_videos.show()

+-----------+--------------------+--------------------+-----------+--------------------+--------------------+-----+
|   video_id|               title|       channel_title|category_id|                tags|      thumbnail_link|score|
+-----------+--------------------+--------------------+-----------+--------------------+--------------------+-----+
|4yCkkOvIkUI|EXCLUSIVE: Zonniq...|            YBF Chic|         24|              [none]|https://i.ytimg.c...|  154|
|7TN09IP5JuI|Terry Crews Hallu...|      First We Feast|         26|First we feast|fw...|https://i.ytimg.c...|    1|
|Bo-qp-Zu0OY|Meeting Talking D...|        TouringPlans|         19|talking mickey|ta...|https://i.ytimg.c...| 1536|
|JkqTeQHFoBY|Guardians of the ...|          Framestore|          1|marvel|vfx|CG|CGI...|https://i.ytimg.c...| 1701|
|K7pQsR8WFSo|Schlieren Imaging...|          Veritasium|         27|veritasium|scienc...|https://i.ytimg.c...|   55|
|RE-far-FvRs|PUPPIES FIRST BAT...|          VLOGTOWSKI|         22|vlog|

## Задание 2

In [102]:
# Read the multi-line category JSON and expand the `items` array of structs
# into rows, one row per array element with the struct fields as columns.

categories = (
    spark.read
    .format('json')
    .option('multiline', 'true')
    .load('../datasets/US_category_id.json')
    .select(inline_outer('items'))
)

In [103]:
# Extract the category id and the category name (snippet.title).
# Note: this rebinds `categories` to the two-column projection, so the cell
# cannot be re-run on its own -- the `id` / `snippet` columns it reads no
# longer exist afterwards; re-run the JSON load cell first.

categories = categories.select(
    categories['id'].alias('category_id'),
    categories['snippet']['title'].alias('title'),
)

In [11]:
# Preview the category id -> category title lookup table.
categories.show()

+-----------+--------------------+
|category_id|               title|
+-----------+--------------------+
|          1|    Film & Animation|
|          2|    Autos & Vehicles|
|         10|               Music|
|         15|      Pets & Animals|
|         17|              Sports|
|         18|        Short Movies|
|         19|     Travel & Events|
|         20|              Gaming|
|         21|       Videoblogging|
|         22|      People & Blogs|
|         23|              Comedy|
|         24|       Entertainment|
|         25|     News & Politics|
|         26|       Howto & Style|
|         27|           Education|
|         28|Science & Technology|
|         29|Nonprofits & Acti...|
|         30|              Movies|
|         31|     Anime/Animation|
|         32|    Action/Adventure|
+-----------+--------------------+
only showing top 20 rows



In [114]:
# Grouped-map pandas UDF that computes the median score of a group.
# It receives the rows of one group as a pandas DataFrame.

@pandas_udf('category_id int, score int', PandasUDFType.GROUPED_MAP)
def compute_median(pdf):
    """Replace `score` in every row of the group with the group's median score.

    Args:
        pdf: pandas DataFrame holding the rows of a single group; it must
            contain a `score` column (and `category_id`, per the declared
            output schema).

    Returns:
        A pandas DataFrame with the same rows as `pdf`, where `score` is the
        median of the group's scores. Every input row is returned, so callers
        that want one row per group have to de-duplicate the result.
    """
    group_median = pdf['score'].median()
    return pdf.assign(score=group_median)

In [118]:
# Apply the median UDF per category. The UDF returns one row per input row
# (all carrying the group's median), so the duplicates are dropped to leave a
# single row per category; `score` is then renamed to `median_score`.

scored_categories = (
    scored_videos
    .select('category_id', 'score')
    .groupBy('category_id')
    .apply(compute_median)
    .dropDuplicates()
    .withColumnRenamed('score', 'median_score')
)

In [116]:
# Preview the median score per category id.
scored_categories.show()

+-----------+------------+
|category_id|median_score|
+-----------+------------+
|         17|          89|
|          1|         227|
|         25|          27|
|         23|          38|
|         26|         300|
|         22|         101|
|         19|         172|
|          2|         150|
|         20|          43|
|         28|          91|
|         29|        1246|
|         27|         219|
|         43|        1494|
|         15|         304|
|         24|          93|
|         10|         183|
+-----------+------------+



In [120]:
# Attach the category names. The lookup table with the names is small, so it
# is joined with an explicit broadcast hint.
# Note: this rebinds `scored_categories` to a (title, median_score) projection,
# so the cell cannot be re-run on its own -- `category_id` is gone afterwards.

scored_categories = (
    scored_categories
    .join(broadcast(categories), on='category_id', how='inner')
    .select('title', 'median_score')
)

In [121]:
# Show the categories from the highest to the lowest median score, without
# truncating the category titles.
scored_categories.orderBy(col('median_score').desc()).show(truncate=False)

+---------------------+------------+
|title                |median_score|
+---------------------+------------+
|Shows                |1494        |
|Nonprofits & Activism|1246        |
|Pets & Animals       |304         |
|Howto & Style        |300         |
|Film & Animation     |227         |
|Education            |219         |
|Music                |183         |
|Travel & Events      |172         |
|Autos & Vehicles     |150         |
|People & Blogs       |101         |
|Entertainment        |93          |
|Science & Technology |91          |
|Sports               |89          |
|Gaming               |43          |
|Comedy               |38          |
|News & Politics      |27          |
+---------------------+------------+



## Задание 3

In [78]:
import pandas as pd
import timeit

In [128]:
# Scala UDF for splitting the tags column

from pyspark.sql.column import Column, _to_java_column, _to_seq

sc = spark.sparkContext

def udfSplitTagsScalaWraper(tags):
    """Apply the JVM-side `CustomUDFs.splitTagsUDF` to `tags`.

    Args:
        tags: the column to pass to the Scala UDF.

    Returns:
        A PySpark Column wrapping the result of the JVM UDF call.

    The `CustomUDFs` class is looked up through `sc._jvm`, so it has to be
    available to the session's JVM (presumably via a jar supplied when the
    session was started -- TODO confirm).
    """
    scala_udf = sc._jvm.CustomUDFs.splitTagsUDF()
    jvm_args = _to_seq(sc, [tags], _to_java_column)
    return Column(scala_udf.apply(jvm_args))

# Note: the projection keeps the raw column and the split result, both under
# the name `tags`.
splited_tags = videos.select('tags', udfSplitTagsScalaWraper(col('tags')).alias('tags'))

# Time a single count() over the projection.
# NOTE(review): count() does not need the UDF's output column, so the
# optimizer may prune it and the measurement may not include the UDF at all --
# confirm with splited_tags.explain() before drawing conclusions.
print(f"–í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è UDF –Ω–∞ Scala: {timeit.timeit('splited_tags.count()', number=1, globals=globals())} —Å–µ–∫")

–í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è UDF –Ω–∞ Scala: 0.08486641698982567 —Å–µ–∫


In [127]:
# Python (pandas) UDF for splitting the tags column

@pandas_udf(ArrayType(StringType()), PandasUDFType.SCALAR)
def split_tags(tags):
    """Split each '|'-separated tag string into a list of tags.

    Args:
        tags: pandas Series of tag strings.

    Returns:
        A pandas Series whose elements are the lists produced by splitting
        each string on '|'.
    """
    tag_lists = tags.str.split('|')
    return tag_lists

splited_tags = videos.select(split_tags('tags').alias('tags'))

# Time a single count() over the projection.
# NOTE(review): count() does not need the UDF's output column, so the
# optimizer may prune it and the measurement may not include the UDF at all --
# confirm with splited_tags.explain() before drawing conclusions.
print(f"–í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è UDF –Ω–∞ Python: {timeit.timeit('splited_tags.count()', number=1, globals=globals())}  —Å–µ–∫")

–í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è UDF –Ω–∞ Python: 0.05377666593994945  —Å–µ–∫


In [129]:
# Split the tags column with the UDF and emit one row per (video, tag).
# explode_outer keeps a row (with a null tag) for a null or empty tag list.

videos_tags = videos.select(
    'video_id',
    explode_outer(split_tags('tags')).alias('tags'),
)

In [130]:
# Count the number of distinct videos per tag and list the tags from the most
# to the least popular.
# distinct() removes repeated (video_id, tag) pairs first, so count('video_id')
# counts each video at most once per tag.
# The DataFrame is bound to `popular_tags` BEFORE it is displayed: show()
# returns None, so chaining it onto the assignment would leave the variable
# set to None instead of the result.

popular_tags = (
    videos_tags.distinct()
    .groupBy('tags')
    .agg(count('video_id').alias('sum_videos'))
    .orderBy(col('sum_videos').desc())
)

popular_tags.show()

+-----------+----------+
|       tags|sum_videos|
+-----------+----------+
|      funny|       217|
|     comedy|       163|
|     [none]|       146|
|       2017|        93|
|      humor|        92|
|     how to|        84|
|     makeup|        77|
|      music|        74|
|       vlog|        73|
|      video|        72|
|  interview|        70|
|   tutorial|        69|
|  celebrity|        64|
|     review|        61|
|       news|        61|
|celebrities|        59|
|     beauty|        58|
|       food|        57|
|    science|        56|
|   comedian|        55|
+-----------+----------+
only showing top 20 rows



## Задание 4

In [131]:
# Keep the videos that carry the standalone, case-sensitive tag "cat" and
# de-duplicate the video ids.
# Tags are '|'-separated, so a standalone tag is delimited by '|' or by the
# start/end of the string. Matching only '|cat|' would miss videos where "cat"
# is the first, the last or the only tag, hence the (^|\|) and (\||$) anchors.
# Rows with a null `tags` value do not match and are filtered out.

video_cats = (
    videos
    .filter(col('tags').rlike(r'(^|\|)cat(\||$)'))
    .select('video_id')
    .distinct()
)

In [132]:
# Join the comments onto the filtered set of video ids (as in the first task,
# both datasets could have been bucketed beforehand), group by video and
# comment text, total the likes and replies, and keep the 5 comments with the
# largest number of likes.

top_5_comments = (
    video_cats
    .join(comments, on='video_id', how='inner')
    .groupBy('video_id', 'comment_text')
    .agg(
        sum(col('likes')).alias('likes'),
        sum(col('replies')).alias('replies'),
    )
    .orderBy(col('likes').desc())
    .limit(5)
)

In [98]:
# Display the five most-liked comments with the full, untruncated comment text.
top_5_comments.show(truncate=False)

+-----------+------------------------------------------------------------------------------------------------------+-----+-------+
|video_id   |comment_text                                                                                          |likes|replies|
+-----------+------------------------------------------------------------------------------------------------------+-----+-------+
|tp9aQXDFHbY|Make sure to check back next Friday as we are launching our brand new animated HALLOWEEN special! üê±üï∑|1329 |162    |
|-1fzGnFwz9M|I make interesting cartoons and I need your help! Go to the channel, rate my work!                    |839  |5      |
|tp9aQXDFHbY|1:51 so your nuts are your most prized possession?                                                    |121  |8      |
|Vjc459T6wX8|How does Mugumogu not collapse in a heap of laughter?!! Maru's liquified form is hilarious!           |50   |7      |
|tp9aQXDFHbY|If Simon will be make animation movie of Simons Cat adventures, I‚

In [133]:
# Shut down the Spark session once the notebook is finished.
spark.stop()