In [1]:
import timeit
import pyspark
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.column import _to_java_column
from pyspark.sql.column import _to_seq

spark = SparkSession.builder.master("local") \
    .config('spark.sql.autoBroadcastJoinThreshold', 0) \
    .config('spark.sql.adaptive.enabled', 'false') \
    .getOrCreate()

sc = spark.sparkContext



# scored_videos

In [9]:
videos = spark.read.option('header', 'true').option("inferSchema", "true").csv('../datasets/USvideos.csv')
videos.write \
    .bucketBy(10, 'video_id') \
    .saveAsTable('videos_bucketed', format='csv', mode='overwrite')

In [22]:
comments_schema = StructType([ \
    StructField("video_id", StringType(), True), \
    StructField("comment_text", StringType(), True), \
    StructField("likes", IntegerType(), True), \
    StructField("replies", IntegerType(), True)])
comments = spark.read.option('header', 'true').option("mode", "DROPMALFORMED").schema(comments_schema).csv('../datasets/UScomments.csv')
comments.write \
    .bucketBy(10, 'video_id') \
    .saveAsTable('comment_bucketed', format='csv', mode='overwrite')

In [23]:
videos_bucketed = spark.table('videos_bucketed')
videos_bucketed.show(5)

+-----------+--------------------+------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|   video_id|               title|     channel_title|category_id|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|WYYvHb03Eog|Apple iPhone X fi...|         The Verge|         28|apple iphone x ha...|2642103|24975|    4542|        12829|https://i.ytimg.c...|13.09|
|1L7JFN7tQLs|iPhone X Hands on...| Jonathan Morrison|         28|Apple|iPhone X|iP...| 514972|18936|     641|         3817|https://i.ytimg.c...|13.09|
|B7YaMkCl3XA|Hurricane Irma de...|Al Jazeera English|         25|5573051142001|ame...| 382525| 1521|     270|         1168|https://i.ytimg.c...|13.09|
|5ywKal6-anc|Gigi Hadid Loses ...|               TMZ|         24|TMZ2016FS11221|TM...| 703750|

In [24]:
videos_bucketed.count()

7998

In [25]:
videos_bucketed.dropDuplicates(('video_id',)).count()

2364

In [30]:
# отбираем последнюю запись для каждого видео
windowSpec  = Window.partitionBy("video_id").orderBy(col("date").desc())

videos_drop_dubl = videos_bucketed.withColumn("row_number",row_number().over(windowSpec)) \
                                  .where(col("row_number") == 1) \
                                  .drop("row_number")
videos_drop_dubl.count()

2364

In [28]:
videos_drop_dubl.show(5)

+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|   video_id|               title|      channel_title|category_id|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+
|--JinobXWPk|DANGEROUS Jungle ...|   Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|
|-3lMEZ6k5NA|170912 BTS singin...|        Kpop Plus01|         10|          170912 BTS| 201901|10034|     148|          591|https://i.ytimg.c...|15.09|
|-5sCWsLlTCI|SNL Host Kumail N...|Saturday Night Live|         24|saturday night li...|  85052| 1458|      97|          132|https://i.ytimg.c...|14.10|
|-SiRV2kWIxg|Madison Beer Play...|                RAW|         22|Madison Beer|RAW|...| 

In [31]:
comments_bucketed = spark.table('comment_bucketed')
comments_bucketed.count()

691318

In [32]:
comments_bucketed.show(5)

+-----------+--------------------+-----+-------+
|   video_id|        comment_text|likes|replies|
+-----------+--------------------+-----+-------+
|sjlHnJvXdQs|Nothing's scarier...|    3|      0|
|sjlHnJvXdQs|          Wa da fuq?|    0|      0|
|sjlHnJvXdQs|Jacksfilms = sarc...|    0|      0|
|sjlHnJvXdQs|Idióta, iphone ma...|    0|      0|
|sjlHnJvXdQs|Iphone x imbibing...|    0|      0|
+-----------+--------------------+-----+-------+
only showing top 5 rows



In [33]:
comments_bucketed.dropDuplicates(('video_id',)).count()

2266

In [35]:
# агрегируем лайки к комментариям для добавления их в расчета скора
comments_likes = comments_bucketed.groupBy('video_id')\
                                  .agg(sum('likes').alias('comment_likes'), 
                                       sum('replies').alias('comment_replies'))
comments_likes.count()

2266

In [36]:
# добавляем лайки к комментариям к датасету с видео
videos = videos_drop_dubl.join(comments_likes, 'video_id')
videos.count()

2266

In [37]:
videos.show(5)

+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+-------------+---------------+
|   video_id|               title|      channel_title|category_id|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|comment_likes|comment_replies|
+-----------+--------------------+-------------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+-------------+---------------+
|--JinobXWPk|DANGEROUS Jungle ...|   Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|           19|              1|
|-5sCWsLlTCI|SNL Host Kumail N...|Saturday Night Live|         24|saturday night li...|  85052| 1458|      97|          132|https://i.ytimg.c...|14.10|          494|             39|
|-SiRV2kWIxg|Madison Beer Play...|                RAW|         22|Madison Beer|RAW|...|  1

In [38]:
# Определяем веса для факторов
weight_views = 0.5
weight_comments = 0.2
weight_comments_likes = 0.1
weight_likes = 0.2
weight_dislikes = -0.2

In [42]:
# формула для расчета скора
score = videos['views'] * lit(weight_views) \
        + videos['likes'] * lit(weight_likes) \
        + videos['comment_likes'] * lit(weight_comments_likes) \
        + videos['comment_total'] * lit(weight_comments) \
        + videos['dislikes'] * lit(weight_dislikes) 

# categories_score

In [None]:
датасет по категориям, в котором присутствуют следующие поля: Название категории (не id, он непонятный для аналитиков!) - 
можно найти в файле US_category_id.json . Медиана показателя score из датасета scored_videos по каждой категории.

In [44]:
# читаем json us_category_id
us_category_id = spark.read \
    .option("multiline","true") \
    .json('../datasets/US_category_id.json') \
    .select('items')
us_category_id.show(10, True)

+--------------------+
|               items|
+--------------------+
|[{"m2yskBQFythfE4...|
+--------------------+



In [45]:
# извлекаем id категорий
id = us_category_id.select(posexplode(col("items.id")).alias("pos", "id"))
id.show(5)

+---+---+
|pos| id|
+---+---+
|  0|  1|
|  1|  2|
|  2| 10|
|  3| 15|
|  4| 17|
+---+---+
only showing top 5 rows



In [46]:
# извлекаем категории
category = us_category_id.select(posexplode(col("items.snippet.title")).alias("pos", "title"))

In [47]:
# мапим категории и id, приводим id к типу Integer
category_id_name = id.join(category, "pos", "inner").select(col("id").cast(IntegerType()).alias("category_id"), col("title").alias("category_name"))
category_id_name.orderBy("id").show(10)

+-----------+----------------+
|category_id|   category_name|
+-----------+----------------+
|          1|Film & Animation|
|         10|           Music|
|         15|  Pets & Animals|
|         17|          Sports|
|         18|    Short Movies|
|         19| Travel & Events|
|          2|Autos & Vehicles|
|         20|          Gaming|
|         21|   Videoblogging|
|         22|  People & Blogs|
+-----------+----------------+
only showing top 10 rows



In [16]:
scored_videos_and_category.count()

2364

In [None]:
# датафрейм с маппингом категория - id является не большим, это своего рода справочник, который нужно приджойнить к основному 
# датафрейму с видео для того, чтобы получить понятные для аналитика названия категорий, поэтому для оптимизации джойна использован 
# бродкаст маленького датафрейма

In [49]:
scored_videos_and_category = scored_videos.join(broadcast(category_id_name), "category_id", "inner")
scored_videos_and_category.count()

2266

In [50]:
scored_videos_and_category.show(5)

+-----------+-----------+--------------------+-------------------+--------------------+-------+-----+--------+-------------+--------------------+-----+-------------+---------------+-----------------+--------------+
|category_id|   video_id|               title|      channel_title|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|comment_likes|comment_replies|            score| category_name|
+-----------+-----------+--------------------+-------------------+--------------------+-------+-----+--------+-------------+--------------------+-----+-------------+---------------+-----------------+--------------+
|         15|--JinobXWPk|DANGEROUS Jungle ...|   Brave Wilderness|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|           19|              1|669011.2000000001|Pets & Animals|
|         24|-5sCWsLlTCI|SNL Host Kumail N...|Saturday Night Live|saturday night li...|  85052| 1458|      97|          132|https://i.ytimg.

In [51]:
categories_score_pandas = scored_videos_and_category.pandas_api()

In [52]:
categories_score_pandas.head(5)

Unnamed: 0,category_id,video_id,title,channel_title,tags,views,likes,dislikes,comment_total,thumbnail_link,date,comment_likes,comment_replies,score,category_name
0,15,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20.1,19,1,669011.2,Pets & Animals
1,24,-5sCWsLlTCI,SNL Host Kumail Nanjiani and P!nk Share Favori...,Saturday Night Live,saturday night live|snl|snl season 43|kumail n...,85052,1458,97,132,https://i.ytimg.com/vi/-5sCWsLlTCI/default.jpg,14.1,494,39,42874.0,Entertainment
2,22,-SiRV2kWIxg,Madison Beer Plays RAW's This Or That,RAW,Madison Beer|RAW|RAW Pages|Interview|Word Play...,12305,404,16,14,https://i.ytimg.com/vi/-SiRV2kWIxg/default.jpg,30.09,100,1,6242.9,People & Blogs
3,24,-UAdFerZMWc,"Watch Us Build a 7,500 Piece Lego Millennium F...",WIRED,lego|lego star wars|millennium falcon|star war...,35730,1311,44,160,https://i.ytimg.com/vi/-UAdFerZMWc/default.jpg,13.1,968,99,18247.2,Entertainment
4,24,-_CmfnzbLFc,"Idris Elba Talks Childhood, Playing American R...",The View,Idris Elba|The Mountain Between Us|The Dark To...,107469,1487,31,414,https://i.ytimg.com/vi/-_CmfnzbLFc/default.jpg,9.1,1038,101,54212.3,Entertainment


In [53]:
# мсчитаем медиану по категориям
categories_score = categories_score_pandas.groupby('category_name')['score'].median()

In [21]:
categories_score

category_name
Shows                      4395.1
Education                127538.4
Gaming                   126462.4
Entertainment            192165.7
Travel & Events          105931.7
Science & Technology     158050.0
Sports                    66815.1
Howto & Style            124715.6
Nonprofits & Activism     28155.5
Film & Animation         160161.3
People & Blogs           121552.8
News & Politics           83608.7
Pets & Animals            96185.0
Autos & Vehicles          84353.2
Music                    122129.2
Comedy                   398643.1
Name: score, dtype: float64

# popular_tags

In [54]:
def udfSplitTagsScalaWrapper(field):
    _splitTagsUDF = sc._jvm.CustomUDFs.splitTagsUDF()
    return Column(_splitTagsUDF.apply(_to_seq(sc, [field], _to_java_column)))

In [55]:
videos_drop_dubl_spl_tags1 = videos_drop_dubl.withColumn('Tag', explode(udfSplitTagsScalaWrapper(col('tags'))))
popular_tags1 = videos_drop_dubl_spl_tags1.groupBy('Tag').count()
popular_tags1.orderBy(col('count').desc()).show(10)

+------+-----+
|   Tag|count|
+------+-----+
| funny|  217|
|comedy|  163|
|[none]|  144|
|  2017|   93|
| humor|   92|
|how to|   84|
|makeup|   77|
| music|   74|
|  vlog|   73|
| video|   71|
+------+-----+
only showing top 10 rows



In [56]:
code_to_measure = """
videos_drop_dubl_spl_tags1 = videos_drop_dubl.withColumn('Tag', explode(udfSplitTagsScalaWrapper(col('tags'))))
popular_tags1 = videos_drop_dubl_spl_tags1.groupBy('Tag').count()
"""
setup_code = """
import pyspark
from pyspark.sql.functions import row_number, col, explode, udf
from pyspark.sql.column import _to_java_column
from pyspark.sql.column import _to_seq
from pyspark.sql import SparkSession, Column
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType


spark = SparkSession.builder.master("local") \
    .config('spark.sql.autoBroadcastJoinThreshold', 0) \
    .config('spark.sql.adaptive.enabled', 'false') \
    .getOrCreate()
sc = spark.sparkContext

videos = spark.read.option('header', 'true').option("inferSchema", "true").csv('../datasets/USvideos.csv')
windowSpec  = Window.partitionBy("video_id").orderBy(col("date").desc())
videos_drop_dubl = videos.withColumn("row_number",row_number().over(windowSpec)) \
                         .where(col("row_number") == 1) \
                         .drop("row_number")

def udfSplitTagsScalaWrapper(field):
    _splitTagsUDF = sc._jvm.CustomUDFs.splitTagsUDF()
    return Column(_splitTagsUDF.apply(_to_seq(sc, [field], _to_java_column)))
"""
execution_time = timeit.timeit(stmt=code_to_measure, setup=setup_code, number=1)
print(f"Execution time: {execution_time} seconds")

Execution time: 0.010183083999436349 seconds


In [57]:
# определяем UDF для разбивки тегов
def split_tags(tags):
    if tags:
        return tags.split('|')
    else:
        return []
# Create a UDF from the split_tags function
split_tags_udf = udf(split_tags, ArrayType(StringType()))

In [58]:
videos_drop_dubl_spl_tags = videos_drop_dubl.withColumn('Tag', explode(split_tags_udf(col('tags'))))
popular_tags = videos_drop_dubl_spl_tags.groupBy('Tag').count()
popular_tags.orderBy(col('count').desc()).show(10)

+------+-----+
|   Tag|count|
+------+-----+
| funny|  217|
|comedy|  163|
|[none]|  144|
|  2017|   93|
| humor|   92|
|how to|   84|
|makeup|   77|
| music|   74|
|  vlog|   73|
| video|   71|
+------+-----+
only showing top 10 rows



In [59]:
code_to_measure = """
videos_drop_dubl_spl_tags = videos_drop_dubl.withColumn('Tag', explode(split_tags_udf(col('tags'))))
popular_tags = videos_drop_dubl_spl_tags.groupBy('Tag').count()
"""
setup_code = """
import pyspark
from pyspark.sql.functions import row_number, col, explode, udf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.master("local") \
    .config('spark.sql.autoBroadcastJoinThreshold', 0) \
    .config('spark.sql.adaptive.enabled', 'false') \
    .getOrCreate()
sc = spark.sparkContext

videos = spark.read.option('header', 'true').option("inferSchema", "true").csv('../datasets/USvideos.csv')
windowSpec  = Window.partitionBy("video_id").orderBy(col("date").desc())
videos_drop_dubl = videos.withColumn("row_number",row_number().over(windowSpec)) \
                         .where(col("row_number") == 1) \
                         .drop("row_number")


def split_tags(tags):
    if tags:
        return tags.split('|')
    else:
        return []
# Create a UDF from the split_tags function
split_tags_udf = udf(split_tags, ArrayType(StringType()))
"""
execution_time = timeit.timeit(stmt=code_to_measure, setup=setup_code, number=1)
print(f"Execution time: {execution_time} seconds")

Execution time: 0.01330812499509193 seconds


# Cats

In [60]:
# сортируем видео у которых есть тег cat
videos_drop_dubl_spl_tags.show(5)

+-----------+--------------------+----------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+--------------+
|   video_id|               title|   channel_title|category_id|                tags|  views|likes|dislikes|comment_total|      thumbnail_link| date|           Tag|
+-----------+--------------------+----------------+-----------+--------------------+-------+-----+--------+-------------+--------------------+-----+--------------+
|--JinobXWPk|DANGEROUS Jungle ...|Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|     adventure|
|--JinobXWPk|DANGEROUS Jungle ...|Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|   adventurous|
|--JinobXWPk|DANGEROUS Jungle ...|Brave Wilderness|         15|adventure|adventu...|1319945|38949|     533|         6768|https://i.ytimg.c...|20.10|       animals|
|--JinobXWPk|DAN

In [61]:
videos_drop_dubl_spl_tags.count()

43165

In [62]:
# отбираем видео с тегами про котов
videos_cat_tag = videos_drop_dubl_spl_tags.where(lower((col("Tag"))) == "cat")
videos_cat_tag.count()

17

In [66]:
videos_cat_tag.select('video_id', 'tags').show(5, False)

+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|video_id   |tags                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----------+-------------------------------------------------------------------

In [None]:
+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|video_id   |tags                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|7V1J_MDi9Lg|Husky's First Howl|cat|dog|dogs|funny dogs|funny cat                                                                                                                                                                                                                                                                                                                                                                                                         |
|xbBMVa2A68s|cat|dog|cute|gaming|overwatch|runescape|osrs                                                                                                                                                                                                                                                                                                                                                                                                                 |
|-1fzGnFwz9M|cartoon|simons cat|simon's cat|simonscat|simon tofield|simon the cat|funny cats|cute cats|cat fails|family friendly|animated animals|short animation|animated cats|tofield|simon's katze|simon|cat|black and white|kitty|traditional animation|black and white cat|Кот Саймона|cat lovers|animal (film character)|fail|funny cat|cats|cute|kitten|kittens|pets|simons cats|Cat|Simon|Tofield|cartoons|Toons|Animated|Animation|Kitten|Funny|Humour|fun|videos|
|-1fzGnFwz9M|cartoon|simons cat|simon's cat|simonscat|simon tofield|simon the cat|funny cats|cute cats|cat fails|family friendly|animated animals|short animation|animated cats|tofield|simon's katze|simon|cat|black and white|kitty|traditional animation|black and white cat|Кот Саймона|cat lovers|animal (film character)|fail|funny cat|cats|cute|kitten|kittens|pets|simons cats|Cat|Simon|Tofield|cartoons|Toons|Animated|Animation|Kitten|Funny|Humour|fun|videos|
|S9VIKOuZcds|cat|feline|pets|beloved|family member|grief|grieving|kitty                                                                                                                                                                                                                                                                                                                                                                                                   |
+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows

In [70]:
comments.count()

691722

In [72]:
import math
import mmh3
from bitarray import bitarray
  
  
class BloomFilter(object):
  
    '''
    Class for Bloom filter, using murmur3 hash function
    '''
  
    def __init__(self, items_count, fp_prob):
        '''
        items_count : int
            Number of items expected to be stored in bloom filter
        fp_prob : float
            False Positive probability in decimal
        '''
        self.items_count = items_count
        
        # False possible probability in decimal
        self.fp_prob = fp_prob
  
        # Size of bit array to use
        self.size = self.get_size(items_count, fp_prob)
  
        # number of hash functions to use
        self.hash_count = self.get_hash_count(self.size, items_count)
  
        # Bit array of given size
        self.bit_array = bitarray(self.size)
  
        # initialize all bits as 0
        self.bit_array.setall(0)
  
    def add(self, item):
        '''
        Add an item in the filter
        '''
        digests = []
        for i in range(self.hash_count):
  
            # create digest for given item.
            # i work as seed to mmh3.hash() function
            # With different seed, digest created is different
            digest = mmh3.hash(item, i) % self.size
            digests.append(digest)
  
            # set the bit True in bit_array
            self.bit_array[digest] = True
        
    def union(self, other):
        """ Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        new_bloom = self.copy()
        new_bloom.bit_array = new_bloom.bit_array | other.bit_array
        return new_bloom
  
    def check(self, item):
        '''
        Check for existence of an item in filter
        '''
        for i in range(self.hash_count):
            digest = mmh3.hash(item, i) % self.size
            if self.bit_array[digest] == False:
  
                # if any of bit is False then,its not present
                # in filter
                # else there is probability that it exist
                return False
        return True
    
    def copy(self):
        """Return a copy of this bloom filter.
        """
        new_filter = BloomFilter(self.items_count, self.fp_prob)
        new_filter.bit_array = self.bit_array.copy()
        return new_filter
    
    def set_bit_array(self, bit_array):
        self.bit_array = bit_array
  
    @classmethod
    def get_size(self, n, p):
        '''
        Return the size of bit array(m) to used using
        following formula
        m = -(n * lg(p)) / (lg(2)^2)
        n : int
            number of items expected to be stored in filter
        p : float
            False Positive probability in decimal
        '''
        m = -(n * math.log(p))/(math.log(2)**2)
        return int(m)
  
    @classmethod
    def get_hash_count(self, m, n):
        '''
        Return the hash function(k) to be used using
        following formula
        k = (m/n) * lg(2)
  
        m : int
            size of bit array
        n : int
            number of items expected to be stored in filter
        '''
        k = (m/n) * math.log(2)
        return int(k)

In [74]:
# используем фильтр Блума для того чтобы предварительно отсортировать большой датасет с комментариями 

filterSize = 691722
prob = 0.05

def fill_bloom_filter(bf, items):
    for i in items:
        bf.add(str(i[0]))
    return bf

bloom_filter = BloomFilter(filterSize, prob)

general_bit_array = comments.select(col('video_id')).rdd \
    .mapPartitions(lambda p: [fill_bloom_filter(BloomFilter(filterSize, prob), p).bit_array]) \
    .reduce(lambda a, b: a.bit_array | b.bit_array)

bloom_filter.set_bit_array(general_bit_array)

maybe_in_bf = udf(lambda video_id: bloom_filter.check(str(video_id)))

In [75]:
# отбираем комментарии у к видео про котов
video_cat_tag_comments = comments.filter(maybe_in_bf(col('video_id')) == True) \
                                 .join(videos_cat_tag, "video_id", "inner")
video_cat_tag_comments.count()

5901

In [76]:
# отображаем топ 5 комментариев по количеству лайков к видео про котов
video_cat_tag_comments.orderBy(comments["likes"].desc()).select("comment_text", comments["likes"]).show(5, False)

+----------------------------------------------------------------------------------+-----+
|comment_text                                                                      |likes|
+----------------------------------------------------------------------------------+-----+
|The second I read this title in my notification, I started to giggle.             |2355 |
|talk about the ocean sunfish build                                                |1070 |
|talk about the ocean sunfish build                                                |1021 |
|talk about the ocean sunfish build                                                |957  |
|I make interesting cartoons and I need your help! Go to the channel, rate my work!|839  |
+----------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [77]:
spark.stop()