In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) the Spark session. The Mongo connector input URI points at
# bigdata.raw and the output URI at bigdata.t10.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t10")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [2]:
# Load through the Mongo connector (source comes from the session's input URI)
# and keep at most 100000 rows for the analysis below.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|â€œ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...| 

## For every emoji, count the sentences in which it is used more than once.
A sentence is counted once per emoji, no matter how many times (two or more) that emoji is repeated in it.

In [3]:
# Only the emoji column is needed for the repeat count.
emojis = df.select(df["emoji"])

def split_arr(line):
    """Return the emoji names that occur more than once in a single row.

    The row's ``emoji`` field is read as a whitespace-separated list of
    ``name,rest`` tokens (for example ``":red_heart:,18"``). Only the text
    before the first comma is used as the emoji name; the rest is ignored.

    Args:
        line: Row-like object exposing an ``emoji`` string attribute.

    Returns:
        list: Every emoji name seen at least twice, listed once each, in the
        counter's iteration order. Empty when the field is null/empty or when
        no emoji repeats.
    """
    raw = line.emoji
    if not raw:
        # Null or empty field: nothing can repeat, and None has no .split().
        return []

    # split() without an argument collapses runs of whitespace, so consecutive
    # or trailing spaces no longer yield empty tokens that would be counted
    # (and possibly reported) as the emoji "".
    counts = collections.Counter(token.split(',')[0] for token in raw.split())
    return [name for name, seen in counts.items() if seen > 1]

# One list per row: the emojis that split_arr found repeated in that row.
emojis = emojis.rdd.map(split_arr)

# Discard rows without repeats, flatten the lists, then add up one hit per
# (row, emoji) pair to get a total per emoji.
emojis_filter = (
    emojis
    .filter(lambda repeated: len(repeated) > 0)
    .flatMap(lambda repeated: repeated)
    .map(lambda name: (name, 1))
    .reduceByKey(add)
)

emojis_filter.take(3)

[(':clapping_hands:', 526), (':female_sign:', 357), (':hundred_points:', 57)]

In [4]:
# Turn the (emoji, count) pairs into a DataFrame and rename the default
# _1/_2 tuple columns to readable names.
result = emojis_filter.toDF().selectExpr("_1 as emoji", "_2 as count")
result.show()

+--------------------+-----+
|               emoji|count|
+--------------------+-----+
|    :clapping_hands:|  526|
|       :female_sign:|  357|
|    :hundred_points:|   57|
|:white_heavy_chec...|  432|
|         :red_heart:| 1433|
|:face_with_tears_...|  348|
|     :raising_hands:|  173|
|           :OK_hand:|   60|
|              :fire:|  464|
|:loudly_crying_face:|  123|
|:backhand_index_p...|  444|
|      :purple_heart:|   57|
|      :folded_hands:|  238|
|     :thinking_face:|   10|
|     :flexed_biceps:|  101|
|         :male_sign:|  279|
|  :person_shrugging:|  100|
|     :speaking_head:|   28|
|  :heavy_check_mark:|  336|
|      :smiling_face:|   86|
+--------------------+-----+
only showing top 20 rows



In [5]:
# Persist the per-emoji counts through the Mongo connector. Mode "append" adds
# to whatever the output target already holds, so re-running this cell writes
# the same rows again.
(
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)