In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t2") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')\
    .getOrCreate()

In [2]:
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
df = df.limit(100000)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## For every emoji, ﬁnd the 3 other emojis that are used most frequently with it.

In [3]:
from itertools import combinations

emojis = df.select('emoji')

def split_arr(line):
    res = []
    
    words = line.emoji.split(" ")
    for word in words:
        tmp = word.split(',')[0]
        res.append(tmp)
    return res

def combination(line):
    combs = list(combinations(line, 2))
    res = []
    
    top10 = [':face_with_tears_of_joy:', ':red_heart:',':loudly_crying_face:', ':fire:', \
             ':smiling_face_with_heart-eyes:', ':female_sign:', ':clapping_hands:', \
             ':folded_hands:', ':male_sign:', ':backhand_index_pointing_right:']
    for comb in combs:
        p0 = comb[0]
        p1 = comb[1]
        if p0 not in top10:
            p0 = 'others'
        if p1 not in top10:
            p1 = 'others'
    
        if p0 != p1:
            res.append((p0, p1))
            res.append((p1, p0))
    return res

emojis_comb = emojis.rdd.map(split_arr) \
        .filter(lambda x: len(x) > 1) \
        .map(combination) \
        .flatMap(lambda x: x) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(add) \
        .map(lambda x: (x[0][0], (x[0][1], x[1]))) 

def sort_func(x):
    return x[1]

def top_10(line):
    candidate = tuple(list(line[1]))
    return (line[0], candidate)


result = emojis_comb.groupByKey() \
        .map(top_10) \

result.take(5)

[('others',
  ((':female_sign:', 4242),
   (':male_sign:', 3278),
   (':loudly_crying_face:', 465),
   (':backhand_index_pointing_right:', 515),
   (':red_heart:', 1956),
   (':face_with_tears_of_joy:', 1006),
   (':smiling_face_with_heart-eyes:', 634),
   (':fire:', 1407),
   (':clapping_hands:', 666),
   (':folded_hands:', 627))),
 (':female_sign:',
  (('others', 4242),
   (':face_with_tears_of_joy:', 526),
   (':loudly_crying_face:', 111),
   (':male_sign:', 273),
   (':red_heart:', 47),
   (':smiling_face_with_heart-eyes:', 27),
   (':fire:', 29),
   (':clapping_hands:', 27),
   (':backhand_index_pointing_right:', 62),
   (':folded_hands:', 17))),
 (':male_sign:',
  (('others', 3278),
   (':loudly_crying_face:', 58),
   (':face_with_tears_of_joy:', 416),
   (':female_sign:', 273),
   (':red_heart:', 22),
   (':fire:', 53),
   (':folded_hands:', 14),
   (':clapping_hands:', 28),
   (':smiling_face_with_heart-eyes:', 7),
   (':backhand_index_pointing_right:', 26))),
 (':loudly_crying

In [4]:
result = result.toDF()
result = result.selectExpr("_1 as emoji", "_2 as col")
result.show()

+--------------------+--------------------+
|               emoji|                 col|
+--------------------+--------------------+
|              others|[[:female_sign:, ...|
|       :female_sign:|[[others, 4242], ...|
|         :male_sign:|[[others, 3278], ...|
|:loudly_crying_face:|[[others, 465], [...|
|:backhand_index_p...|[[others, 515], [...|
|         :red_heart:|[[others, 1956], ...|
|:face_with_tears_...|[[:male_sign:, 41...|
|:smiling_face_wit...|[[:red_heart:, 33...|
|    :clapping_hands:|[[:face_with_tear...|
|              :fire:|[[others, 1407], ...|
|      :folded_hands:|[[others, 627], [...|
+--------------------+--------------------+



In [5]:
result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()