In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) the Spark session used by the rest of the notebook.
# The MongoDB connector is configured with bigdata.raw as its input URI and
# bigdata.t8 as its output URI, both on the local MongoDB instance.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t8")
    # Maven coordinates of the MongoDB Spark connector package.
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [2]:
# Load through the MongoDB data source (it uses the input URI configured on
# the session) and cap the working set at 100000 rows before previewing it.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## For every emoji, find the position (head, middle, end) in which the emoji occurs most often in a sentence.
Position codes: 0 - head, 1 - middle, 2 - end

In [3]:
def position(line):
    """Classify where each emoji of one record sits within its sentence.

    Args:
        line: A record exposing ``sentence`` and ``emoji`` attributes.
            ``emoji`` holds space-separated ``name,index`` tokens such as
            ``":red_heart:,18"``. The index is presumably the emoji's token
            offset within ``sentence`` -- TODO confirm against the loader.

    Returns:
        A list of ``(emoji_name, code)`` tuples, one per non-empty token.
        ``code`` is 0 (head) when index/length < 1/3, 2 (end) when
        index/length > 2/3, and 1 (middle) otherwise, where length is the
        number of single-space-separated pieces of the sentence. An empty
        list is returned when either field is ``None``.

    Raises:
        ValueError: If a non-empty token has no comma or its index is not
            an integer.
    """
    # Records lacking either field carry no usable position information.
    if line.sentence is None or line.emoji is None:
        return []

    # split(' ') always yields at least one piece, so this is never zero.
    sentence_length = len(line.sentence.split(' '))

    res = []
    for token in line.emoji.split(' '):
        # Trailing or doubled spaces produce empty tokens; skip them instead
        # of failing on the unpack below.
        if not token:
            continue

        # Split on the LAST comma so a comma inside the emoji name survives.
        name, place = token.rsplit(',', 1)
        ratio = int(place) / sentence_length

        if ratio < 1/3:
            code = 0
        elif ratio > 2/3:
            code = 2
        else:
            code = 1
        res.append((name, code))
    return res

def sort_func(x):
    """Sort key: return the element at index 1 of ``x``."""
    second = x[1]
    return second

def most_position(line):
    """Pick the most frequent position code for one emoji.

    Args:
        line: An ``(emoji, positions)`` pair where ``positions`` is an
            iterable of ``(position_code, count)`` tuples.

    Returns:
        ``(emoji, position_code)`` for the entry with the largest count.
        When several entries share the largest count, the smallest position
        code wins, so the result does not depend on the order in which the
        grouped values happen to be iterated.

    Raises:
        ValueError: If ``positions`` is empty.
    """
    emoji, positions = line[0], line[1]
    # Single pass instead of a full sort; the key ranks by count first and
    # then prefers the lower position code on ties.
    best_position, _ = max(positions, key=lambda pc: (pc[1], -pc[0]))
    return (emoji, best_position)


# Count every (emoji, position) occurrence, regroup the counts per emoji and
# keep each emoji's most frequent position.
# NOTE: this assignment rebinds `position` from the function defined above to
# the resulting RDD; the later `position.toDF()` cell reads that RDD.
position = (
    df.rdd
    .flatMap(position)                                  # one (emoji, pos) pair per emoji token
    .map(lambda pair: (pair, 1))                        # key on the whole (emoji, pos) pair
    .reduceByKey(add)                                   # occurrences per (emoji, pos)
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))      # -> (emoji, (pos, count))
    .groupByKey()
    .map(most_position)
)

position.take(3)

[(':red_heart:', 2), (':person_shrugging:', 1), (':female_sign:', 2)]

In [4]:
# Turn the (emoji, position) pairs into a DataFrame and give the
# auto-generated _1/_2 columns readable names.
result = (
    position
    .toDF()
    .selectExpr("_1 as emoji", "_2 as pos")
)
result.show()

+--------------------+---+
|               emoji|pos|
+--------------------+---+
|         :red_heart:|  2|
|  :person_shrugging:|  1|
|       :female_sign:|  2|
|:face_with_tears_...|  2|
|:backhand_index_p...|  2|
|      :party_popper:|  2|
|:person_facepalming:|  1|
|:smiling_face_wit...|  2|
|    :clapping_hands:|  1|
|        :weary_face:|  2|
|         :male_sign:|  2|
|:loudly_crying_face:|  2|
|      :folded_hands:|  2|
|    :hundred_points:|  2|
|:rolling_on_the_f...|  2|
|     :flexed_biceps:|  2|
|:backhand_index_p...|  1|
|       :crying_face:|  2|
|      :purple_heart:|  2|
|      :yellow_heart:|  2|
+--------------------+---+
only showing top 20 rows



In [5]:
# Persist the per-emoji result through the MongoDB data source (it uses the
# output URI configured on the session).
# NOTE(review): "append" mode presumably inserts these rows again on every
# run -- confirm whether re-running the notebook should duplicate them.
(
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)