In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) the Spark session, pointing the MongoDB connector at the
# input collection (bigdata.raw) and the output collection (bigdata.t11).
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t11")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [2]:
# Load through the MongoDB connector and cap the working set at 100000 rows.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## For every emoji, find the average word length in the sentences that contain it.

In [3]:
def average_length(line):
    """Pair every emoji listed on a row with that row's sentence word count.

    Despite the name, no averaging happens here; this only emits the
    per-row samples that the downstream aggregation averages.

    Args:
        line: A row-like object exposing ``sentence`` and ``emoji``
            attributes. ``emoji`` is expected to hold whitespace-separated
            entries whose first comma-separated field is the emoji name
            (e.g. ``":party_popper:,8"``). Either attribute may be ``None``.

    Returns:
        A list of ``(emoji_name, sentence_length)`` tuples, one per entry in
        ``line.emoji``, where ``sentence_length`` is the number of
        whitespace-separated tokens in ``line.sentence``. The list is empty
        when ``line.emoji`` is ``None`` or blank.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so doubled spaces are not counted as words
    # and an empty sentence has length 0 rather than 1.
    sentence_length = len((line.sentence or "").split())

    # Same whitespace-aware split for the emoji field: stray spaces must not
    # produce an empty-string "emoji" key in the aggregation.
    return [
        (entry.split(',')[0], sentence_length)
        for entry in (line.emoji or "").split()
    ]
    
# Stage 1: emit one (emoji, sentence_length) pair per emoji on each row and
# count how many times each distinct pair occurs.
emojis_ave = (
    df.rdd
    .flatMap(average_length)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
)

# Stage 2: per emoji, the number of sentences it appears in.
sentences_count = (
    emojis_ave
    .map(lambda kv: (kv[0][0], kv[1]))
    .reduceByKey(add)
)

# Stage 3: per emoji, the total word count across those sentences
# (sentence_length weighted by how often that length occurred).
words_count = (
    emojis_ave
    .map(lambda kv: (kv[0][0], kv[0][1] * kv[1]))
    .reduceByKey(add)
)

# Stage 4: join the two totals and divide: words / sentences, rounded to a
# whole number.
ave_result = (
    sentences_count
    .join(words_count)
    .map(lambda kv: (kv[0], round(kv[1][1] / kv[1][0], 0)))
)

ave_result.take(10)

[(':red_heart:', 15.0),
 (':person_shrugging:', 18.0),
 (':female_sign:', 18.0),
 (':face_with_tears_of_joy:', 13.0),
 (':backhand_index_pointing_down:', 17.0),
 (':party_popper:', 15.0),
 (':person_facepalming:', 18.0),
 (':smiling_face_with_heart-eyes:', 12.0),
 (':clapping_hands:', 19.0),
 (':weary_face:', 13.0)]

In [4]:
# Convert the (emoji, average) pairs to a DataFrame and give the
# auto-generated _1/_2 columns meaningful names.
result = (
    ave_result
    .toDF()
    .selectExpr("_1 as emoji", "_2 as ave_len")
)
result.show()

+--------------------+-------+
|               emoji|ave_len|
+--------------------+-------+
|         :red_heart:|   15.0|
|  :person_shrugging:|   18.0|
|       :female_sign:|   18.0|
|:face_with_tears_...|   13.0|
|:backhand_index_p...|   17.0|
|      :party_popper:|   15.0|
|:person_facepalming:|   18.0|
|:smiling_face_wit...|   12.0|
|    :clapping_hands:|   19.0|
|        :weary_face:|   13.0|
|         :male_sign:|   18.0|
|:loudly_crying_face:|   13.0|
|      :folded_hands:|   15.0|
|    :hundred_points:|   14.0|
|:rolling_on_the_f...|   13.0|
|     :flexed_biceps:|   14.0|
|:backhand_index_p...|   17.0|
|       :crying_face:|   13.0|
|      :purple_heart:|   14.0|
|      :yellow_heart:|   13.0|
+--------------------+-------+
only showing top 20 rows



In [5]:
# Persist the per-emoji averages through the MongoDB connector.
# NOTE(review): "append" mode adds rows to the existing collection, so
# re-running this cell presumably writes the same results again - confirm
# that is intended.
(
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)