In [5]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

In [6]:
# Create the SparkSession object, configured with the MongoDB input/output URIs
# and the mongo-spark-connector package.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.q7")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [7]:
# Load the input DataFrame through the MongoDB connector's default source.
# NOTE(review): no URI is given here, so the source presumably comes from the
# session's spark.mongodb.input.uri setting — confirm.
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
# Print the first rows as a quick sanity check of the loaded columns.
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Find the appearance frequency of every emoji.

In [4]:
# Keep only the 'emoji' column; the rest of this section works on it alone.
emojis = df.select('emoji')
def split_str(line):
    """Strip the trailing ',<suffix>' from every emoji token of a row.

    Args:
        line: object with an ``emoji`` attribute holding tokens separated by
            single spaces.

    Returns:
        str: the text before the first comma of each token, re-joined with
        single spaces in the original order.
    """
    return " ".join(token.split(',')[0] for token in line.emoji.split(" "))

# Convert each row into its space-joined emoji names.
# Note: this rebinds `emojis` from a DataFrame to an RDD of strings.
emojis = emojis.rdd.map(split_str)
# Peek at a few converted rows.
emojis.take(3)

[':red_heart:', ':person_shrugging: :female_sign:', ':face_with_tears_of_joy:']

In [11]:
# Count the occurrences of every emoji name and order them by descending count.
result = (
    emojis.flatMap(lambda names: names.split(" "))
    .map(lambda name: (name, 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)
# Show the ten most frequent emojis.
result.take(10)

[(':face_with_tears_of_joy:', 306231),
 (':red_heart:', 179779),
 (':loudly_crying_face:', 122740),
 (':fire:', 95684),
 (':smiling_face_with_heart-eyes:', 89766),
 (':female_sign:', 73358),
 (':clapping_hands:', 56803),
 (':folded_hands:', 54839),
 (':male_sign:', 54502),
 (':backhand_index_pointing_right:', 43654)]

In [11]:
# Column names for the (emoji, count) pairs held in `result`.
new_names = ['emoji', 'fre']
# RDD.toDF takes the schema as ONE argument (here a list of column names).
# Unpacking the list would pass 'emoji' as a schema string and 'fre' as the
# sampling ratio, which fails when `result` is the RDD built above.
result = result.toDF(new_names)
# Turn the raw counts into relative frequencies.
# NOTE(review): 1890000 is a hard-coded divisor — presumably the size of the
# dataset; confirm and consider deriving it from the data.
result = result.withColumn('fre', result['fre']/1890000)

+--------------------+--------------------+
|               emoji|                 fre|
+--------------------+--------------------+
|:face_with_tears_...| 0.16202698412698413|
|:backhand_index_p...|0.010362962962962963|
|:loudly_crying_face:| 0.06494179894179894|
|      :smiling_face:| 0.00924074074074074|
|        :heart_suit:|0.013633862433862434|
|:face_with_rollin...|0.013595238095238096|
|              :eyes:|0.014297354497354498|
|       :right_arrow:| 0.00927037037037037|
|           :OK_hand:|0.010248677248677248|
|:beaming_face_wit...| 0.00891005291005291|
|  :person_shrugging:|0.021713227513227514|
|:person_facepalming:|0.018807407407407408|
|:smiling_face_wit...|  0.0474952380952381|
|    :clapping_hands:|0.030054497354497355|
|        :weary_face:|0.022332275132275134|
|      :folded_hands:|0.029015343915343914|
|:rolling_on_the_f...|0.014669312169312169|
|     :flexed_biceps:|0.013198941798941799|
|      :winking_face:|0.010104232804232805|
|             :skull:| 0.0117507

In [13]:
# result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

## For every emoji, find the 3 other emojis that are used most frequently with it.

In [37]:
from itertools import combinations

# Start again from the raw 'emoji' column of the loaded DataFrame.
emojis = df.select('emoji')

def split_arr(line):
    """Return the emoji names of a row as a list.

    Args:
        line: object with an ``emoji`` attribute holding tokens separated by
            single spaces.

    Returns:
        list[str]: the text before the first comma of each token, in order.
    """
    return [token.split(',')[0] for token in line.emoji.split(" ")]

def combination(line):
    """Build directed co-occurrence pairs for the emojis of one sentence.

    Every unordered pair of entries in ``line`` is considered. Names outside
    the fixed list of ten frequent emojis are replaced by ``'others'``. A pair
    whose two members end up equal is dropped; otherwise it is emitted in both
    directions.

    Args:
        line: sequence of emoji names.

    Returns:
        list[tuple[str, str]]: ``(a, b)`` immediately followed by ``(b, a)``
        for each kept pair, in the order produced by ``combinations``.
    """
    frequent = (
        ':face_with_tears_of_joy:', ':red_heart:', ':loudly_crying_face:', ':fire:',
        ':smiling_face_with_heart-eyes:', ':female_sign:', ':clapping_hands:',
        ':folded_hands:', ':male_sign:', ':backhand_index_pointing_right:',
    )
    pairs = []
    for first, second in combinations(line, 2):
        left = first if first in frequent else 'others'
        right = second if second in frequent else 'others'
        if left == right:
            # Same emoji twice, or two infrequent emojis: nothing to record.
            continue
        pairs.append((left, right))
        pairs.append((right, left))
    return pairs

# Count every directed pair over all rows holding at least two emojis, then
# re-key each count as (emoji, (partner, count)).
emojis_comb = (
    emojis.rdd.map(split_arr)
    .filter(lambda names: len(names) > 1)
    .flatMap(combination)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))
)

def sort_func(x):
    """Sort key: the second element of ``x``."""
    return x[1]

def top_10(line):
    """Materialise the grouped values of one key as a tuple.

    Despite its name, this function neither sorts nor truncates the values.

    Args:
        line: pair ``(key, iterable_of_values)``.

    Returns:
        tuple: ``(key, values)`` with the values collected into a tuple in
        iteration order.
    """
    key, values = line[0], line[1]
    return (key, tuple(values))

# Gather all (partner, count) pairs under their emoji key.
# The original chain ended in a dangling line-continuation backslash before a
# blank line; it is removed so that deleting the blank line cannot fuse the
# next statement into this one.
result = emojis_comb.groupByKey().map(top_10)

result.take(1)

# for v, k in result:
#     print("{} {}".format(v, k))

[(':loudly_crying_face:',
  (('others', 7285),
   (':male_sign:', 941),
   (':clapping_hands:', 838),
   (':smiling_face_with_heart-eyes:', 835),
   (':backhand_index_pointing_right:', 42),
   (':fire:', 297),
   (':face_with_tears_of_joy:', 1710),
   (':folded_hands:', 806),
   (':female_sign:', 1816),
   (':red_heart:', 5380)))]

In [38]:
# Convert the grouped RDD to a DataFrame and give the auto-named columns
# (_1, _2) readable names.
result = result.toDF().selectExpr("_1 as emoji", "_2 as col")
result.show()

+--------------------+--------------------+
|               emoji|                 col|
+--------------------+--------------------+
|:loudly_crying_face:|[[others, 7285], ...|
|:face_with_tears_...|[[:male_sign:, 59...|
|              others|[[:female_sign:, ...|
|    :clapping_hands:|[[:face_with_tear...|
|:smiling_face_wit...|[[others, 10135],...|
|      :folded_hands:|[[others, 11455],...|
|         :male_sign:|[[:loudly_crying_...|
|              :fire:|[[:male_sign:, 85...|
|       :female_sign:|[[others, 69837],...|
|         :red_heart:|[[:face_with_tear...|
|:backhand_index_p...|[[:loudly_crying_...|
+--------------------+--------------------+



In [39]:
# Append the DataFrame through the MongoDB connector's default source.
# NOTE(review): no collection is named here, so the target presumably comes
# from the session's spark.mongodb.output.uri setting — confirm. With mode
# "append", re-running this cell writes the rows again.
result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

## For every emoji, determine whether it is used more often with words that begin with a lowercase letter or with words that begin with an uppercase letter.

In [9]:
def check_case(line):
    """Score each emoji of a row by the case of the word it points at.

    Each emoji token has the form ``name,place``. ``place`` is used as a
    1-based index into the sentence's space-separated words.

    Args:
        line: object with ``sentence`` and ``emoji`` string attributes.

    Returns:
        list[tuple[str, int]]: one ``(name, score)`` per emoji token, where
        the score is 1 if the word starts with an uppercase character, -1 if
        it starts with a lowercase character, and 0 otherwise.

    Raises:
        ValueError: if a token does not split into exactly two comma-separated
            parts, or ``place`` is not an integer.
        IndexError: if ``place`` lies outside the sentence.
    """
    res = []
    words = line.sentence.split(' ')

    for token in line.emoji.split(' '):
        e, place = token.split(',')
        word = words[int(place) - 1]

        # Slicing instead of indexing: consecutive spaces in the sentence
        # produce empty words, and word[0] would raise IndexError on them.
        # An empty word is scored as neutral.
        first = word[:1]
        if first.isupper():
            score = 1
        elif first.islower():
            score = -1
        else:
            score = 0
        res.append((e, score))
    return res

def lower_or_upper(line):
    """Label an emoji by the sign of its accumulated case score.

    Args:
        line: pair ``(emoji, score)``.

    Returns:
        tuple[str, str]: ``(emoji, 'lower')`` when the score is negative,
        otherwise ``(emoji, 'upper')``. A score of exactly zero therefore
        yields ``'upper'``.
    """
    label = 'lower' if line[1] < 0 else 'upper'
    return (line[0], label)

# Sum the per-occurrence case scores of every emoji and label the totals.
result = (
    df.rdd.flatMap(check_case)
    .reduceByKey(add)
    .map(lower_or_upper)
)

result.take(22)

[(':face_with_tears_of_joy:', 'lower'),
 (':backhand_index_pointing_down:', 'lower'),
 (':loudly_crying_face:', 'lower'),
 (':smiling_face:', 'lower'),
 (':heart_suit:', 'lower'),
 (':face_with_rolling_eyes:', 'lower'),
 (':eyes:', 'lower'),
 (':right_arrow:', 'upper'),
 (':OK_hand:', 'lower'),
 (':beaming_face_with_smiling_eyes:', 'lower'),
 (':person_shrugging:', 'lower'),
 (':person_facepalming:', 'lower'),
 (':smiling_face_with_heart-eyes:', 'lower'),
 (':clapping_hands:', 'lower'),
 (':weary_face:', 'lower'),
 (':folded_hands:', 'lower'),
 (':rolling_on_the_floor_laughing:', 'lower'),
 (':flexed_biceps:', 'lower'),
 (':winking_face:', 'lower'),
 (':skull:', 'lower'),
 (':party_popper:', 'upper'),
 (':male_sign:', 'upper')]

## Find the average number of emojis used per sentence.

In [8]:
# Work on the 'emoji' column of the loaded DataFrame only.
emojis = df.select('emoji')
def cal_count(line):
    """Count the space-separated emoji tokens of a row.

    Args:
        line: object with an ``emoji`` string attribute.

    Returns:
        int: the number of pieces produced by splitting on a single space
        (an empty string yields 1).
    """
    return len(line.emoji.split(" "))

# Emoji count of every row.
emojis_count = emojis.rdd.map(cal_count)
# Histogram: how many rows carry each emoji count, ordered by the count.
emojis_mapped = emojis_count.map(lambda n_emojis: (n_emojis, 1))
total_count = (
    emojis_mapped
    .reduceByKey(add)
    .sortByKey()
)
total_count.take(10)

[(1, 1299368),
 (2, 191538),
 (3, 52918),
 (4, 19750),
 (5, 6993),
 (6, 3593),
 (7, 1714),
 (8, 1074),
 (9, 769),
 (10, 415)]

In [10]:
# Convert the histogram to a DataFrame and name its two columns.
result = total_count.toDF().selectExpr("_1 as num", "_2 as counts")
result.show()

+---+-------+
|num| counts|
+---+-------+
|  1|1299368|
|  2| 191538|
|  3|  52918|
|  4|  19750|
|  5|   6993|
|  6|   3593|
|  7|   1714|
|  8|   1074|
|  9|    769|
| 10|    415|
| 11|    330|
| 12|    347|
| 13|    282|
| 14|    145|
| 15|    148|
| 16|    113|
| 17|     45|
| 18|    150|
| 19|     66|
| 20|     46|
+---+-------+
only showing top 20 rows



In [11]:
# Append the DataFrame through the MongoDB connector's default source.
# NOTE(review): no collection is named here, so the target presumably comes
# from the session's spark.mongodb.output.uri setting — confirm. With mode
# "append", re-running this cell writes the rows again.
result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

## For every emoji, find the position (head, middle, end) in which it occurs most often in a sentence.

0 - head, 1 - middle, 2 - end

In [68]:
def position(line):
    """Classify where each emoji of a row sits within its sentence.

    Each emoji token has the form ``name,place``. ``place`` is divided by the
    number of space-separated words in the sentence, and the ratio is mapped
    to a slot: 0 when below 1/3, 2 when above 2/3, and 1 otherwise.

    Args:
        line: object with ``sentence`` and ``emoji`` string attributes.

    Returns:
        list[tuple[str, int]]: one ``(name, slot)`` per emoji token.
    """
    word_total = len(line.sentence.split(' '))
    tagged = []

    for token in line.emoji.split(' '):
        e, place = token.split(',')
        ratio = int(place)/word_total
        if ratio < 1/3:
            slot = 0
        elif ratio > 2/3:
            slot = 2
        else:
            slot = 1
        tagged.append((e, slot))
    return tagged

def sort_func(x):
    """Sort key: the second element of ``x``."""
    return x[1]

def most_position(line):
    """Pick the slot with the highest count for one emoji.

    Args:
        line: pair ``(emoji, iterable_of_(slot, count))``.

    Returns:
        tuple: ``(emoji, slot)`` for the entry with the largest count. On a
        tie, the entry that comes first in the input wins.
    """
    ranked = list(line[1])
    # In-place stable sort, highest count first.
    ranked.sort(key=sort_func, reverse=True)
    best = ranked[0]
    return (line[0], best[0])


# Count every (emoji, slot) combination, regroup the counts per emoji and keep
# the most frequent slot.
# NOTE: this rebinds the name `position` from the function to the result RDD.
position = (
    df.rdd.flatMap(position)
    .map(lambda hit: ((hit[0], hit[1]), 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))
    .groupByKey()
    .map(most_position)
)

position.take(3)

[(':loudly_crying_face:', 2), (':right_arrow:', 2), (':eyes:', 2)]

## Analyze the relation between the length of sentence and the number of emoji used in the sentence.

In [16]:
def length_relation(line):
    """Measure a row: words in its sentence and number of emoji tokens.

    Args:
        line: object with ``sentence`` and ``emoji`` string attributes.

    Returns:
        tuple[int, int]: ``(sentence_length, emojis_length)``, both obtained
        by splitting the respective string on single spaces.
    """
    word_total = len(line.sentence.split(' '))
    emoji_total = len(line.emoji.split(' '))
    return (word_total, emoji_total)

# (sentence length, emoji count) of every row.
length = df.rdd.map(length_relation)
# How many rows share each (sentence length, emoji count) combination,
# ordered by that combination.
length_mapped = length.map(lambda shape: (shape, 1))
relation = (
    length_mapped
    .reduceByKey(add)
    .sortByKey()
)

relation.take(10)

[((5, 1), 110363),
 ((6, 1), 110027),
 ((6, 2), 6169),
 ((7, 1), 102133),
 ((7, 2), 8122),
 ((7, 3), 1099),
 ((8, 1), 96853),
 ((8, 2), 8779),
 ((8, 3), 2155),
 ((8, 4), 243)]

## For every emoji, count the sentences in which it is used more than once.

An emoji that appears two or more times in a sentence is counted once for that sentence, no matter how many times it repeats.

In [31]:
# Work on the 'emoji' column of the loaded DataFrame only.
emojis = df.select("emoji")

def split_arr(line):
    """List the emoji names that occur more than once in a row.

    Args:
        line: object with an ``emoji`` attribute holding tokens separated by
            single spaces; the name is the text before a token's first comma.

    Returns:
        list[str]: each repeated name once, in order of first appearance.
    """
    tally = collections.Counter(
        token.split(',')[0] for token in line.emoji.split(" ")
    )
    return [name for name, seen in tally.items() if seen > 1]

# Repeated emoji names of every row.
# Note: this rebinds `emojis` from a DataFrame to an RDD of lists.
emojis = emojis.rdd.map(split_arr)
# For every emoji, the number of rows in which it is repeated.
emojis_filter = (
    emojis.filter(lambda repeated: len(repeated) > 0)
    .flatMap(lambda repeated: repeated)
    .map(lambda name: (name, 1))
    .reduceByKey(add)
)

emojis_filter.take(3)

[(':face_with_tears_of_joy:', 5458),
 (':OK_hand:', 945),
 (':loudly_crying_face:', 2032)]

## For every emoji, find the average length, in words, of the sentences that contain it.

In [41]:
def average_length(line):
    """Pair every emoji of a row with the sentence's word count.

    Args:
        line: object with ``sentence`` and ``emoji`` string attributes.

    Returns:
        list[tuple[str, int]]: ``(name, sentence_length)`` per emoji token,
        where the length is the number of pieces obtained by splitting the
        sentence on single spaces.
    """
    word_total = len(line.sentence.split(' '))
    return [(token.split(',')[0], word_total) for token in line.emoji.split(' ')]
    
# Number of occurrences of every (emoji, sentence length) combination.
emojis_ave = (
    df.rdd.flatMap(average_length)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
)

# Per emoji: the total number of occurrences.
sentences_count = (
    emojis_ave.map(lambda kv: (kv[0][0], kv[1]))
    .reduceByKey(add)
)

# Per emoji: the summed sentence length over all occurrences.
words_count = (
    emojis_ave.map(lambda kv: (kv[0][0], kv[0][1]*kv[1]))
    .reduceByKey(add)
)

# Per emoji: summed length divided by occurrences, rounded to 0 decimals.
ave_result = (
    sentences_count.join(words_count)
    .map(lambda kv: (kv[0], round(kv[1][1] / kv[1][0], 0)))
)

ave_result.take(10)

[(':face_with_tears_of_joy:', 13.0),
 (':face_with_rolling_eyes:', 14.0),
 (':loudly_crying_face:', 13.0),
 (':backhand_index_pointing_down:', 17.0),
 (':beaming_face_with_smiling_eyes:', 13.0),
 (':OK_hand:', 13.0),
 (':eyes:', 13.0),
 (':right_arrow:', 17.0),
 (':smiling_face:', 14.0),
 (':heart_suit:', 16.0)]