In [21]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

In [2]:
# Create (or reuse) the SparkSession. Both the MongoDB input and output URIs
# point at mongodb://127.0.0.1/bigdata.raw, and the MongoDB Spark connector
# is requested through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [3]:
# Load the data through the MongoDB connector's data source (the URI comes
# from the session configuration) and display the first rows.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Find the appearance frequency of every emoji.

In [57]:
# Keep only the `emoji` column; each cell holds space-separated
# "name,position" tokens such as ":red_heart:,18".
emojis = df.select('emoji')
def split_str(line):
    """Return the emoji names of one row as a single space-separated string.

    `line.emoji` is split on single spaces and, for every token, only the
    text before its first comma is kept.
    """
    names = [token.split(',')[0] for token in line.emoji.split(" ")]
    return " ".join(names)

# Convert every row into a space-separated string of emoji names. From here
# on `emojis` is an RDD of strings, no longer a DataFrame.
emojis = emojis.rdd.map(split_str)
# Peek at the first three converted rows.
emojis.take(3)

[':red_heart:', ':person_shrugging: :female_sign:', ':face_with_tears_of_joy:']

In [58]:
# Classic word count over emoji names: emit (name, 1) for every occurrence
# and sum per name. No sort is applied, so the ten pairs fetched here come
# back in no particular order.
result = (
    emojis
    .flatMap(lambda names: names.split(" "))
    .map(lambda name: (name, 1))
    .reduceByKey(add)
    .take(10)
)
for name, count in result:
    print("{} {}".format(name, count))

:face_with_tears_of_joy: 306231
:backhand_index_pointing_down: 19586
:loudly_crying_face: 122740
:smiling_face: 17465
:heart_suit: 25768
:face_with_rolling_eyes: 25695
:eyes: 27022
:right_arrow: 17521
:OK_hand: 19370
:beaming_face_with_smiling_eyes: 16840


## For every emoji, find the 3 other emojis that are used most frequently with it.

In [61]:
from itertools import combinations

# Start again from the raw `emoji` column of the loaded DataFrame.
emojis = df.select('emoji')

def split_arr(line):
    """Return the emoji names of one row as a list, in order of appearance.

    Every space-separated token of `line.emoji` contributes the text before
    its first comma; repeated names are kept.
    """
    return [token.split(',')[0] for token in line.emoji.split(" ")]

def combination(line):
    """Return every ordered pair of differing names drawn from `line`.

    Each unordered pair of positions in `line` is visited once. When the two
    names differ, both (a, b) and (b, a) are emitted, in that order; pairs
    of equal names are skipped.
    """
    ordered_pairs = []
    for first, second in combinations(line, 2):
        if first == second:
            continue
        ordered_pairs.extend([(first, second), (second, first)])
    return ordered_pairs

# Count co-occurrences: rows holding at least two emoji are expanded into
# ordered pairs, every pair is counted, and the totals are re-keyed by the
# pair's first emoji as (emoji, (partner, count)).
emojis_comb = (
    emojis.rdd
    .map(split_arr)
    .filter(lambda names: len(names) > 1)
    .flatMap(combination)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))
)

def sort_func(x):
    """Sort key: the second element of `x`."""
    return x[1]


def top_3(line):
    """Reduce one (key, pairs) group to (key, leading elements of its top three pairs).

    The pairs in `line[1]` are ordered by their second element, largest
    first (ties keep their incoming order). Fewer than three pairs simply
    yields a shorter list.
    """
    ranked = sorted(line[1], key=sort_func, reverse=True)
    return (line[0], [entry[0] for entry in ranked[:3]])

# For every emoji, keep its three most frequent partners. The chain is
# wrapped in parentheses rather than ending on a trailing backslash: the
# backslash form only parsed because the next line happened to be blank,
# and would have swallowed the following statement otherwise.
result = (
    emojis_comb
    .groupByKey()
    .map(top_3)
)

result.take(20)

[(':loudly_crying_face:',
  [':red_heart:', ':female_sign:', ':face_with_tears_of_joy:']),
 (':face_with_tears_of_joy:',
  [':female_sign:', ':male_sign:', ':person_facepalming:']),
 (':backhand_index_pointing_down:',
  [':fire:', ':backhand_index_pointing_right:', ':double_exclamation_mark:']),
 (':OK_hand:', [':red_heart:', ':fire:', ':hundred_points:']),
 (':smiling_face:',
  [':red_heart:', ':two_hearts:', ':smiling_face_with_heart-eyes:']),
 (':heart_suit:', [':fire:', ':red_heart:', ':smiling_face_with_heart-eyes:']),
 (':right_arrow:', [':heavy_check_mark:', ':fire:', ':red_heart:']),
 (':face_with_rolling_eyes:',
  [':female_sign:', ':person_facepalming:', ':male_sign:']),
 (':eyes:', [':face_with_tears_of_joy:', ':male_sign:', ':fire:']),
 (':beaming_face_with_smiling_eyes:',
  [':red_heart:',
   ':face_with_tears_of_joy:',
   ':smiling_face_with_heart-eyes:']),
 (':clapping_hands:',
  [':red_heart:', ':raising_hands:', ':face_with_tears_of_joy:']),
 (':smiling_face_with_heart

## For every emoji, determine whether it is used more often with words that begin with a lowercase letter or with words that begin with an uppercase letter.

In [9]:
def check_case(line):
    """Score the word in front of each emoji of one row by its letter case.

    Every token of `line.emoji` is split on its comma into a name and a
    position (apparently the emoji's 0-based token index in the sentence --
    inferred from the sample rows, confirm against the data source). The
    token of `line.sentence` (split on single spaces) just before that
    position is inspected.

    Returns a list of (emoji_name, score) pairs. The score is 1 when the
    inspected word starts with an uppercase character, -1 when it starts
    with a lowercase character, and 0 otherwise. An emoji that has no
    preceding word (position 0) or is preceded by an empty token (for
    example from doubled spaces) also scores 0.

    Raises:
        ValueError: if a token does not contain exactly one comma or its
            position is not an integer.
        IndexError: if the position points past the end of the sentence.
    """
    words = line.sentence.split(' ')
    scored = []

    for token in line.emoji.split(' '):
        name, place = token.split(',')
        prev_index = int(place) - 1

        # Position 0 used to evaluate sentence[-1] and silently pick up the
        # LAST word of the sentence; with no preceding word, stay neutral.
        # The [:1] slice keeps an empty token from raising IndexError.
        first_char = words[prev_index][:1] if prev_index >= 0 else ''

        if first_char.isupper():
            scored.append((name, 1))
        elif first_char.islower():
            scored.append((name, -1))
        else:
            scored.append((name, 0))
    return scored

def lower_or_upper(line):
    """Turn an (emoji, net score) pair into (emoji, 'lower') or (emoji, 'upper').

    A negative score maps to 'lower'; zero and positive scores both map to
    'upper'.
    """
    label = 'lower' if line[1] < 0 else 'upper'
    return (line[0], label)

# Sum the per-occurrence case scores of every emoji, then label each total.
result = (
    df.rdd
    .flatMap(check_case)
    .reduceByKey(add)
    .map(lower_or_upper)
)

result.take(22)

[(':face_with_tears_of_joy:', 'lower'),
 (':backhand_index_pointing_down:', 'lower'),
 (':loudly_crying_face:', 'lower'),
 (':smiling_face:', 'lower'),
 (':heart_suit:', 'lower'),
 (':face_with_rolling_eyes:', 'lower'),
 (':eyes:', 'lower'),
 (':right_arrow:', 'upper'),
 (':OK_hand:', 'lower'),
 (':beaming_face_with_smiling_eyes:', 'lower'),
 (':person_shrugging:', 'lower'),
 (':person_facepalming:', 'lower'),
 (':smiling_face_with_heart-eyes:', 'lower'),
 (':clapping_hands:', 'lower'),
 (':weary_face:', 'lower'),
 (':folded_hands:', 'lower'),
 (':rolling_on_the_floor_laughing:', 'lower'),
 (':flexed_biceps:', 'lower'),
 (':winking_face:', 'lower'),
 (':skull:', 'lower'),
 (':party_popper:', 'upper'),
 (':male_sign:', 'upper')]

## Find the average number of emojis used in a sentence.

In [12]:
# Start again from the raw `emoji` column of the loaded DataFrame.
emojis = df.select('emoji')
def cal_count(line):
    """Return how many space-separated tokens `line.emoji` contains."""
    return len(line.emoji.split(" "))

# Number of emoji tokens in each sentence.
emojis_count = emojis.rdd.map(cal_count)

# This section asks for the average, which the histogram below never
# produces on its own: take the mean of the per-sentence counts and report it.
emojis_average = emojis_count.mean()
print("average emoji per sentence: {:.3f}".format(emojis_average))

# Histogram: how many sentences contain 1, 2, 3, ... emoji, ordered by count.
emojis_mapped = emojis_count.map(lambda x: (x, 1))
total_count = emojis_mapped.reduceByKey(add).sortByKey()
total_count.take(10)

[(1, 1299368),
 (2, 191538),
 (3, 52918),
 (4, 19750),
 (5, 6993),
 (6, 3593),
 (7, 1714),
 (8, 1074),
 (9, 769),
 (10, 415)]

## For every emoji, find the position (head, middle, end) at which it occurs most often in a sentence.

In [None]:
def position(line):
    """Bucket every emoji of one row into head (0), middle (1) or end (2).

    For each "name,place" token of `line.emoji`, `place` is divided by the
    number of space-separated tokens in `line.sentence`. A ratio below 1/3
    gives bucket 0, a ratio above 2/3 gives bucket 2, and everything in
    between (both bounds included) gives bucket 1.

    Returns a list of (emoji_name, bucket) pairs.
    """
    token_total = len(line.sentence.split(' '))
    buckets = []

    for token in line.emoji.split(' '):
        name, place = token.split(',')
        ratio = int(place) / token_total
        if ratio < 1/3:
            bucket = 0
        elif ratio > 2/3:
            bucket = 2
        else:
            bucket = 1
        buckets.append((name, bucket))
    return buckets

def sort_func(x):
    """Sort key: the second element of `x`."""
    return x[1]


def most_position(line):
    """Reduce one (emoji, pairs) group to (emoji, leading element of the top pair).

    The pairs in `line[1]` are ordered by their second element, largest
    first (ties keep their incoming order), and the first element of the
    winning pair is returned alongside the key.
    """
    ranked = sorted(line[1], key=sort_func, reverse=True)
    best = ranked[0]
    return (line[0], best[0])


# Count (emoji, bucket) occurrences, regroup the counts per emoji and keep
# the most frequent bucket. The result gets its own name: binding it to
# `position` would replace the `position` function with an RDD and make this
# cell fail when it is run a second time.
emoji_positions = (
    df.rdd
    .flatMap(position)
    .map(lambda x: ((x[0], x[1]), 1))
    .reduceByKey(add)
    .map(lambda x: (x[0][0], (x[0][1], x[1])))
    .groupByKey()
    .map(most_position)
)

emoji_positions.take(3)

## Analyze the relation between the length of sentence and the number of emoji used in the sentence.

In [16]:
def length_relation(line):
    """Return (token count of `line.sentence`, token count of `line.emoji`).

    Both strings are split on single spaces.
    """
    return (len(line.sentence.split(' ')), len(line.emoji.split(' ')))

# Count how many rows share each (sentence length, emoji count) pair and
# order the pairs ascending.
length = df.rdd.map(length_relation)
length_mapped = length.map(lambda pair: (pair, 1))
relation = (
    length_mapped
    .reduceByKey(add)
    .sortByKey()
)

relation.take(10)

[((5, 1), 110363),
 ((6, 1), 110027),
 ((6, 2), 6169),
 ((7, 1), 102133),
 ((7, 2), 8122),
 ((7, 3), 1099),
 ((8, 1), 96853),
 ((8, 2), 8779),
 ((8, 3), 2155),
 ((8, 4), 243)]

## For every emoji, summarize how often it is used more than once in a sentence.

An emoji that appears more than once in a sentence is counted once for that sentence, no matter how many times it is repeated.

In [31]:
# Start again from the raw `emoji` column of the loaded DataFrame.
emojis = df.select("emoji")

def split_arr(line):
    """Return the emoji names that occur more than once in one row.

    Every space-separated token of `line.emoji` contributes the text before
    its first comma. A name is listed once in the result, however many times
    it repeats; names seen only once are left out.

    NOTE: this shares its name with the earlier `split_arr` helper in this
    notebook, which returns every name instead.
    """
    occurrences = collections.Counter(
        token.split(',')[0] for token in line.emoji.split(" ")
    )
    return [name for name, times in occurrences.items() if times > 1]

# Per row, the names repeated inside that row; then count, for every name,
# the number of rows in which it repeats.
emojis = emojis.rdd.map(split_arr)
emojis_filter = (
    emojis
    .filter(lambda repeated: len(repeated) > 0)
    .flatMap(lambda repeated: repeated)
    .map(lambda name: (name, 1))
    .reduceByKey(add)
)

emojis_filter.take(3)

[(':face_with_tears_of_joy:', 5458),
 (':OK_hand:', 945),
 (':loudly_crying_face:', 2032)]

## For every emoji, find the average word length in the sentences that contain it.

In [41]:
def average_length(line):
    """Pair every emoji name of one row with that row's sentence length.

    The length is the number of space-separated tokens in `line.sentence`;
    the names are the text before the first comma of each space-separated
    token in `line.emoji`. Returns a list of (emoji_name, length) pairs.
    """
    word_total = len(line.sentence.split(' '))
    return [(token.split(',')[0], word_total) for token in line.emoji.split(' ')]
    
# ((emoji, sentence length), number of occurrences with that length)
emojis_ave = (
    df.rdd
    .flatMap(average_length)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
)

# emoji -> total number of occurrences
sentences_count = (
    emojis_ave
    .map(lambda kv: (kv[0][0], kv[1]))
    .reduceByKey(add)
)

# emoji -> sentence lengths summed over all occurrences
words_count = (
    emojis_ave
    .map(lambda kv: (kv[0][0], kv[0][1] * kv[1]))
    .reduceByKey(add)
)

# emoji -> summed length / occurrences, rounded to a whole number
ave_result = (
    sentences_count
    .join(words_count)
    .map(lambda kv: (kv[0], round(kv[1][1] / kv[1][0], 0)))
)

ave_result.take(10)

[(':face_with_tears_of_joy:', 13.0),
 (':face_with_rolling_eyes:', 14.0),
 (':loudly_crying_face:', 13.0),
 (':backhand_index_pointing_down:', 17.0),
 (':beaming_face_with_smiling_eyes:', 13.0),
 (':OK_hand:', 13.0),
 (':eyes:', 13.0),
 (':right_arrow:', 17.0),
 (':smiling_face:', 14.0),
 (':heart_suit:', 16.0)]