In [7]:
import os
from pyspark.sql import SparkSession

from operator import add

In [2]:
# Create the SparkSession object with the MongoDB Spark connector attached.
# The input and output URIs are identical, so reads and writes target the same
# MongoDB namespace on the local host. `spark.jars.packages` names the
# connector artifact to be resolved when the session starts.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [3]:
# Load a DataFrame through the MongoDB connector. No URI is passed here, so
# the source presumably comes from the session's spark.mongodb.input.uri.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Find the appearance frequency of every emoji.

In [None]:
# Keep only the `emoji` column; its rows are cleaned by split_str further
# down in this cell.
# emoji = ":red_heart:"
emojis = df.select('emoji')
# emojis.show()
def split_str(line):
    """Strip the trailing ",<suffix>" from every entry of a row's emoji field.

    `line.emoji` is split on single spaces, each piece is cut at its first
    comma, and the remaining pieces are joined back with single spaces.

    Args:
        line: object exposing a string attribute `emoji`
            (e.g. a row whose field reads ":red_heart:,18").

    Returns:
        str: the cleaned entries, space-separated, in their original order.
    """
    names = [entry.split(',')[0] for entry in line.emoji.split(" ")]
    return " ".join(names)

# Rebinds `emojis` from the one-column DataFrame to an RDD holding one
# cleaned string per row; the counting cell below consumes this RDD.
emojis = emojis.rdd.map(split_str)
# Preview the first three cleaned rows.
emojis.take(3)

In [None]:
# Word-count style tally: emit one (name, 1) pair per emoji occurrence and sum
# the ones per name. Only 10 pairs are brought back to the driver and no
# ordering is applied, so this is a sample of the counts, not a top-10
# (a sortBy on the count before take() would give a ranking).
result = (
    emojis
    .flatMap(lambda names: names.split(" "))
    .map(lambda name: (name, 1))
    .reduceByKey(add)
    .take(10)
)
for name, count in result:
    print("{} {}".format(name, count))

## For every emoji, find the 3 other emojis that are used most frequently with it.

In [None]:
from itertools import combinations

# Re-select the column from `df`: an earlier cell rebinds `emojis` to an RDD
# of strings, and this cell needs the row objects again.
emojis = df.select('emoji')

def split_arr(line):
    """Return the entries of a row's emoji field with their suffix removed.

    `line.emoji` is split on single spaces and each piece is cut at its
    first comma.

    Args:
        line: object exposing a string attribute `emoji`.

    Returns:
        list[str]: the cleaned entries in their original order.
    """
    return [entry.split(',')[0] for entry in line.emoji.split(" ")]

def combination(line):
    """List every unordered pair of items in `line`, rendered as strings.

    Args:
        line: sequence of emoji names taken from one row.

    Returns:
        list[str]: `str()` of each 2-tuple produced by
        `itertools.combinations(line, 2)`, in that order. The list is empty
        when `line` has fewer than two items.
    """
    # The previous version ended with a bare `return`, so it handed None to
    # the next pipeline stage instead of the pairs it had just built.
    return [str(pair) for pair in combinations(line, 2)]
# Per row: emoji names -> keep rows with at least two names -> pair strings ->
# one space-joined string per row. The chain is parenthesized so it does not
# depend on a backslash continuation running into a comment line.
# NOTE: the join step needs `combination` to return a list of strings.
# TODO: counting each pair and ranking the top 3 partners per emoji is still
# missing from this pipeline.
result = (
    emojis.rdd
    .map(split_arr)
    .filter(lambda names: len(names) > 1)
    .map(combination)
    .map(lambda pairs: " ".join(pairs))
)
result.take(3)

## For every emoji, determine whether it is used more often with words that begin with a lowercase letter or with words that begin with an uppercase letter.

In [9]:
def check_case(line):
    """Vote, for each emoji of a row, on the case of the word just before it.

    Args:
        line: object with string attributes `sentence` and `emoji`. `emoji`
            holds whitespace-separated "name,position" entries; `position`
            is assumed to be the 0-based index of the emoji among the
            space-separated tokens of `sentence` (TODO confirm against the
            data producer).

    Returns:
        list[tuple[str, int]]: one `(name, vote)` per entry, where vote is
        +1 if the preceding word starts with an uppercase character, -1 if
        it starts with a lowercase character, and 0 otherwise. A vote of 0
        is also used when there is no usable preceding word (the emoji is
        the first token, the position lies outside the sentence, or the
        preceding token is empty).
    """
    votes = []
    # Split on single spaces: positions index into exactly this tokenization.
    words = line.sentence.split(' ')

    # split() without an argument skips the empty tokens that stray spaces or
    # an empty field would produce; those cannot be unpacked as name,position.
    for entry in line.emoji.split():
        name, place = entry.split(',')
        prev_index = int(place) - 1

        # An emoji at position 0 has no preceding word. Without this guard the
        # index -1 silently wraps around to the LAST word of the sentence.
        word = words[prev_index] if 0 <= prev_index < len(words) else ''

        # Slicing yields '' for an empty token instead of raising IndexError;
        # ''.isupper() and ''.islower() are both False, giving a neutral vote.
        initial = word[:1]
        if initial.isupper():
            vote = 1
        elif initial.islower():
            vote = -1
        else:
            vote = 0
        votes.append((name, vote))
    return votes

def lower_or_upper(line):
    """Convert a `(name, net_vote)` pair into a `(name, label)` pair.

    Args:
        line: 2-item sequence; item 0 is the key, item 1 a number.

    Returns:
        tuple: `(name, 'lower')` when the number is negative, otherwise
        `(name, 'upper')`.

    NOTE(review): a net vote of exactly 0 is labelled 'upper' as well --
    confirm that ties are meant to be reported that way.
    """
    name, score = line[0], line[1]
    label = 'lower' if score < 0 else 'upper'
    return (name, label)

# Flatten every row's (name, vote) pairs into one stream, sum the votes per
# emoji, then label each net total as 'lower' or 'upper'.
result = (
    df.rdd
    .flatMap(check_case)
    .reduceByKey(add)
    .map(lower_or_upper)
)

result.take(22)

[(':face_with_tears_of_joy:', 'lower'),
 (':backhand_index_pointing_down:', 'lower'),
 (':loudly_crying_face:', 'lower'),
 (':smiling_face:', 'lower'),
 (':heart_suit:', 'lower'),
 (':face_with_rolling_eyes:', 'lower'),
 (':eyes:', 'lower'),
 (':right_arrow:', 'upper'),
 (':OK_hand:', 'lower'),
 (':beaming_face_with_smiling_eyes:', 'lower'),
 (':person_shrugging:', 'lower'),
 (':person_facepalming:', 'lower'),
 (':smiling_face_with_heart-eyes:', 'lower'),
 (':clapping_hands:', 'lower'),
 (':weary_face:', 'lower'),
 (':folded_hands:', 'lower'),
 (':rolling_on_the_floor_laughing:', 'lower'),
 (':flexed_biceps:', 'lower'),
 (':winking_face:', 'lower'),
 (':skull:', 'lower'),
 (':party_popper:', 'upper'),
 (':male_sign:', 'upper')]