# SI608 Task 2 Recommendation network

First, we import the required packages and define the helper function used throughout the notebook.

In [1]:
import emoji # pip install emoji
import re
import os
import sys
import time
import string
import json
import pandas as pd
import csv
import sklearn

from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [13]:
def gen_emoji_to_word_dict(num_line=1000, filename='emojitweets-01-04-2018.txt', stopword_filename="english.txt", threshold=2):
    '''
    Build the raw co-occurrence dict {emoji:{word:weight}} from a tweet file.

    The first `num_line` lines of `filename` are copied to `tmp_<num_line>.txt`
    and that sample is scanned line by line. Every token matching `:[\\w-]*:`
    after `emoji.demojize` is treated as an emoji; the remaining lower-cased,
    whitespace-separated tokens are the words. `weight` is the number of lines
    on which the word and the emoji appear together.

    :param num_line: how many leading lines of `filename` are used
    :param filename: the filename of the raw data
    :param stopword_filename: a stopword list (one word per line), which will be provided
    :param threshold: if `weight` in {emoji:{word:weight}} is smaller than `threshold`, then the word is abandoned.
    :return: dict {emoji:{word:weight}}; emoji left without any word are omitted
    :raises OSError: if `filename` or `stopword_filename` cannot be opened
    '''
    time_start = time.time()
    tmp_filename = "tmp_{}.txt".format(num_line)
    # The log line keeps its historical wording: it shows the equivalent shell
    # command, but the copy itself is done in Python so it works without a
    # Unix shell and a missing input file raises instead of being ignored.
    cmdline = "head -n {} {} > tmp_{}.txt".format(num_line, filename, num_line)
    print("processing:", cmdline)
    with open(filename, 'rb') as source, open(tmp_filename, 'wb') as sample:
        # zip() stops as soon as range() is exhausted, so at most num_line
        # lines are read; binary mode copies the bytes unchanged.
        for _, raw_line in zip(range(num_line), source):
            sample.write(raw_line)
    print("complete, {} second is used".format(time.time()-time_start))

    time_start = time.time()
    print("generating stopword from", stopword_filename)
    with open(stopword_filename, 'r', encoding='utf-8') as fs:
        stopword = {stop_line.strip() for stop_line in fs}
    print("complete, {} second is used".format(time.time()-time_start))

    time_start = time.time()
    print("generating dict from", tmp_filename)
    emoji_pattern = re.compile(r":[\w-]*:")
    sl = {}
    with open(tmp_filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Turn every emoji into its ":name:" alias so the regex can find it.
            line = emoji.demojize(line.strip())
            emoji_set = set(emoji_pattern.findall(line))
            for emoji_item in emoji_set:
                line = line.replace(emoji_item, "")
            # `word not in string.punctuation` is a substring test: it drops
            # tokens that are a contiguous piece of string.punctuation, which
            # includes every single punctuation character.
            word_set = {
                word for word in line.lower().split()
                if word not in stopword and word not in string.punctuation
            }
            for emoji_item in emoji_set:
                # An emoji is registered even when the line has no usable word.
                word_counts = sl.setdefault(emoji_item, {})
                for word in word_set:
                    word_counts[word] = word_counts.get(word, 0) + 1
    print("complete, {} second is used".format(time.time()-time_start))

    time_start = time.time()
    print("filtering dict")
    sl_tmp = {}
    for key, word_counts in sl.items():
        kept = {word: weight for word, weight in word_counts.items() if weight >= threshold}
        if kept:
            sl_tmp[key] = kept
    print("complete, {} second is used".format(time.time()-time_start))

    return sl_tmp

This step generates a dict of the form {emoji:{word:weight}}. With the parameters below, processing typically takes about 270 seconds on an i7-9750H.

In [17]:
# Scan the first 1,000,000 lines and keep only (emoji, word) pairs that
# co-occur on at least 10 lines.
sl = gen_emoji_to_word_dict(num_line=1000000, threshold=10)

processing: head -n 1000000 emojitweets-01-04-2018.txt > tmp_1000000.txt
complete, 0.2802700996398926 second is used
generating stopword from english.txt
complete, 0.0009958744049072266 second is used
generating dict from tmp_1000000.txt
complete, 264.52676796913147 second is used
filtering dict
complete, 0.3306615352630615 second is used


Let's explore the dict a little.

In [18]:
# Number of emoji that kept at least one word after thresholding.
len(sl)

1309

In [19]:
# Five words with the highest weight for the rocket emoji.
sorted(sl[':rocket:'].items(), key=lambda pair: pair[1], reverse=True)[:5]

[('followers', 269), ('…', 256), ('grow', 206), ('&amp;', 201), (':)', 201)]

You can save the dict to disk and load it next time to avoid recomputing it.

In [20]:
# Cache the dict on disk; the `with` block closes the file even if
# serialisation fails part-way through.
with open("myfile.json", "w") as out_file:
    json.dump(sl, out_file)

In [2]:
# Reload the cached dict; the `with` block closes the file even if the
# JSON cannot be parsed.
with open("myfile.json", "r", encoding='utf-8') as f:
    sl = json.load(f)

In [3]:
# Sanity check after reloading: the five highest-weight rocket words.
sorted(sl[':rocket:'].items(), key=lambda pair: pair[1], reverse=True)[:5]

[('followers', 269), ('…', 256), ('grow', 206), ('&amp;', 201), (':)', 201)]

Then we need to extract the word corpus and find out its size.

In [4]:
# Despite its name this is a dict: word -> number of emoji whose word table
# contains that word.
all_word_set = {}
for word_weights in sl.values():
    for word in word_weights:
        all_word_set[word] = all_word_set.get(word, 0) + 1

In [5]:
# Number of distinct words across all emoji.
len(all_word_set)

19217

If you want to filter the words, change the filter number below. It is 1 by default, which means all words are accepted.

TODO: !!! This filter_num is not used in the code following, you should set it to 1 at least for now to make the following code runnable.

In [6]:
# Minimum number of emoji a word must be associated with to be kept
# (1 keeps every word).
filter_num = 1

In [7]:
# Keep the (word, emoji_count) pairs whose count reaches filter_num.
new_word_list = [pair for pair in all_word_set.items() if pair[1] >= filter_num]
len(new_word_list)

19217

In [8]:
# Peek at the first five (word, emoji_count) pairs.
new_word_list[0:5]

[('game', 164), ('2', 275), ('rating:', 3), ('gave', 52), ('5/5', 4)]

Here we generate a map from emoji/word to its unique id.

In [9]:
# Rebuild the plain word list directly from all_word_set. The cell used to
# rewrite new_word_list from (word, count) pairs into words in place, so
# running it a second time silently replaced every word by its first character.
new_word_list = [word for word, emoji_count in all_word_set.items() if emoji_count >= filter_num]
new_emoji_list = list(sl.keys())
new_word_set = set(new_word_list)

# (id, name) pairs; the id is simply the position in the list.
emoji_enu = list(enumerate(new_emoji_list))
word_enu = list(enumerate(new_word_list))

# Lookup tables from name to integer id.
map_emoji = {emoji_name: emoji_id for emoji_id, emoji_name in emoji_enu}
map_word = {word: word_id for word_id, word in word_enu}

In [10]:
# Integer id assigned to the rocket emoji.
map_emoji[":rocket:"]

0

In [11]:
# Integer id assigned to the word "history".
map_word["history"]

902

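Below are two ways of turning a raw weight into a rating from 1 to 10: the first uses min-max scaling, the second uses the word's rank within its emoji, which spreads the ratings roughly uniformly. Both cells write to `all_rec.csv`, so the cell that runs last determines the file that is loaded afterwards.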

In [12]:
# Min-max scaling: map each weight to an integer rating in 1..10 relative to
# the smallest and largest weight of the same emoji.
header = ["word_id", "emoji_id", "weight"]
with open('all_rec.csv', 'w', newline='') as out:  # closed even if a row fails
    csv_write = csv.writer(out, dialect='excel')
    csv_write.writerow(header)
    for key, word_counts in sl.items():
        maxval = max(word_counts.values())
        minval = min(word_counts.values())
        # The +0.01 keeps the ratio strictly below 1 (rating never exceeds 10)
        # and avoids dividing by zero when all weights are equal.
        span = maxval - minval + 0.01
        for word, count in word_counts.items():
            if word not in map_word:
                # Word was removed by filter_num; skip it instead of raising KeyError.
                continue
            item = [map_word[word], map_emoji[key], int((count-minval)/span*10)+1]
            csv_write.writerow(item)

In [13]:
# Rank scaling: the rating (1..10) comes from the position of the weight in
# the sorted weights of the same emoji; tied weights share the lowest position.
header = ["word_id", "emoji_id", "weight"]
with open('all_rec.csv', 'w', newline='') as out:  # closed even if a row fails
    csv_write = csv.writer(out, dialect='excel')
    csv_write.writerow(header)
    for key, word_counts in sl.items():
        val_list = sorted(word_counts.values())
        length = len(val_list)
        # First position of every distinct weight, computed once per emoji
        # instead of calling val_list.index() (a linear scan) for every word.
        first_position = {}
        for position, value in enumerate(val_list):
            first_position.setdefault(value, position)
        for word, count in word_counts.items():
            if word not in map_word:
                # Word was removed by filter_num; skip it instead of raising KeyError.
                continue
            item = [map_word[word], map_emoji[key], int(first_position[count]/length*10)+1]
            csv_write.writerow(item)

Load the CSV you just wrote and explore it.

In [14]:
# Read the (word_id, emoji_id, weight) rows back from disk.
rec_df = pd.read_csv("all_rec.csv")

In [15]:
# Largest rating present in the file.
max(rec_df["weight"].to_numpy())

10

In [16]:
# Smallest rating present in the file.
min(rec_df["weight"].to_numpy())

1

In [17]:
# Shuffle the rows with a fixed seed so the order is repeatable.
# Note: re-running this cell shuffles the already shuffled frame again.
rec_df = sklearn.utils.shuffle(rec_df,random_state=671)

In [18]:
# Display the shuffled frame.
rec_df

Unnamed: 0,word_id,emoji_id,weight
83264,21,228,6
106508,4939,492,5
29304,11633,33,6
66177,2802,132,4
110933,16415,595,2
...,...,...,...
32408,2370,37,8
105135,417,475,5
83679,4657,228,10
4385,2470,11,7


Now we can fit a baseline model (SVD) with 3-fold cross-validation and report the mean absolute error (MAE) of each fold.

In [19]:
# Baseline: matrix-factorisation SVD evaluated with 3-fold cross-validation.
# Columns are passed in the order emoji_id, word_id, weight, so emoji take the
# user role and words the item role.
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(rec_df[['emoji_id', 'word_id', 'weight']], reader)
# Both the fold split and the SVD initialisation are random; seed them (same
# seed as the shuffle above) so the reported MAE values are reproducible.
kf = KFold(n_splits=3, random_state=671)
for trainset, testset in kf.split(data):
    algo = SVD(n_epochs=100, random_state=671)
    algo.fit(trainset)
    predictions = algo.test(testset)
    print(accuracy.mae(predictions, verbose=False))

2.3164549156315317
2.3089295396279628
2.316460608671727
