# SI608 Task 2 Recommendation network

First, we need to import and define some important function and repo

In [1]:
import emoji # pip install emoji
import re
import os
import sys
import time
import string
import json
import pandas as pd
import csv
import sklearn

from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [5]:
def gen_emoji_to_word_dict(num_line=1000, filename='emojitweets-01-04-2018.txt', stopword_filename="english.txt", threshold=2):
    '''
    This function helps you to get the raw dict format: {emoji:{word:weight}}
    :param num_line: how many lines are used, this is implemented by a unix-like command line
    :param filename: the filename of the raw data
    :param stopword_filename: a stopword list, which will be provided
    :param threshold: if `weight` in {emoji:{word:weight}} is smaller than `threshold`, then the word is abandoned.
    '''
    time_start = time.time()
    
    # For windows OS
    # Note that type command on windows does not receive -n as parameter
    # Extract first lines before running the program 
    cmdline = "type -n {} {} > tmp_{}.txt".format(num_line, filename, num_line)
    
    # For Unix OS
    # cmdline = "head -n {} {} > tmp_{}.txt".format(num_line, filename, num_line)
    print("processing:", cmdline)
    os.system(cmdline)
    print("complete, {} second is used".format(time.time()-time_start))
    
    time_start = time.time()
    print("generating stopword from", stopword_filename)
    stopword = set()
    fs = open(stopword_filename, 'r', encoding='utf-8')
    line = fs.readline()
    while line:
        stopword.add(line.strip())
        line = fs.readline()
    fs.close()
    print("complete, {} second is used".format(time.time()-time_start))
    
    time_start = time.time()
    print("generating dict from", "tmp_{}.txt".format(num_line))
    f = open("tmp_{}.txt".format(num_line), 'r', encoding='utf-8')
    line = f.readline()
    sl = {}
    while line:
        line = line.strip()
        line = emoji.demojize(line) 
        emoji_list = re.findall(r":[\w-]*:",line)
        emoji_set = set(emoji_list)
        for emoji_item in emoji_set:
            line = line.replace(emoji_item, "")
        word_set = set()
        for word in line.lower().split():
            if (not word in stopword) and (not word in string.punctuation):
                word_set.add(word)
        for emoji_item in emoji_set:
            if emoji_item in sl:
                for word in word_set:
                    if word in sl[emoji_item]:
                        sl[emoji_item][word] += 1
                    else:
                        sl[emoji_item][word] = 1
            else:
                sl[emoji_item] = {}
                for word in word_set:
                    if word in sl[emoji_item]:
                        sl[emoji_item][word] += 1
                    else:
                        sl[emoji_item][word] = 1
        line = f.readline()
    f.close()
    print("complete, {} second is used".format(time.time()-time_start))
    
    time_start = time.time()
    print("filtering dict")
    sl_tmp = {}
    for key in sl.keys():
        sl_tmp[key] = {}
        for word in sl[key].keys():
            if sl[key][word] >= threshold:
                sl_tmp[key][word] = sl[key][word]
        if len(sl_tmp[key]) == 0:
            del sl_tmp[key]
    print("complete, {} second is used".format(time.time()-time_start))
    
    return sl_tmp

This part you need to generate a dict at format {emoji:{word:weight}}, if you use the following param, the typical time for processing is 270 seconds on i7-9750H.

In [6]:
sl = gen_emoji_to_word_dict(num_line=1000000, threshold=10)

processing: type -n 1000000 emojitweets-01-04-2018.txt > tmp_1000000.txt
complete, 2.287729263305664 second is used
generating stopword from english.txt
complete, 0.0008327960968017578 second is used
generating dict from tmp_1000000.txt
complete, 345.1900963783264 second is used
filtering dict
complete, 0.3267679214477539 second is used


Explore a little bit about the dict. 

In [7]:
len(sl)

1309

In [8]:
sorted(sl[':rocket:'].items(), key=lambda x:-x[1])[0:5]

[('followers', 269), ('…', 256), ('grow', 206), ('&amp;', 201), (':)', 201)]

You can save it and load it next time for saving your time. 

In [9]:
out_file = open("myfile.json", "w") 
json.dump(sl, out_file) 
out_file.close()

In [3]:
f = open("myfile.json", "r", encoding='utf-8')
sl = json.load(f)
f.close()

In [4]:
sorted(sl[':rocket:'].items(), key=lambda x:-x[1])[0:5]

[('followers', 269), ('…', 256), ('grow', 206), ('&amp;', 201), (':)', 201)]

The we need to extract a word corpus, and find out its size.

In [5]:
all_word_set = {}
for key in sl.keys():
    for word in sl[key].keys():
        if word not in all_word_set:
            all_word_set[word] = 1
        else:
            all_word_set[word] += 1

In [6]:
len(all_word_set)

19219

If you want to set a filter here, please change the filter number below. It is 1 by default which means all word are accepted.

TODO: !!! This filter_num is not used in the code following, you should set it to 1 at least for now to make the following code runnable.

When filter_num is set to be bigger than 1, there are bugs in later parts.

In [7]:
filter_num = 1

In [8]:
new_word_list = list(filter(lambda x:x[1]>=filter_num, all_word_set.items()))
len(new_word_list)

19219

In [9]:
new_word_list[0:5]

[('game', 164), ('2', 275), ('rating:', 3), ('vote', 141), ('5/5', 4)]

Here we generate a map from emoji/word to its unique id.

In [10]:
new_word_list = list(map(lambda x:x[0], new_word_list))
new_emoji_list = list(sl.keys())
new_word_set = set(new_word_list)
emoji_enu = list(enumerate(new_emoji_list))
word_enu = list(enumerate(new_word_list))
map_emoji = dict(emoji_enu)
map_word = dict(word_enu)
map_emoji=dict(zip(map_emoji.values(),map_emoji.keys()))
map_word=dict(zip(map_word.values(),map_word.keys()))

In [11]:
map_emoji[":rocket:"]

0

In [12]:
map_word["history"]

902

Here are two kinds of rate calculation method, the first is maxmin scale, the second is uniform distribution scale.

In [48]:
out = open('all_rec.csv','w',newline='')
csv_write = csv.writer(out,dialect='excel')
header = ["word_id", "emoji_id", "weight"]
csv_write.writerow(header)
for key in sl.keys():
    maxval = max(sl[key].values())
    minval = min(sl[key].values())
    for word in sl[key].keys() :
        item = [map_word[word], map_emoji[key], int((sl[key][word]-minval)/(maxval-minval+0.01)*10)+1]
        csv_write.writerow(item)
out.close()

In [46]:
out = open('all_rec.csv','w',newline='')
csv_write = csv.writer(out,dialect='excel')
header = ["word_id", "emoji_id", "weight"]
csv_write.writerow(header)
for key in sl.keys():
    val_list = list(sl[key].values())
    val_list.sort()
    length = len(val_list)
    for word in sl[key].keys():
        item = [map_word[word], map_emoji[key], int(val_list.index(sl[key][word])/length*10)+1]
        csv_write.writerow(item)
out.close()

Load the csv you just dumped. and explore it.

In [50]:
rec_df = pd.read_csv("all_rec.csv")

In [51]:
max(rec_df.weight.values)

10

In [52]:
min(rec_df.weight.values)

1

In [53]:
rec_df = sklearn.utils.shuffle(rec_df,random_state=671)

In [54]:
rec_df

Unnamed: 0,word_id,emoji_id,weight
57744,492,100,1
20885,10099,22,1
41872,1671,53,1
122006,14235,1297,10
40583,211,52,1
...,...,...,...
32408,285,37,1
105135,63,475,6
83679,1703,229,1
4385,2470,11,1


In [20]:
from surprise import SVD, SVDpp

Now we can apply some baseline

In [55]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(rec_df[['word_id','emoji_id', 'weight']], reader)
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    algo = SVD(n_epochs=100)
    algo.fit(trainset)
    predictions = algo.test(testset)
    print(accuracy.mae(predictions, verbose=False))

0.5745512981715518
0.5685175083164841
0.5906072412476591
0.5847890903573209
0.5780746919874106


In [40]:
for Epo in range(10,50,5):
    print("Epo:",Epo)
    for trainset, testset in kf.split(data):
        algo = SVD(n_epochs=Epo)
        algo.fit(trainset)
        predictions = algo.test(testset)
        print(accuracy.mae(predictions, verbose=False))

Epo: 10
2.2630787799430183
2.2651218084006826
2.2612007740459545
2.274592767666333
2.261736018412689
Epo: 15
2.2825234383166606
2.2764753514928295
2.283537704485021
2.27949260525437
2.2715511884832433
Epo: 20
2.274181809124232
2.2808806787789355
2.2923654921386243
2.2876485218192966
2.2960202385527633
Epo: 25
2.301118255934808
2.2759921519447572
2.284951253606249
2.2807146902463473
2.2906928433272644
Epo: 30
2.298290831049141
2.2942903041132556
2.2978235161687044
2.290378376896937
2.2947824177547065
Epo: 35
2.2910681098362287
2.3091150218337777
2.2960966714209263
2.278933536585193
2.287557945726545
Epo: 40
2.2940166958911785
2.3050201295364308
2.273115405773457
2.2902336104180803
2.281170923453311
Epo: 45
2.290335515581869
2.28338474003143
2.2988633363376794
2.3123873115718054
2.2870585872013


In [41]:
for numFactor in range(10,50,5):
    print("numFactor:",numFactor)
    for trainset, testset in kf.split(data):
        algo = SVD(n_factors=numFactor)
        algo.fit(trainset)
        predictions = algo.test(testset)
        print(accuracy.mae(predictions, verbose=False))

numFactor: 10
2.2405843753853327
2.250882760237451
2.262324661871938
2.271567048539495
2.2481562677456046
numFactor: 15
2.2670949023262366
2.2702554753364517
2.28977385326955
2.282471399683947
2.254729505192972
numFactor: 20
2.27275800035319
2.273506435220229
2.2866123401318705
2.2838360684101757
2.271087853657984
numFactor: 25
2.2850697196442664
2.3009130815719154
2.2915283363203334
2.290892377818273
2.2718098393413126
numFactor: 30
2.2924794733903116
2.2906999238394303
2.282478069191084
2.290597079643735
2.27135095451869
numFactor: 35
2.299844785532678
2.2966783860801785
2.295739719654007
2.2868723646159457
2.293771588129018
numFactor: 40
2.2953382526492154
2.29253525249451
2.2937788799813954
2.286710004334314
2.2877780798679197
numFactor: 45
2.281992862927258
2.3035781540948332
2.2875192743150174
2.296842255316335
2.302946697681829


In [43]:
rec_df = pd.read_csv("all_rec.csv")
rec_df = sklearn.utils.shuffle(rec_df,random_state=671)

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(rec_df[['word_id','emoji_id', 'weight']], reader)
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    algo = SVD(n_epochs=100)
    algo.fit(trainset)
    predictions = algo.test(testset)
    print(accuracy.mae(predictions, verbose=False))

2.3127181856928045
2.2717430191206383
2.27986205564029
2.292383304419127
2.288940078806011


In [44]:
for Epo in range(10,50,5):
    print("Epo:",Epo)
    for trainset, testset in kf.split(data):
        algo = SVD(n_epochs=Epo)
        algo.fit(trainset)
        predictions = algo.test(testset)
        print(accuracy.mae(predictions, verbose=False))

Epo: 10
2.281087685994986
2.3025376343308217
2.290967943155244
2.2783210085760808
2.295548425523583
Epo: 15
2.2939380218577323
2.2911577603391464
2.290860927082873
2.3083703301077922
2.3017557400735638
Epo: 20
2.3067662611009094
2.3040988906518836
2.285838474242703
2.316827078843667
2.2990869345964295
Epo: 25
2.3087526358343538
2.310936674161544
2.300216543656992
2.288293790307453
2.3041986727982398
Epo: 30
2.2878789349257675
2.2928219303990938
2.300355559125552
2.304371802061929
2.3092572342383715
Epo: 35
2.3035371526229094
2.3067262635046837
2.302119736045831
2.2995035304564406
2.2959834367807166
Epo: 40
2.2914909132190253
2.297633659247429
2.285864542951691
2.300900814698275
2.322125106003285
Epo: 45
2.2852900512750414
2.309173931461023
2.282335812005215
2.3012337537899152
2.2893188914107774


In [45]:
for numFactor in range(10,50,5):
    print("numFactor:",numFactor)
    for trainset, testset in kf.split(data):
        algo = SVD(n_factors=numFactor)
        algo.fit(trainset)
        predictions = algo.test(testset)
        print(accuracy.mae(predictions, verbose=False))

numFactor: 10
2.283794949465569
2.287377702669221
2.2628856432505198
2.2768887238368336
2.2593797116608156
numFactor: 15
2.281471702587086
2.278799396629668
2.2769786733515898
2.2792702286479893
2.284990726699721
numFactor: 20
2.313926041818974
2.285622521115598
2.305580190719195
2.3034166129794635
2.267844616744328
numFactor: 25
2.290913135218191
2.281968496785418
2.3061673392597593
2.305268560439139
2.285147785965376
numFactor: 30
2.3105209351989697
2.302026540016102
2.2957071242488496
2.299301253732101
2.303591985557009
numFactor: 35
2.293022041615946
2.3189443542549957
2.28645040735304
2.3069781485679512
2.31364284592973
numFactor: 40
2.316022920531921
2.3254813461166854
2.302033619346231
2.316766166173648
2.3012633935651507
numFactor: 45
2.3178181633896155
2.3073218279987517
2.3078556107205745
2.299126710391307
2.3196950537472496


SVDpp takes forever to run.

In [23]:
# for trainset, testset in kf.split(data):
#     algo = SVDpp(n_epochs=100)
#     algo.fit(trainset)
#     predictions = algo.test(testset)
#     print(accuracy.mae(predictions, verbose=False))

In [24]:
import networkx as nx

In [32]:
rec_df.columns=["source","target","weight"]

In [33]:
rec_df

Unnamed: 0,source,target,weight
57744,492,100,2
20885,10099,22,6
41872,1671,53,5
122006,14235,1297,3
40583,211,52,6
...,...,...,...
32408,285,37,4
105135,63,475,10
83679,1703,229,2
4385,2470,11,7


In [34]:
Graphtype = nx.Graph()
G = nx.from_pandas_edgelist(rec_df, edge_attr='weight', create_using=Graphtype)

In [36]:
nx.diameter(G)

5

In [37]:
nx.average_shortest_path_length(G)

2.889818832420269

In [38]:
nx.algorithms.cluster.average_clustering(G)

0.341056954755401