# Imports

In [44]:
!pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |################################| 266kB 3.6MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


In [45]:
import turicreate as tc
from itertools import combinations
import turicreate.aggregate as agg
import numpy as np
tc.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 64)
from glove import Glove
from glove import Corpus

# Read Data

In [51]:
topics = tc.SFrame.read_csv('/data/notebooks/PersonalityPred/ScoredTopics.csv', header=False, verbose=False)
topics.print_rows(3)

+--------------+------------+--------------------+
|      X1      |     X2     |         X3         |
+--------------+------------+--------------------+
| 447341250114 |   airbnb   | 1.9547491087071431 |
| 447341250114 | apartment  | 1.9547491087071431 |
| 447341250114 | apple inc. | 9.387984787699388  |
+--------------+------------+--------------------+
[395337386 rows x 3 columns]



In [52]:
topics = topics.rename({'X1': 'userID', 'X2': 'topic', 'X3': 'score'})

In [54]:
import turicreate.aggregate as agg
topics_agg = topics.groupby('userID', {'topics': agg.CONCAT('topic')})

In [4]:
topics_agg.print_rows(3)

+--------------+-------------------------------+
|    userID    |             topics            |
+--------------+-------------------------------+
| 447787571427 | [instant messaging, photo ... |
| 447972320460 | [apple inc., cloud storage... |
| 447427607839 | [broadsheets, facebook, fr... |
+--------------+-------------------------------+
[1863980 rows x 2 columns]



In [56]:
topics_agg.save('./topicsConcat')

# Create Combinations

In [2]:
topics_agg = tc.load_sframe('./topicsConcat')

In [3]:
topics_agg.shape

(1863980, 2)

In [5]:
#Take a sample to check
topics_agg_samp = topics_agg.sample(0.05, seed = 234)

In [6]:
topic1 = []
topic2 = []
for row in topics_agg_samp:
    comb = list(combinations(row['topics'], 2))
    for pair in comb:
        topic1.append(pair[0])
        topic2.append(pair[1])

In [13]:
col1 = tc.SArray(topic1)
col2 = tc.SArray(topic2)
sf = tc.SFrame({'topic1':col1,'topic2':col2})

In [20]:
sf.save('./sf_frame_combinations')

# Create Co-occurrence Matrix

In [2]:
sf = tc.load_sframe('./sf_frame_combinations')

In [3]:
count_topic_combs = sf.groupby(['topic1', 'topic2'], agg.COUNT())

In [4]:
count_topic_combs.print_rows(3)

+---------------------+-------------------------------+-------+
|        topic1       |             topic2            | Count |
+---------------------+-------------------------------+-------+
|     foot locker     |          formal wear          |   3   |
| strategy video game | strikeforce (mixed martial... |   1   |
|  o2audience:TENNIS  |       betting exchanges       |   6   |
+---------------------+-------------------------------+-------+
[11271195 rows x 3 columns]



In [66]:
comb_samp = count_topic_combs[0:50]

In [67]:
comb_samp.print_rows(100)

+--------------------------------+-------------------------------+-------+
|             topic1             |             topic2            | Count |
+--------------------------------+-------------------------------+-------+
|          foot locker           |          formal wear          |   3   |
|      strategy video game       | strikeforce (mixed martial... |   1   |
|       o2audience:TENNIS        |       betting exchanges       |   6   |
|        personalization         |          supermarkets         |   23  |
|       expedia (website)        |          little tikes         |   9   |
|          hotel du vin          |     newcastle united f.c.     |   1   |
|          dictionaries          |            pet food           |   1   |
|           change.org           |        vehicle recovery       |   34  |
|             dance              |           supergroup          |   2   |
|        fortnum & mason         |          desi cuisine         |   1   |
|     remote backup servi

In [68]:
col_co = comb_samp['topic1'].unique()
col_row = comb_samp['topic2'].unique()

In [69]:
len(col_co)

50

In [70]:
def diff(li1, li2): 
    return (list(set(li1) - set(li2)))

In [71]:
differ1 = diff(col_row, col_co)
differ2 = diff(col_co, col_row)

48
49


In [72]:
#These features will be rows and columns for the matrix
merged_features = list(col_co) + differ1

In [73]:
# Didn't realize but we need an index lookupm thus adding the same
features = tc.SFrame(merged_features)
features['idx'] = range(0, len(merged_features))

In [76]:
matrix = np.zeros(shape=(features.shape[0], features.shape[0]))

In [77]:
matrix.shape

(98, 98)

In [78]:
for row in comb_samp:
    i = row['topic1']
    j = row['topic2']
    score = row['Count']
    iIdx = features[features['X1'] == i]['idx']
    jIdx = features[features['X1'] == j]['idx']
    
    matrix[iIdx, jIdx] = score


34 85 3
35 75 1
0 89 6
38 57 23
44 62 9
17 65 1
28 94 1
47 86 34
2 54 2
46 71 1
19 72 1
1 80 4
27 70 1
25 68 20
33 76 6
29 77 1
8 51 3
32 56 1
42 69 1
3 84 1
23 90 1
15 97 3
26 95 1
9 58 1
43 55 4
11 67 1
24 78 6
20 52 1
39 59 10
5 92 1
7 63 3
13 87 1
49 64 1
40 53 2
4 84 3
16 50 1
21 79 1
41 88 1
31 66 1
37 93 1
18 60 5
12 91 1
30 81 6
36 96 3
10 61 13
22 82 16
45 73 1
6 20 7
48 83 26
14 74 1


In [None]:
# Create a dictionary as glove doesn't accept strings. Dictionary will help in locating the word and index
lookup_dict = {}
for element in features:
    lookup_dict[element['X1']] = element['idx']

In [37]:
matrix[14][74]

1.0

In [79]:
np.save('./cooccurMatrix', matrix)


In [80]:
from scipy.sparse import coo_matrix
c_mat = coo_matrix(matrix)

# Building Embeddings

In [87]:
glove = Glove(no_components= 3, learning_rate= 0.5)

In [88]:
glove.fit(c_mat, epochs = 30, no_threads = 4, verbose = True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [91]:
glove.add_dictionary(lookup_dict)

In [100]:
glove.most_similar('dance')

[('healthy diet', 0.92553703643116),
 ('bandcamp', 0.924506085474508),
 ('vehicle recovery', 0.9025076918595519),
 ('change.org', 0.901353452243554)]

In [96]:
glove.save('glove.model')

In [98]:
features['embeddings'] = glove.word_vectors
features.save('./topicEmbeddings3d')

In [99]:
features

X1,embeddings
o2audience:TENNIS,"[-0.25590186831341794, -0.10507115523421791, ..."
tv listings (uk),"[-0.15299538565246212, 0.06522701523406063, ..."
dance,"[0.048705577857946825, -0.06441650907482477, ..."
jeep,"[-0.012075309501221331, 0.007716132573596034, ..."
dermatology,"[-0.13284929977707105, -0.09501078610233918, ..."
hair care,"[-0.020616192059875173, -0.11343396734128898, ..."
rock festivals,"[-0.19858491196146452, 0.07103996327525687, ..."
windows phone games,"[0.1254480146499355, -0.07543661813548388, ..."
opodo,"[-0.033564251313613985, 0.026202752127400565, ..."
teppanyaki,"[0.12177376835567781, -0.06265174139718836, ..."
