In [1]:
import os
import numpy as np
import pandas as pd
from pprint import pprint 
import  matplotlib.pyplot as plt

import networkx as nx

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv("/content/drive/MyDrive/dataset.csv" , index_col=0)

In [4]:

text = data['text']
text

0      Several inorganic flocculating agents, includi...
1      Video-oculography (VOG) is one of eye movement...
2      The first-principles calculations are performe...
3      Microhardness can be related to other macrosco...
4      We consider finite-time, future (sudden or Big...
                             ...                        
488    With development of performance-based design, ...
489    ELRs are particularly attractive for the synth...
490    This paper proposes a sentence stress feedback...
491    Arrays of TFTs and circuits were fabricated on...
492    MicroCT has been applied to AM parts in variou...
Name: text, Length: 493, dtype: object

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
docs = text
count_model = CountVectorizer(ngram_range=(1,1)) # default unigram model
X = count_model.fit_transform(docs)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below)
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
Xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(Xc.todense()) # print out matrix in dense format

[[0 0 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 [0 4 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


In [6]:
words = count_model.get_feature_names()
words[1000]



'appendix'

In [7]:
Xc.shape


(9929, 9929)

In [8]:
G = nx.Graph()
mat = Xc.toarray()
for i in range (0, 9929):
    for j in range(0, 9929):
        if mat[i][j] != 0:
            G.add_edge(words[i], words[j], weight = mat[i][j])

In [9]:
G

<networkx.classes.graph.Graph at 0x7fb2e01a7890>

In [10]:
pip install stellargraph


Collecting stellargraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[K     |████████████████████████████████| 435 kB 4.6 MB/s 
Installing collected packages: stellargraph
Successfully installed stellargraph-1.2.1


In [11]:
from stellargraph import StellarGraph

In [12]:
Gs = StellarGraph(G)

  """Entry point for launching an IPython kernel.


In [13]:
Gs.info()

'StellarGraph: Undirected multigraph\n Nodes: 9929, Edges: 1639317\n\n Node types:\n  default: [9929]\n    Features: none\n    Edge types: default-default->default\n\n Edge types:\n    default-default->default: [1639317]\n        Weights: range=[1, 52324], mean=4.47302, std=75.2184\n        Features: none'

In [14]:
from stellargraph.data import BiasedRandomWalk

In [15]:
rw = BiasedRandomWalk(Gs)

walks = rw.run(
    nodes=list(Gs.nodes()),  # root nodes
    length=100,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 99290


In [16]:
from gensim.models import Word2Vec

In [18]:
str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=768, window=5, min_count=0, sg=1, workers=2, iter=1)

In [21]:
node_ids = model.wv.index2word 

In [24]:
len(node_ids)

9929

In [26]:
node_embeddings = (
    model.wv.vectors
)  # numpy.ndarray of size number of nodes times embeddings dimensionality


In [29]:
len(node_embeddings[0])

768

In [31]:
len(node_embeddings)

9929

In [32]:
node_ids

['00',
 '00am',
 '05',
 '11',
 '5am',
 '5h',
 'above',
 'accounting',
 'additional',
 'also',
 'an',
 'and',
 'applied',
 'appropriate',
 'are',
 'as',
 'at',
 'average',
 'be',
 'because',
 'below',
 'between',
 'by',
 'can',
 'capture',
 'capturing',
 'chain',
 'chains',
 'changes',
 'cloud',
 'consider',
 'considered',
 'cut',
 'demonstrates',
 'differences',
 'different',
 'diurnal',
 'due',
 'duration',
 'each',
 'effect',
 'eight',
 'energy',
 'entire',
 'fig',
 'for',
 'four',
 'from',
 'height',
 'impacts',
 'in',
 'inclusion',
 'increased',
 'intent',
 'is',
 'it',
 'locations',
 'markov',
 'mean',
 'morning',
 'number',
 'occurs',
 'of',
 'off',
 'okta',
 'presence',
 'pressure',
 'probability',
 'produced',
 'propensity',
 'represented',
 'representing',
 'represents',
 'result',
 'season',
 'seasonal',
 'seasons',
 'seen',
 'shift',
 'slight',
 'solar',
 'speed',
 'split',
 'study',
 'summer',
 'sunrise',
 'that',
 'the',
 'times',
 'to',
 'totalling',
 'towards',
 'transit

In [33]:
list_of_tuples = list(zip(node_ids, node_embeddings))


In [35]:
df = pd.DataFrame(list_of_tuples,columns = ['Word', 'Node_Embedding'])

In [36]:
df.head()

Unnamed: 0,Word,Node_Embedding
0,00,"[6.539589e-05, 0.0002178079, 0.00014181895, -0..."
1,00am,"[-7.828184e-05, 0.0004514263, -0.00029000576, ..."
2,05,"[-0.00021342117, -0.00024161664, 0.0005116154,..."
3,11,"[-0.0001020737, 7.3072224e-05, -0.00012835703,..."
4,5am,"[0.00063675264, -0.00024753454, -0.00022098399..."


In [41]:
print(df[100:102])
print(len(df))

         Word                                     Node_Embedding
100  visually  [-0.00054977287, 0.00021664942, 6.0253013e-05,...
101   weather  [0.00065070135, 0.00019303898, 0.0006425555, 0...
9929


In [42]:
df.to_csv('Node2Vec_Embeddings.csv')

Old Method For Finding Embeddings which didn't work

In [None]:
pip install node2vec


Collecting node2vec
  Downloading node2vec-0.4.3.tar.gz (4.6 kB)
Building wheels for collected packages: node2vec
  Building wheel for node2vec (setup.py) ... [?25l[?25hdone
  Created wheel for node2vec: filename=node2vec-0.4.3-py3-none-any.whl size=5980 sha256=ef1939b37b6ade7031e3b5334420526c79a145c74ba8f12c2f4d938255b27a68
  Stored in directory: /root/.cache/pip/wheels/07/62/78/5202cb8c03cbf1593b48a8a442fca8ceec2a8c80e22318bae9
Successfully built node2vec
Installing collected packages: node2vec
Successfully installed node2vec-0.4.3


In [None]:
from node2vec import Node2Vec

In [None]:
while True:pass

KeyboardInterrupt: ignored

In [None]:
node2vec = Node2Vec(G, dimensions=768, walk_length=100, num_walks=50)

Computing transition probabilities:   0%|          | 0/9929 [00:00<?, ?it/s]

In [11]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import os
import networkx as nx
import numpy as np
import pandas as pd

from IPython.display import display, HTML

%matplotlib inline

In [13]:
pip install stellargraph

Collecting stellargraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[?25l[K     |▊                               | 10 kB 17.5 MB/s eta 0:00:01[K     |█▌                              | 20 kB 9.7 MB/s eta 0:00:01[K     |██▎                             | 30 kB 8.9 MB/s eta 0:00:01[K     |███                             | 40 kB 8.3 MB/s eta 0:00:01[K     |███▊                            | 51 kB 5.1 MB/s eta 0:00:01[K     |████▌                           | 61 kB 5.6 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 5.6 MB/s eta 0:00:01[K     |██████                          | 81 kB 6.3 MB/s eta 0:00:01[K     |██████▊                         | 92 kB 4.8 MB/s eta 0:00:01[K     |███████▌                        | 102 kB 5.1 MB/s eta 0:00:01[K     |████████▎                       | 112 kB 5.1 MB/s eta 0:00:01[K     |█████████                       | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 5.1 MB/s eta 0:0

In [14]:
from stellargraph.data import BiasedRandomWalk

In [16]:
rw = BiasedRandomWalk(G)


TypeError: ignored

SyntaxError: ignored

In [None]:
while True:pass

KeyboardInterrupt: ignored