In [1]:
import os
import numpy as np
import pandas as pd
from pprint import pprint 
import  matplotlib.pyplot as plt

import networkx as nx

%load_ext autoreload
%autoreload 2


NLOOP is a Python package that provides a convenient interface for exploring and analyzing text data. 
Behind the scenes, NLOOP uses spaCy and gensim to take care of cleaning, tokenization, dependency parsing, keyword extraction, and much more in one fell swoop. Here I will use it to build a keyword co-occurrence graph from the titles and abstracts of the research papers in the provided dataset. 

You can install NLOOP from the GitHub address below. Check out the GitHub repository for more examples. 

In [2]:
!pip install git+https://github.com/syasini/NLOOP.git@master

In [3]:
from nloop import Text

In [4]:
# Read the dataset. The first CSV column is parsed as the index and then moved
# back into a regular column, so `data` ends up with a default integer index.
data_fname = os.path.join("..", "input", "project-btech", "dataset.csv")
data = pd.read_csv(data_fname, index_col=0).reset_index()

In [5]:
# Run the "text" column through NLOOP.
#
# fast=False is slow on the full corpus. Switch to fast=True when only the
# clean tokens are needed and dependencies, named entities, and keywords
# can be skipped.
text = Text(
    data["text"],
    fast=False,
)

In [6]:
# Render a word cloud for the corpus held by the NLOOP `text` object.
text.show_wordcloud()

In [7]:
# Flatten the per-document token lists produced by NLOOP into a single
# corpus-wide list, preserving document order and token order.
# (Nothing is counted here; this is only the input for a frequency count.)
text_tokens = [token for doc_tokens in text.tokens for token in doc_tokens]

<h4>DeepWalk</h4>

In [8]:
# NOTE: this rebinds `text` from the NLOOP Text object created earlier to the
# raw "text" column of `data`; from here on `text` is the plain column.
text = data['text']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Take the documents straight from the DataFrame so this cell does not depend
# on which object the reused name `text` currently points to.
docs = data["text"]

count_model = CountVectorizer(ngram_range=(1, 1))  # default unigram model
X = count_model.fit_transform(docs)  # sparse document-term count matrix

# Optional: binarise with `X[X > 0] = 1` before the product if repeated
# occurrences inside one document should not add extra co-occurrence weight.

# Term-term co-occurrence matrix; `@` is an explicit matrix product and the
# result stays sparse.
Xc = X.T @ X
Xc.setdiag(0)  # drop the co-occurrence of each word with itself

# Preview only a small corner: converting the full vocab x vocab matrix to
# dense just for printing allocates a huge array for no benefit.
print(Xc[:10, :10].toarray())

In [10]:
# Vocabulary terms, ordered to match the rows/columns of the co-occurrence matrix.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on older installations. The result is kept
# as a plain list, as before.
if hasattr(count_model, "get_feature_names_out"):
    words = list(count_model.get_feature_names_out())
else:
    words = count_model.get_feature_names()

# Spot-check one vocabulary entry.
words[1000]

In [11]:
# Dimensions of the term-term co-occurrence matrix (rows x columns).
Xc.shape

In [12]:
# Build an undirected, weighted co-occurrence graph: one node per vocabulary
# word, one edge per non-zero entry of the co-occurrence matrix.
G = nx.Graph()
mat = Xc.toarray()

# Register every word up front so that words without any co-occurrence still
# exist as (isolated) nodes; later cells look up an embedding for every word.
G.add_nodes_from(words)

# Let numpy locate the non-zero cells instead of scanning a hard-coded
# 9929 x 9929 grid in Python; the size now follows the matrix itself.
# np.nonzero returns indices in row-major order, the same order the nested
# loops visited them.
rows, cols = np.nonzero(mat)
G.add_weighted_edges_from(
    (words[i], words[j], mat[i, j]) for i, j in zip(rows, cols)
)

In [13]:
# Node size grows with the square of the node degree; the factor 20 and the
# offset 100 are arbitrary display choices.
node_sizes = [20 * degree ** 2 + 100 for _, degree in G.degree]

# Label dictionary: every node is labelled with its own name.
labels = {node: node for node in G.nodes}

In [14]:
from typing import Hashable, List
import random


def get_random_walk(graph: nx.Graph, node: Hashable, n_steps: int = 4) -> List[str]:
    """Sample one uniform random walk that starts at ``node``.

    Args:
        graph: Graph to walk on.
        node: Start node. Any node key works (the nodes in this notebook are
            words); every visited node is converted with ``str``.
        n_steps: Maximum number of steps to take after the start node.

    Returns:
        The visited nodes as strings, start node first. The list holds at most
        ``n_steps + 1`` entries and is shorter when the walk reaches a node
        without neighbours.
    """
    local_path = [str(node)]
    target_node = node
    for _ in range(n_steps):
        neighbors = list(nx.all_neighbors(graph, target_node))
        if not neighbors:
            # Dead end: the current node cannot change any more, so further
            # iterations would only repeat the same empty lookup.
            break
        target_node = random.choice(neighbors)
        local_path.append(str(target_node))
    return local_path

In [15]:
# Seed Python's RNG so the sampled walks can be reproduced when the notebook
# is re-run from a fresh kernel.
random.seed(42)

# Number of random walks started from every node of the graph.
WALKS_PER_NODE = 10

walk_paths = [
    get_random_walk(G, node)
    for node in G.nodes()
    for _ in range(WALKS_PER_NODE)
]

In [16]:
from gensim.models import Word2Vec

In [17]:
# Skip-gram Word2Vec (sg=1) with negative sampling (hs=0, negative=10) over
# the random-walk "sentences".
# NOTE(review): gensim only guarantees reproducible results for a given seed
# when training runs on a single worker thread — confirm whether workers=1 is wanted.
embedder = Word2Vec(
    window=4, sg=1, hs=0, negative=10, alpha=0.03, min_alpha=0.0001,
    seed=42,
)
embedder.build_vocab(walk_paths, progress_per=2)

# build_vocab only registers the vocabulary. Without an explicit train() call
# the vectors are never fitted to the walks, and the exported embeddings would
# just be their initial values.
embedder.train(
    walk_paths,
    total_examples=embedder.corpus_count,
    epochs=embedder.epochs,
)

In [19]:
# Nearest neighbours of "glass" in the embedding space. Query through the
# KeyedVectors (`.wv`): the model-level shortcut was removed in gensim 4.
embedder.wv.similar_by_word('glass')

In [18]:
# Dimensionality of a single embedding vector. Index the KeyedVectors (`.wv`):
# indexing the model object itself was removed in gensim 4.
len(embedder.wv['rules'])

In [20]:
# Collect one embedding vector per vocabulary word, in the order of `words`.
# Vectors are read through `.wv` because indexing the model object itself was
# removed in gensim 4.
embeddings = [embedder.wv[word] for word in words]

In [25]:
# Number of embedding vectors collected (one per entry in `words`).
len(embeddings)

In [23]:
# Two-column table pairing every word with its embedding vector.
df = pd.DataFrame({'words': words, 'embeddings': embeddings})

In [26]:
# Write the word/embedding table to a CSV file in the current working directory.
# NOTE(review): each "embeddings" cell holds a whole vector object, so the CSV
# stores its string form — confirm downstream readers can parse that.
df.to_csv('DeepWalkEmbeddings.csv')