In [1]:
## Building Graph Adjacency Matrix
# !pip3 install neo4j
# !pip3 install py2neo

In [2]:
from IPython.display import display

import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer, one_hot

## NEO4J
from neo4j import GraphDatabase, basic_auth
from graphdatascience import GraphDataScience
from tqdm import tqdm

from py2neo import Graph, Node, Relationship

# Show full cell contents and every column when a DataFrame is rendered.
for display_option in ('display.max_colwidth', 'display.max_columns'):
  pd.set_option (display_option, None)

In [3]:
# Connection settings for the Neo4j instance that stores the word graph.
# Values can be overridden through environment variables so credentials do
# not have to live in the notebook; the fallbacks reproduce the original
# hard-coded local settings.
# NOTE(review): the fallback password is still a literal; remove it once the
# environment variables are set wherever this notebook runs.
import os

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "12345")

# Plain Bolt driver: used further down to write the Word/NEXT graph.
driver = GraphDatabase.driver(
  NEO4J_URI,
  auth=basic_auth(NEO4J_USER, password=NEO4J_PASSWORD))

# Graph Data Science client: used for graph projection and FastRP embeddings.
gds = GraphDataScience(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD),
    # aura_ds=True
)

In [4]:
# Load the pre-cleaned election tweets and preview the first two rows.
tweet_df = pd.read_excel (io="../Data/preprocessed/NG_ELECTION_TWEETS_CLEANED.xlsx")
tweet_df.head (n=2)

Unnamed: 0,Sentiment,cleaned_tweet
0,Positive,nobody say better goambassador mrmacaronii take matter hand acquiring pvc
1,Neutral,getting pvc registering vote come nonviolent part take delefarotimi tsngcampaign ht


In [5]:
# (rows, columns) of the loaded tweet table.
n_rows, n_cols = tweet_df.shape
(n_rows, n_cols)

(18301, 2)

In [6]:
# Collect the set of distinct space-separated tokens across all cleaned tweets.
# X keeps every column except the label, as before.
X = tweet_df.drop ('Sentiment', axis=1)

vocab = set ()
# Iterate the text column directly (by name, not by position) and grow the
# set in place; `vocab.union(...)` rebuilt the whole set on every row.
for text in tqdm (X['cleaned_tweet'], total=X.shape[0], colour='black'):
  vocab.update (text.split (' '))

# Number of distinct raw tokens; used as `num_words` for the tokenizer.
vocab_size = len (vocab)

In [7]:
# Fit a Keras tokenizer on the cleaned tweets and build the word -> index map.
# The texts are read from `tweets` (the original column) rather than from `X`,
# because `X` is rebound to the integer sequences below; reading from `X`
# made this cell fail when it was executed a second time.
tweets = tweet_df.cleaned_tweet

# NOTE(review): Keras documents that only the `num_words - 1` most frequent
# words are kept by `texts_to_sequences`, and the OOV token takes index 1, so
# the rarest words may be mapped to OOV in `X` - confirm this is acceptable.
# `word_index` itself always contains every fitted word.
tokenizer = Tokenizer (num_words=vocab_size, oov_token= ' ')
tokenizer.fit_on_texts (tweets.values)

# X: one list of integer word ids per tweet.
X = tokenizer.texts_to_sequences (tweets.values)
# word_index: dict mapping each word to its integer id.
word_index = tokenizer.word_index

In [9]:
# Re-check the table dimensions after tokenisation.
row_count, column_count = tweet_df.shape
(row_count, column_count)

(18301, 2)

<center>
 <img src='../Images/graph_adjacency_model.svg' width="50%">
</center>

In [11]:
# Write the word co-occurrence graph to Neo4j.
#
# Every pair of consecutive words in a tweet becomes
#   (:Word {name, word_index}) -[:NEXT]-> (:Word {name, word_index})
# and each MERGE keeps a running `count` on the nodes and on the relationship.
#
# The query takes all bigrams of one tweet as `$pairs` and processes them with
# UNWIND, so a tweet costs one transaction instead of one per bigram. The
# resulting counts are the same as merging the pairs one at a time.
# NOTE: re-running this cell increments the counts again, and the driver is
# closed at the end, so the connection cell must be re-run before it.
cypher_query = '''
UNWIND $pairs AS pair
MERGE (n:Word {name:pair.word_1, word_index:pair.word_index_1})
  ON CREATE SET n.count = 1
  ON MATCH SET n.count = n.count + 1
MERGE (m:Word {name:pair.word_2, word_index:pair.word_index_2})
  ON CREATE SET m.count = 1
  ON MATCH SET m.count = m.count + 1
MERGE (n) - [r:NEXT]-> (m)
  ON CREATE SET r.count = 1
  ON MATCH SET r.count = r.count + 1
'''


def _merge_bigrams (tx, pairs):
  """Run the bigram MERGE query for one tweet inside a managed transaction.

  tx    -- transaction object supplied by `session.write_transaction`.
  pairs -- list of dicts with keys word_1, word_2, word_index_1, word_index_2.
  """
  tx.run (cypher_query, parameters={"pairs": pairs})


with driver.session (database="neo4j") as session:
  for tweet in tqdm (tweet_df ['cleaned_tweet'], total=tweet_df.shape[0], colour='black'):
    words = tweet.split (" ")
    # Consecutive (start, end) word pairs; empty for tweets with < 2 words.
    pairs = [
      {
        "word_1": start,
        "word_2": end,
        "word_index_1": word_index [start],
        "word_index_2": word_index [end],
      }
      for start, end in zip (words, words[1:])
    ]
    if pairs:
      # NOTE(review): `write_transaction` is kept for compatibility with the
      # 4.x driver; newer drivers name it `execute_write` - confirm version.
      session.write_transaction (_merge_bigrams, pairs)

driver.close()

In [13]:
# Project the Word/NEXT graph into the GDS in-memory catalog as "tweets".
GRAPH_NAME = "tweets"

# A projection with this name survives kernel restarts (it lives on the
# server), and projecting onto an existing name raises. Drop any stale copy
# first so the cell can be re-run.
if gds.graph.exists (GRAPH_NAME)["exists"]:
    gds.graph.drop (gds.graph.get (GRAPH_NAME))

# G is the graph handle; res holds the projection result returned by GDS.
G, res = gds.graph.project(
    GRAPH_NAME,
    # Node projection: Word nodes with their tokenizer index and count.
    {"Word": {
      'properties': ['word_index', 'count']
    }},
    # Relationship projection: NEXT edges with their co-occurrence count.
    {"NEXT": {
      'properties': ['count']
    }},
)

In [15]:
# Size of the projected graph: node count first, then relationship count.
for graph_size in (G.node_count (), G.relationship_count ()):
  print (graph_size)

19811
118042


<center>
  <img src='../Images/graph_adjacency_img.svg' width=150%>
</center>

In [16]:
# Compute FastRP node embeddings on the "tweets" projection and attach each
# node's `name` property to the streamed result.
G = gds.graph.get("tweets")

# NOTE(review): `featureProperties` is passed without `propertyRatio`; the GDS
# docs give propertyRatio a default of 0.0 - confirm the node properties
# actually contribute to the embedding.
fastrp_config = {
    "embeddingDimension": 16,
    "randomSeed": 42,
    "relationshipWeightProperty": 'count',
    "featureProperties": ['word_index', 'count'],
}
fastrp_res = gds.fastRP.stream (G, **fastrp_config)

# Resolve every streamed node id back to its `name` property.
# NOTE(review): this issues one `gds.util.asNode` call per node.
node_names = []
for node_id in fastrp_res.nodeId:
    node_names.append (gds.util.asNode (node_id).get ('name'))
fastrp_res ['name'] = node_names

fastrp_res.head (5)

Unnamed: 0,nodeId,embedding,name
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",whatsa
1,1,"[-1.0598999011790511e-08, -0.28232985734939575, -0.34600189328193665, 0.3079962134361267, 0.07699906080961227, 0.17966444790363312, 0.1283317506313324, 0.48766061663627625, 0.4619942903518677, 0.1283317655324936, -0.47433364391326904, 0.07699904590845108, 0.3849952816963196, 0.1920037865638733, 0.6539981365203857, 0.07699904590845108]",mowe
2,2,"[0.0700836032629013, 0.15865182876586914, 0.2879798710346222, 0.4799431562423706, 0.2856059968471527, 0.04680052399635315, 0.48793864250183105, 0.6623001098632812, 0.8977084159851074, 0.1071426272392273, 0.11407659202814102, 0.039497584104537964, 0.20808245241641998, -0.6317499876022339, 0.13809454441070557, -0.3659849464893341]",ofada
3,3,"[-0.34612521529197693, 0.018053214997053146, -0.2551054060459137, 0.27834922075271606, 0.3592156767845154, 0.04417794942855835, -0.009046375751495361, -0.6046962141990662, 0.3876951336860657, -0.34533682465553284, 0.31133437156677246, -0.07301396131515503, 0.7171489000320435, -0.17626768350601196, 0.11545254290103912, -0.4596334397792816]",survey
4,4,"[-0.4766041040420532, 0.1477653980255127, 0.191990926861763, 0.22356198728084564, 0.32325372099876404, -0.1852354109287262, 0.6941946744918823, 0.33110085129737854, 0.05322249233722687, -1.0251713991165161, -0.0690322145819664, -0.2762591540813446, 0.7484951019287109, -0.2565154433250427, -0.05901751294732094, -0.2995421886444092]",sqm


In [17]:
# Persist node ids, embeddings and names for downstream use (no index column).
fastrp_res.to_csv (path_or_buf='../Data/preprocessed/NODE_EMBEDDING.csv', index=False)

In [18]:
# key phrase extraction
#
# Rank 1- and 2-hop NEXT paths by the mean relationship count per node on the
# path, and print the ten highest-weighted phrases.
#
# Connection settings can be overridden through environment variables so the
# credentials do not have to live in the notebook; the fallbacks reproduce the
# original hard-coded values.
# NOTE(review): this connects to a different server than the local
# bolt://localhost:7687 instance the graph was written to - confirm that the
# remote database holds the same Word/NEXT graph.
import os

KEYPHRASE_NEO4J_URI = os.environ.get ("KEYPHRASE_NEO4J_URI", "bolt://52.204.57.229:7687")
KEYPHRASE_NEO4J_USER = os.environ.get ("KEYPHRASE_NEO4J_USER", "neo4j")
KEYPHRASE_NEO4J_PASSWORD = os.environ.get ("KEYPHRASE_NEO4J_PASSWORD", "meeting-modes-polisher")

graph = Graph (KEYPHRASE_NEO4J_URI, auth=(KEYPHRASE_NEO4J_USER, KEYPHRASE_NEO4J_PASSWORD))


# total  = sum of NEXT.count along the path
# weight = total / number of nodes on the path
# phrase = node names joined with a trailing space after each name
cypher_query = '''
MATCH p = (n:Word) -[:NEXT*1..2]-> (:Word)
WITH reduce (x=0, i IN  relationships(p) | x + i.count) AS total, p
WITH nodes (p) AS text, 1.0 * total / size (nodes (p)) AS weight, p
WITH reduce (x="", i IN Nodes(p) | x + i.name + " ") AS phrase, weight
RETURN phrase, weight ORDER BY weight DESC LIMIT 10
'''
result = graph.run (cypher_query)

# Print each (phrase, weight) record returned by the query.
for record in result:
    print (record)


<neo4j.work.result.Result object at 0x104143fa0>
