from "Topic Modeling based on Louvain method in Online Social Networks":

In the pre-processing, all alphanumerics characters were transformed to lowercase. By using regular expression, URLs and links were removed. These kinds of data do not represent an analysable term. 
Articles, pronouns, and prepositions were removed because they are considered noises for topic’s formation.

So, the co-occurrence was applied. For the co-occurrence verification, the adjacency list was formed by edges with weights greater than 1, due to a large number of edges with weight equal to 1. This cutting justifies the elimination of a dense graph, hard to be analyzed.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from time import time

# Standard plotly imports
import plotly.figure_factory as ff
import plotly as py
import plotly.graph_objs as go
import plotly.express as px

from tqdm import tqdm
tqdm.pandas()
#from matplotlib_venn import venn2

import pickle
import json
from itertools import product
import re



  from pandas import Panel


In [2]:
import spacy

# load sample data

In [3]:
df = pd.read_json('data/Reviews_small_run_cleaned.jl',
           lines=True)

df = df[df['language']=='english']

df = df.sample(10000,random_state=42 )

# preprocessing

In [4]:
from utils.helpers.text_helpers import expand_contractions
def reduce_lengthening(text):
    # from https://rustyonrampage.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

import re, string, timeit
regex = re.compile('[%s]' % re.escape(string.punctuation))
def remove_punct_re(s):  
    return regex.sub('', s)

In [5]:
text = pd.concat([df['pros'], df['cons']])

text = text.str.lower() #all alphanumerics characters were transformed to lowercase
text = text.progress_apply(reduce_lengthening) # reduce lengths: helllloooo -> helloo 
text = text.progress_apply(expand_contractions)
text = text.progress_apply(remove_punct_re)

100%|██████████| 20000/20000 [00:00<00:00, 113851.75it/s]
100%|██████████| 20000/20000 [00:03<00:00, 5507.35it/s]
100%|██████████| 20000/20000 [00:00<00:00, 448953.59it/s]


In [6]:
nlp = spacy.load('en')

In [7]:
print(text.iloc[0])
doc = nlp(text.iloc[0])

the firm has a great culture and plenty of opportunities to move from one role to another it is a good place for a woman to be at because of a lot of support received from senior leadership


In [8]:
from spacy import displacy

In [9]:
displacy.render(doc)

In [10]:
set([i.pos_ for i in doc])

{'ADJ',
 'ADP',
 'AUX',
 'CCONJ',
 'DET',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'SCONJ',
 'VERB'}

In [11]:
def noun_chunk_tokenizer(text):
    doc = nlp(text)
    noun_chunks = [' '.join([i.text for i in nc if not i.is_stop])  for nc in doc.noun_chunks]
    return list(filter(lambda x: x!='', noun_chunks))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf = TfidfVectorizer(strip_accents='ascii', 
                        tokenizer=noun_chunk_tokenizer, 
                       max_df=0.9,
                       min_df=10)

In [14]:
tfidf = TfidfVectorizer(strip_accents='ascii', 
                        #tokenizer=noun_chunk_tokenizer, 
                        ngram_range=(1,3),
                        stop_words='english',
                       max_df=0.9,
                       min_df=10)

In [15]:
tfidf.fit(text)

TfidfVectorizer(max_df=0.9, min_df=10, ngram_range=(1, 3), stop_words='english',
                strip_accents='ascii')

In [16]:
len(tfidf.vocabulary_)

3567

# adjacency matrix

In [17]:
tfidf_matrix = tfidf.transform(text)

In [18]:
tfidf_matrix = tfidf_matrix.todense() 

tfidf_matrix = np.array(tfidf_matrix)

nonzero_idx = tfidf_matrix.nonzero()
nonzero_vals = tfidf_matrix[nonzero_idx]
nonzero_index_pairs = list(zip(nonzero_idx[0], nonzero_idx[1])) # document, words

In [19]:
nonzero_vals

array([0.16317629, 0.25867103, 0.1134177 , ..., 0.23380723, 0.46814056,
       0.36881464])

In [20]:
tfidf_index_2_text_index = text.reset_index()['index'].to_dict()

In [21]:
# document data
tfidf_matrix.shape

(20000, 3567)

In [22]:
edges = list(zip(nonzero_index_pairs, nonzero_vals))

edges = [{'document': str(i[0][0]),
          'review_id':str(tfidf_index_2_text_index[i[0][0]]),
          'phrase_id':i[0][1],
          'phrase':tfidf.get_feature_names()[i[0][1]],
          'weight':i[1]} for i in tqdm(edges)]

100%|██████████| 198215/198215 [03:44<00:00, 883.53it/s]


# load to Neo4J

In [23]:
from py2neo import Graph
graph = Graph("bolt://localhost:7687", auth=("neo4j", "glassdoor"))

In [24]:
# delete nodes and rels and constraints for a fresh start
clean_up =  [
"MATCH ()-[r]->() DELETE r;",
    "MATCH (n) DELETE n;"
]

[graph.evaluate(q) for q in clean_up]

constraints_in_db = graph.run("CALL db.constraints()").to_data_frame()
if 'name' in constraints_in_db:
    constraints_in_db = constraints_in_db['name'].tolist()
    for constraint in constraints_in_db:
        graph.run(f'''DROP CONSTRAINT {constraint}''')

In [25]:
constraints = ['CREATE CONSTRAINT doc_constraint ON (review:REVIEW) ASSERT review.id IS UNIQUE;',
               'CREATE CONSTRAINT phrase_constraint ON (phrase:PHRASE) ASSERT phrase.id IS UNIQUE;']
graph.run(constraints[0])
graph.run(constraints[1])

ValueError: Missing keys

In [None]:
#tx = graph.begin()
for row in tqdm(edges):
    graph.run('''

    MERGE (a:REVIEW {id:$doc_id}) 
    MERGE (b:PHRASE {text:$phrase, id:$phrase_id})
    with a, b
      MERGE (a)<-[r:is_in {weight:$weight}]-(b)
    ''', parameters = {'doc_id': row['document'], 
                       'phrase': row['phrase'], 
                       'phrase_id':str(row['phrase_id']),
                       'weight':row['weight']})
#tx.commit()

  7%|▋         | 13151/198215 [03:10<39:29, 78.11it/s]  

## phrase relationships

In [None]:
q = '''
MATCH (p1:PHRASE)-[w1:is_in]->(r:REVIEW) <-[w2:is_in]-(p2)
where id(p1) < id(p2)
RETURN p1.text,p2.text, count(r) as common_docs, sum(w1.weight*w2.weight) as sum_weight_prod 
'''
phrase_rels =graph.run(q).to_data_frame()

In [None]:
phrase_rels

In [None]:
px.scatter(data_frame=phrase_rels,
           x='common_docs',  y='sum_weight_prod',
          hover_data=['p1.text', 'p2.text'], 
           marginal_x='histogram',
          marginal_y='histogram')

In [None]:
set_phrase_relationships = '''\
MATCH (p1:PHRASE)-[w1:is_in]->(r:REVIEW) <-[w2:is_in]-(p2)
where id(p1) < id(p2)
with p1,p2, count(r) as common_docs, sum(w1.weight*w2.weight) as sum_weight_prod 
MERGE (p1)-[:phrase_relation {common_docs:common_docs, sum_weight_prod:sum_weight_prod}]-(p2)
'''

In [None]:
graph.evaluate(set_phrase_relationships)

## community detection on phrase relationships

In [None]:
louvain = '''CALL gds.louvain.stream({
  nodeProjection:"PHRASE",
  relationshipProjection:{
  phrase_relation:{
  type:"phrase_relation",
  //properties:"sum_weight_prod",
  orientation:"UNDIRECTED"}},
  includeIntermediateCommunities:True
})
YIELD nodeId,communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).id as phrase_id, 
gds.util.asNode(nodeId).text as phrase, 
communityId, 
intermediateCommunityIds
'''

In [None]:
louvain_communities = graph.run(louvain).to_data_frame()

In [None]:
# intermediate communities to separare columns
number_of_intermediates = len(louvain_communities['intermediateCommunityIds'].iloc[0])
louvain_communities = louvain_communities.join(
    louvain_communities['intermediateCommunityIds']\
    .apply(lambda x: pd.Series(x, 
                               index=[f'community_{i}' for i in range(number_of_intermediates)])))
louvain_communities = louvain_communities.drop(columns=['intermediateCommunityIds'])

In [None]:
louvain_communities['phrase_id'] = louvain_communities['phrase_id'].astype(int)

In [None]:
louvain_communities

In [None]:
louvain_communities.groupby('community_0')['phrase'].count()

In [None]:
louvain_communities.groupby('community_0')['phrase'].apply(list).tolist()

In [None]:
edges_df = pd.DataFrame(edges)

edges_df['review_id'] = edges_df['review_id'].astype(int)

edges_df = edges_df.merge(df[['overall_rating','company', 
                   'work_life_balance' ,
                   'culture_values', 
                   'career_opportunities',
                  'compensation_and_benefits',
                  'senior_management']], 
               left_on='review_id',
              right_index=True)

edges_df = edges_df.merge(louvain_communities.drop(columns=['phrase']), on='phrase_id')

In [None]:
clusters = edges_df.groupby('communityId').agg({'overall_rating': [np.mean, np.std],
                                                'company': [lambda x: Counter(x).most_common()],
                                                'phrase':[lambda x: Counter(x).most_common()],
                                                'work_life_balance': [np.mean, np.std],
                                                'culture_values': [np.mean, np.std],
                                                'career_opportunities': [np.mean, np.std],
                                                'compensation_and_benefits': [np.mean, np.std],
                                                'senior_management': [np.mean, np.std]})

In [None]:
clusters.sort_values(by=('overall_rating','mean'))

In [None]:
clusters[('phrase','<lambda>')].explode().apply(pd.Series).head(20)

In [None]:
clusters[('phrase','<lambda>')].explode().apply(pd.Series).tail(20)

In [None]:
label_prop = '''\
CALL gds.labelPropagation.stream({
  nodeProjection:"PHRASE",
  relationshipProjection:{
  phrase_relation:{
  type:"phrase_relation",
  properties:"sum_weight_prod",
  orientation:"UNDIRECTED"}}
})
YIELD nodeId,communityId
RETURN gds.util.asNode(nodeId).id as phrase_id, 
gds.util.asNode(nodeId).text as phrase, 
communityId
'''

In [None]:
label_prop_communities = graph.run(label_prop).to_data_frame()

In [None]:
label_prop_communities

In [None]:
label_prop_communities.groupby('communityId')['phrase'].count()

In [None]:
rel_q = '''
MATCH (p1:PHRASE)-[w1:is_in]->(r:REVIEW) <-[w2:is_in]-(p2)
where id(p1) < id(p2)
RETURN id(p1) as source, id(p2) as target, count(r) as common_docs, sum(w1.weight*w2.weight) as sum_weight_prod 
'''