# Load Library

In [215]:
# import tensorflow v2
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

import torch

# import keyword extraction
import spacy
import pytextrank

# import data manipulation lib
from scipy import spatial
from tqdm import tqdm
import json
import random
import operator
import pandas as pd
import numpy as np
import random

# import networkx 
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities, asyn_lpa_communities
import matplotlib.pyplot as plt

import copy
import re
twitter_username_re = re.compile(r'@([A-Za-z0-9_]+)')
twitter_url_re = re.compile(r'https?:\/\/\S*')
twitter_hashtag_re = re.compile(r'#(\w+)')
twitter_long_chars_re = re.compile(r'[A-Za-z0-9_]{15,}')

# process twitter
import tweepy

# Load Twitter Public Thread Data

In [268]:
texts_to_analyze = set()
with open('2_elon.json') as f:
    obj = json.load(f)
    for o in obj:
        original_text = o['text']
        text = re.sub(twitter_username_re, '', original_text)
        text = re.sub(twitter_url_re, '', text)
        text = re.sub(twitter_hashtag_re, '', text)
        text = re.sub(twitter_long_chars_re, '', text)
        text = text.replace('\n', '. ').replace('&amp;', '').replace('amp;', '').replace('&gt;', '').replace('&lt;', '').strip()
        text = text.lower()
        if 'elonmusk-giveaway.org' in text:
            continue
        if 'www.elon-btc.org' in text:
            continue
        if 'www.elontrust.com' in text:
            continue
        if 'www.elon-btc' in text:
            continue
        if 'elonx .club' in text:
            continue
        if 'musktop' in text:
            continue
        if 'elonmusk-giveaway' in text:
            continue
        if text:
            texts_to_analyze.add(text)
        

texts_to_analyze = list(texts_to_analyze)
#for text in texts_to_analyze:
#    print(text)
len(texts_to_analyze)

1628

In [277]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        keywords = []
        for i, (key, value) in enumerate(node_weight.items()):
            keywords.append(key)
            if i > number:
                break
            
        return keywords
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [278]:
tr4w = TextRank4Keyword()
tr4w.analyze('\n'.join(texts_to_analyze), window_size=3, candidate_pos=['NOUN', 'VERB'],  lower=True)
keywords = tr4w.get_keywords(20)
keywords

['elon',
 'people',
 'dogecoin',
 'edda',
 'coin',
 'buy',
 'doge',
 'money',
 'need',
 'sell',
 'thanks',
 'btc',
 'bitcoin',
 'whales',
 'tesla',
 'crypto',
 'believe',
 'want',
 'know',
 'help',
 'coins',
 'support']

In [84]:
model = hub.load("./universal-sentence-encoder-multilingual-large_3")

In [253]:
embeddings = model(texts_to_analyze)

In [254]:
embeddings_tensor = torch.Tensor(embeddings.numpy())

In [255]:
embeddings_trans = torch.transpose(embeddings_tensor, 0, 1)

In [256]:
scores = torch.matmul(embeddings_tensor, embeddings_trans)

In [257]:
cond_scores = torch.where(scores > 0.6, 1 , 0)

In [258]:
graph_scores = torch.triu(cond_scores, diagonal=1)

In [259]:
G = nx.Graph()
G.add_nodes_from(range(len(texts_to_analyze)))

In [260]:
edge_list = torch.nonzero(graph_scores).numpy()
G.add_edges_from(edge_list)

In [261]:
c = list(nx.algorithms.community.label_propagation.asyn_lpa_communities(G))

In [262]:
centroids = {}
clusters_examples = {}
for i in range(len(c)):
    sentence_ids = c[i]
    sentences = []
    for sid in sentence_ids:
        s = texts_to_analyze[sid]
        sentences.append(s)
    centroid = np.mean(model(sentences).numpy(), 0)
    centroids[i] = centroid
    clusters_examples[i] = sentences

In [263]:
# reassign the sentences
def merge_classes(centroids, clusters_examples):
    def calculate_similarity(matrix, vector):
        """
        Compute the cosine distances between each row of matrix and vector.
        """
        v = vector.reshape(1, -1)
        return 1 - spatial.distance.cdist(matrix, v, 'cosine').reshape(-1)

    deleted = set()
    new_clusters_examples = copy.deepcopy(clusters_examples)
    new_centroids = copy.deepcopy(centroids)
    centroid_ids = list(centroids.keys())
    centroid_ids_len = len(centroid_ids) 
    for idx1 in range(centroid_ids_len - 1):
        for idx2 in range(idx1+1, centroid_ids_len):
            center_id1 = centroid_ids[idx1]
            center_id2 = centroid_ids[idx2]
            centre1 = centroids[center_id1]
            centre2 = centroids[center_id2]
            scores = calculate_similarity([centre1], centre2)
            score = scores[0]
            if score >= 0.6:
                #print(" merging ", center_id1, " and ", center_id2)
                id_to_delete = max(center_id1, center_id2)
                id_to_keep = min(center_id1, center_id2)
                deleted.add(id_to_delete)
                new_clusters_examples[id_to_keep].extend(clusters_examples[id_to_delete])

    for d in deleted:
        del new_centroids[d] 
        del new_clusters_examples[d] 
    clusters_examples = new_clusters_examples
    centroids = new_centroids
    
    return centroids, clusters_examples

centroids, clusters_examples = merge_classes(centroids, clusters_examples)

In [283]:
count = 0
with open("output.txt", "w") as f:
    for c in clusters_examples:
        sentences = clusters_examples[c]
        if len(sentences) > 3:
            tr4w.analyze('\n'.join(sentences), window_size=3, candidate_pos=['NOUN', 'VERB'],  lower=True)
            keywords = tr4w.get_keywords(1)

            f.write(" ".join(keywords) + "\n")

            f.write("----------------\n")
            for s in sentences:
                f.write(s + "\n")
            f.write("\n")
            count += 1