In [1]:
from utils import read_records, create_collection
import pandas as pd

collection = create_collection("test")
tweets = [tweet for tweet in read_records(collection)]
df = pd.DataFrame(tweets)
df = df.set_index('_id')

In [2]:
from IPython.display import display

df.head()

Unnamed: 0_level_0,text,withheld
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1531676295154761732,RT @MoCoCouncilMD: Applications for the fourth...,
1531676283058458630,RT @What46HasDone: President Biden has signed ...,
1531676282802561024,RT @NHSDigital: Missed our latest blog?\n\nSen...,
1531676271679356928,RT @ScottyUnchained: The government has respon...,
1531676271024824322,"RT @withyoumarktuan: Hello, everyone.\n\nThe C...",


### Data preprocessing
- Punctuation removal
- Lowercasing
- Removing hashtags & mentions
- Removing links

- Tokenization
- Removing Stop words
- Stemming/Lemmatization

### LDA Hyperparameters

- Alpha - higher means documents are assumed to be made up of more topics - try 0.1
- Beta - controls distribution of words per topic, higher means topics have more words

*default = 1.0 for both*




In [3]:
# creating a series

serie = df['text']

print(serie.head())
print(type(serie))

_id
1531676295154761732    RT @MoCoCouncilMD: Applications for the fourth...
1531676283058458630    RT @What46HasDone: President Biden has signed ...
1531676282802561024    RT @NHSDigital: Missed our latest blog?\n\nSen...
1531676271679356928    RT @ScottyUnchained: The government has respon...
1531676271024824322    RT @withyoumarktuan: Hello, everyone.\n\nThe C...
Name: text, dtype: object
<class 'pandas.core.series.Series'>


In [4]:
import re

def clean_tweet(pandas_series):
    '''Returns a cleaned Pandas Series object with the following changes:
    1- Removal of hashtags, new lines, retweet mentions, links and punctuation;
    2- lowercasing.
    '''
    regex_transformations = [r'\n','(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)',
                             'RT', '@[A-Za-z0-9\.-_:]+', '[,\.!?:;/]']

    for regex in regex_transformations:
        pandas_series = pandas_series.map(lambda x: re.sub(regex, '', x))

    pandas_series = pandas_series.map(lambda x: x.lower())
    
    return pandas_series

s = clean_tweet(serie)

s.head(10)

_id
1531676295154761732      applications for the fourth round of montgom...
1531676283058458630      president biden has signed 129 bills into la...
1531676282802561024      missed our latest blogsenior content designe...
1531676271679356928      the government has responded to the petition...
1531676271024824322      hello everyonethe covid-19 situation in thai...
1531676269640929282      the importance of understanding the stages o...
1531676264523702272      "i think it's outrageous that we as a countr...
1531676263953440768      messi has #longcovid “the argentine tested p...
1531676260245676041    "covid-19 helped sex workers in belgium make h...
1531676258274185217      hello everyonethe covid-19 situation in thai...
Name: text, dtype: object

In [4]:
# Removing hashtags & mentions, links, punctuation & lowercasing
import re

# creating clean column
df['clean_text'] = df['text']

# remove new lines
df['clean_text'] = df['clean_text'].map(lambda x: re.sub('\n', ' ', x))
# removing links
df['clean_text'] = df['clean_text'].map(lambda x: re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', x))
# removing RT (retweets)
df['clean_text'] = df['clean_text'].map(lambda x: re.sub('RT', '', x))
# removing mentions (ex: @user123)
df['clean_text'] = df['clean_text'].map(lambda x: re.sub('@[A-Za-z0-9\.-_:]+', '', x))
# remove punctuation
df['clean_text'] =  df['clean_text'].map(lambda x: re.sub('[,\.!?]', '', x))
# lowercase
df['clean_text'] = df['clean_text'].map(lambda x: x.lower())

df['clean_text'].head(10)

_id
1531676295154761732      applications for the fourth round of montgom...
1531676283058458630      president biden has signed 129 bills into la...
1531676282802561024      missed our latest blog  senior content desig...
1531676271679356928      the government has responded to the petition...
1531676271024824322      hello everyone  the covid-19 situation in th...
1531676269640929282      the importance of understanding the stages o...
1531676264523702272      "i think it's outrageous that we as a countr...
1531676263953440768      messi has #longcovid   “the argentine tested...
1531676260245676041    "covid-19 helped sex workers in belgium make h...
1531676258274185217      hello everyone  the covid-19 situation in th...
Name: clean_text, dtype: object

In [5]:
# tokenization
import gensim
from gensim.utils import simple_preprocess

def tweet_to_words(tweets):
    for tweet in tweets:
        yield(gensim.utils.simple_preprocess(str(tweet), deacc=True))
        # deacc=True removes punctuation

data = df.clean_text.values.tolist()
data_words = list(tweet_to_words(data))

print(data_words[:1][0][:30])

['applications', 'for', 'the', 'fourth', 'round', 'of', 'montgomery', 'county', 'covid', 'rent', 'relief', 'program', 'are', 'now', 'open', 'and', 'available', 'for']


In [6]:
# Bigram and Trigram models

# build the models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
# Remove stopwords, make bigrams and lemmatization
import nltk
from nltk.corpus import stopwords
import spacy

nltk.download('stopwords')

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, 
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/devilgoncalo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['application', 'fourth', 'round', 'county', 'rent', 'relief', 'program', 'open', 'available']]


In [8]:
# Corpus and dictionary
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]


### GSDMM

In [10]:
# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# cast tweets to numpy array
# docs = texts.to_numpy()

# create variable containing length of dictionary/vocab
vocab_length = len(id2word)

# initialize GSDMM
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model
y = gsdmm.fit(texts, vocab_length)

In stage 0: transferred 124403 clusters with 15 clusters populated
In stage 1: transferred 77457 clusters with 15 clusters populated
In stage 2: transferred 42642 clusters with 15 clusters populated
In stage 3: transferred 25129 clusters with 15 clusters populated
In stage 4: transferred 18664 clusters with 15 clusters populated
In stage 5: transferred 15960 clusters with 15 clusters populated
In stage 6: transferred 14495 clusters with 15 clusters populated
In stage 7: transferred 13788 clusters with 15 clusters populated
In stage 8: transferred 12915 clusters with 15 clusters populated
In stage 9: transferred 12423 clusters with 15 clusters populated
In stage 10: transferred 11944 clusters with 15 clusters populated
In stage 11: transferred 11752 clusters with 15 clusters populated
In stage 12: transferred 11640 clusters with 15 clusters populated
In stage 13: transferred 11453 clusters with 15 clusters populated
In stage 14: transferred 11334 clusters with 15 clusters populated


In [11]:
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 20)

Number of documents per topic : [ 5783 11976  4762 25076 27297  6402  2729 12455  5041  7804  4085  3537
  5324 15438  2360]
Most important clusters (by number of docs inside): [ 4  3 13  7  1  9  5  0 12  8  2 10 11  6 14]

Cluster 4 : [('war', 9962), ('price', 4469), ('go', 2336), ('say', 2176), ('ukraine', 2063), ('people', 1939), ('logistic', 1896), ('get', 1862), ('country', 1489), ('make', 1379), ('russian', 1377), ('weapon', 1252), ('want', 1223), ('think', 1210), ('know', 1063), ('take', 1062), ('time', 1042), ('ukrainian', 1020), ('year', 996), ('need', 934)]

Cluster 3 : [('football', 15053), ('get', 2233), ('play', 2031), ('player', 1979), ('club', 1911), ('team', 1838), ('go', 1711), ('good', 1579), ('game', 1523), ('year', 1325), ('price', 1239), ('fan', 1198), ('people', 1179), ('day', 1150), ('know', 1111), ('season', 1088), ('time', 1067), ('make', 1057), ('think', 1032), ('see', 1007)]

Cluster 13 : [('covid', 10271), ('vaccine', 2188), ('pandemic', 1598), ('people', 1

In [12]:
# Import wordcloud library
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution

# Select topic you want to output as dictionary (using topic_number)
topic_dict = sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values]

# Generate a word cloud image
wordcloud = WordCloud(background_color='#fcf2ed', 
                            width=1800,
                            height=700,
                            font_path=path_to_font,
                            colormap='flag').generate_from_frequencies(topic_dict)

# Print to screen
fig, ax = plt.subplots(figsize=[20,10])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

NameError: name 'topic_number' is not defined

### LDA

In [10]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.031*"world" + 0.026*"new" + 0.020*"real_life" + 0.019*"enter" + '
  '0.019*"target" + 0.018*"entire" + 0.018*"people" + 0.017*"metaverse_find" + '
  '0.017*"best_defender" + 0.016*"covid"'),
 (1,
  '0.022*"still" + 0.021*"oil" + 0.020*"vote" + 0.019*"well" + 0.014*"company" '
  '+ 0.014*"love" + 0.011*"set" + 0.011*"way" + 0.011*"gouging" + '
  '0.011*"market"'),
 (2,
  '0.074*"football" + 0.018*"get" + 0.015*"sale" + 0.015*"logistic" + '
  '0.015*"time" + 0.014*"go" + 0.010*"price" + 0.010*"box" + 0.009*"good" + '
  '0.009*"team"'),
 (3,
  '0.048*"war" + 0.020*"pleasure" + 0.019*"gemini" + 0.017*"want" + '
  '0.017*"gas" + 0.013*"sell" + 0.013*"make" + 0.012*"price" + 0.010*"talk" + '
  '0.009*"say"'),
 (4,
  '0.153*"price" + 0.018*"live" + 0.017*"amp" + 0.016*"club" + 0.014*"eth" + '
  '0.012*"floor" + 0.011*"high" + 0.010*"get" + 0.010*"buy" + 0.008*"last"')]


In [12]:
# Measure models coherence score
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass') 
#'c_v' is the best coherence score, but slow

coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.380162397886737

Coherence Score:  -6.604959767114778


In [11]:
# Visualize Topics
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  from imp import reload
  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### LDA Hyperparameter tuning

Finding the ideal number of topics, beta and alpha based on the coherence score.

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, 
                                         texts=data_lemmatized,
                                         dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()