## Import Required Libraries

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import string
import re
import nltk  
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [3]:
from nltk import ngrams
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import collections
import math
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

In [4]:
import gensim
from gsdmm import MovieGroupProcess

## Read Tweets Data

In [5]:
df = pd.read_csv('tweets.csv')

## Preprocess Data

Drop unnecessary columns

In [6]:
df.drop(columns = ['user', 'date', 'Tweet Id'], inplace = True)

Remove URLs from data

In [7]:
def remove_urls(text):
    return re.sub(r'http\S+','', text)

df['text'] = df['text'].apply(remove_urls)

Lowercase all alphabets and remove punctuation

In [8]:
df['clean'] = df['text'].str.lower().str.replace('[^\w\s]', ' ').str.replace(' +', ' ').str.strip()

  df['clean'] = df['text'].str.lower().str.replace('[^\w\s]', ' ').str.replace(' +', ' ').str.strip()


In [9]:
df = df.rename(columns={"text": 0, "clean": 1})

Tokenize data

In [10]:
df[1] = df.apply(lambda row: nltk.word_tokenize(row[1]), axis=1)

Remove Stop Words

In [11]:
stop_words = stopwords.words('english')

In [12]:
stop_words.extend(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m','n','o','p','q','r','s','t', 'u', 'v', 'w', 'x', 'y', 'z', "about", "across", "after", "all", "also", "an", "and", "another", "added",
"any", "are", "as", "at", "basically", "be", "because", 'become', "been", "before", "being", "between","both", "but", "by","came","can","come","could","did","do","does","each","else","every","either","especially", "for","from","get","given","gets",
'give','gives',"got","goes","had","has","have","he","her","here","him","himself","his","how","if","in","into","is","it","its","just","lands","like","make","making", "made", "many","may","me","might","more","most","much","must","my","never","provide", 
"provides", "perhaps","no","now","of","on","only","or","other", "our","out","over","re","said","same","see","should","since","so","some","still","such","seeing", "see", "take","than","that","the","their","them","then","there",
"these","they","this","those","through","to","too","under","up","use","using","used", "underway", "very","want","was","way","we","well","were","what","when","where","which","while","whilst","who","will","with","would","you","your", 
'etc', 'via', 'eg'])

In [13]:
stop_words += ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'rt', 'feel', 'give', 'giving', 'help', 'said', 'also', 'gave', 'like', 'going', 'even']

In [14]:
df[1] = df[1].apply(lambda x: [item for item in x if item not in stop_words])

Perform Lemmatization

In [15]:
wordnet_lemmatizer = WordNetLemmatizer()

df[1] = df[1].apply(lambda x: [wordnet_lemmatizer.lemmatize(y) for y in x]) 

In [16]:
docs = df[1].to_numpy()

## Create a Dictionary

In [17]:
# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)

# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create BOW dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

## Create LDA model

In [18]:
# create LDA model using preferred hyperparameters
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=5, 
                                         id2word=dictionary, 
                                         passes=4, 
                                         workers=2,
                                         random_state=21)

In [19]:
from gensim.models import CoherenceModel

View LDA topics

In [20]:
lda_model.show_topics()

[(0,
  '0.037*"good" + 0.036*"done" + 0.029*"new" + 0.029*"time" + 0.023*"best" + 0.022*"na" + 0.021*"gon" + 0.021*"think" + 0.020*"shit" + 0.019*"year"'),
 (1,
  '0.028*"first" + 0.027*"morning" + 0.027*"need" + 0.026*"good" + 0.024*"work" + 0.022*"day" + 0.018*"know" + 0.018*"looking" + 0.017*"people" + 0.016*"hope"'),
 (2,
  '0.055*"love" + 0.055*"one" + 0.031*"day" + 0.028*"thank" + 0.024*"really" + 0.022*"year" + 0.019*"girl" + 0.018*"people" + 0.016*"show" + 0.015*"happy"'),
 (3,
  '0.037*"one" + 0.032*"go" + 0.021*"oh" + 0.018*"need" + 0.018*"lmao" + 0.017*"life" + 0.016*"getting" + 0.015*"let" + 0.015*"buy" + 0.015*"god"'),
 (4,
  '0.041*"amp" + 0.027*"time" + 0.025*"know" + 0.023*"say" + 0.022*"live" + 0.021*"na" + 0.020*"people" + 0.019*"better" + 0.018*"yeah" + 0.017*"back"')]

### Calculate LDA Coherence Score

In [21]:
cm = CoherenceModel(model=lda_model, corpus=bow_corpus, texts=docs, coherence='c_v')
coherence_lda = cm.get_coherence() 
print(coherence_lda)

0.4204149214638339


## Create GSDMM Model

In [22]:
# create variable containing length of dictionary/vocab
vocab_length = len(dictionary)

# initialize GSDMM
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 4357 clusters with 15 clusters populated
In stage 1: transferred 3389 clusters with 15 clusters populated
In stage 2: transferred 3107 clusters with 15 clusters populated
In stage 3: transferred 2923 clusters with 15 clusters populated
In stage 4: transferred 2803 clusters with 15 clusters populated
In stage 5: transferred 2788 clusters with 15 clusters populated
In stage 6: transferred 2747 clusters with 15 clusters populated
In stage 7: transferred 2724 clusters with 15 clusters populated
In stage 8: transferred 2679 clusters with 15 clusters populated
In stage 9: transferred 2652 clusters with 15 clusters populated
In stage 10: transferred 2653 clusters with 15 clusters populated
In stage 11: transferred 2640 clusters with 15 clusters populated
In stage 12: transferred 2606 clusters with 15 clusters populated
In stage 13: transferred 2638 clusters with 15 clusters populated
In stage 14: transferred 2621 clusters with 15 clusters populated


In [23]:
import numpy as np

Display GSDMM topics with top words

In [24]:
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 20)

Number of documents per topic : [335 439 335 335 439 297 464 378 281 416 219 194 273 300 296]
Most important clusters (by number of docs inside): [ 6  4  1  9  7  3  2  0 13  5 14  8 12 10 11]

Cluster 6 : [('oh', 44), ('love', 41), ('thank', 41), ('man', 24), ('happy', 23), ('god', 23), ('beautiful', 19), ('omg', 16), ('best', 16), ('birthday', 15), ('photo', 14), ('today', 13), ('lt', 13), ('look', 13), ('day', 12), ('baby', 12), ('posted', 11), ('literally', 11), ('wait', 11), ('go', 10)]

Cluster 4 : [('na', 96), ('gon', 48), ('wan', 47), ('go', 30), ('back', 24), ('time', 23), ('lol', 19), ('good', 19), ('love', 18), ('let', 18), ('ta', 17), ('play', 16), ('think', 14), ('keep', 13), ('mean', 13), ('really', 13), ('buy', 13), ('life', 12), ('anything', 12), ('man', 12)]

Cluster 1 : [('morning', 64), ('good', 52), ('day', 44), ('one', 30), ('nice', 22), ('night', 21), ('today', 18), ('vote', 18), ('friend', 18), ('elonmusk', 18), ('love', 16), ('please', 15), ('amp', 14), ('last',

Create Lists from GSDMM topics

In [25]:
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.
    
    model: gsdmm instance
    top_clusters:  numpy array containing indices of top_clusters
    n_words: top n number of words to include
    
    '''
    # create empty list to contain topics
    topics = []
    
    # iterate over top n clusters
    for cluster in top_clusters:
        #create sorted dictionary of word distributions
        sorted_dict = sorted(model.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:n_words]
         
        #create empty list to contain words
        topic = []
        
        #iterate over top n words in topic
        for k,v in sorted_dict:
            #append words to topic list
            topic.append(k)
            
        #append topics to topics list    
        topics.append(topic)
    
    return topics

# get topics to feed to coherence model
topics = get_topics_lists(gsdmm, top_index, 20) 

### Calculate GSDMM Coherence Score

In [26]:
# evaluate model using Topic Coherence score
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, corpus=bow_corpus, texts=docs, coherence='c_v')

# get coherence value
coherence_gsdmm = cm_gsdmm.get_coherence()  

print(coherence_gsdmm)

0.3232805025489422


In [27]:
top_words(gsdmm.cluster_word_distribution, top_index, 20)


Cluster 6 : [('oh', 44), ('love', 41), ('thank', 41), ('man', 24), ('happy', 23), ('god', 23), ('beautiful', 19), ('omg', 16), ('best', 16), ('birthday', 15), ('photo', 14), ('today', 13), ('lt', 13), ('look', 13), ('day', 12), ('baby', 12), ('posted', 11), ('literally', 11), ('wait', 11), ('go', 10)]

Cluster 4 : [('na', 96), ('gon', 48), ('wan', 47), ('go', 30), ('back', 24), ('time', 23), ('lol', 19), ('good', 19), ('love', 18), ('let', 18), ('ta', 17), ('play', 16), ('think', 14), ('keep', 13), ('mean', 13), ('really', 13), ('buy', 13), ('life', 12), ('anything', 12), ('man', 12)]

Cluster 1 : [('morning', 64), ('good', 52), ('day', 44), ('one', 30), ('nice', 22), ('night', 21), ('today', 18), ('vote', 18), ('friend', 18), ('elonmusk', 18), ('love', 16), ('please', 15), ('amp', 14), ('last', 13), ('people', 13), ('know', 13), ('thing', 13), ('first', 11), ('lol', 11), ('omicron', 11)]

Cluster 9 : [('done', 52), ('luck', 28), ('good', 25), ('go', 23), ('start', 18), ('getting', 17

In [28]:
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, corpus=bow_corpus, texts=docs, coherence='c_v')
coherence_gsdmm = cm_gsdmm.get_coherence()  
print(coherence_gsdmm)

0.3232805025489422
