In [1]:

from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from pprint import pprint

import re
import pyLDAvis.gensim
import pandas as pd

In [2]:
tweets = pd.read_csv('tweets_negative.csv')


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

def remove_mentions(text):
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_special_chars(text):
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return clean_text


tweets['text'] = tweets['text'].str.lower()
tweets['text'] = tweets['text'].apply(remove_urls)
tweets['text'] = tweets['text'].apply(remove_stopwords)
tweets['text'] = tweets['text'].apply(remove_mentions)
tweets['text'] = tweets['text'].apply(remove_special_chars)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yaori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yaori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
tweets.head()

Unnamed: 0,text
0,congratulations mayorros reelection hope camp...
1,good point hs2 nt environmental win especial...
2,lib dem surge antihs2 lib dem stephen lambert ...
3,surprisentoo many snouts trough incl crayonp...
4,hs2 meadowhall fiasco syorks would metro mayo...


In [5]:
tweets = tweets.text.values.tolist()

# Turn the list of string into a list of tokens
tweets = [t.split() for t in tweets]

In [6]:
tweets[0]

['congratulations',
 'mayorros',
 'reelection',
 'hope',
 'campaign',
 'damaging',
 'untruthful',
 'hs2',
 'new',
 'route',
 'meets',
 'equal',
 'success']

In [7]:
id2word = Dictionary(tweets)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in tweets]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]


In [17]:
# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=5,
                     random_state=0,
                     chunksize=100,
                     alpha='auto',
                     per_word_topics=True)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.073*"hs2" + 0.011*"rail" + 0.010*"government" + 0.010*"people" + '
  '0.009*"going" + 0.009*"protesters" + 0.009*"like" + 0.008*"still" + '
  '0.007*"news" + 0.007*"case"'),
 (1,
  '0.030*"hs2" + 0.030*"nn" + 0.018*"report" + 0.014*"cost" + 0.013*"scrap" + '
  '0.012*"construction" + 0.011*"costs" + 0.011*"s" + 0.009*"hs2ltd" + '
  '0.007*"lockdown"'),
 (2,
  '0.031*"hs2" + 0.023*"n" + 0.016*"mps" + 0.014*"stop" + 0.012*"post" + '
  '0.010*"another" + 0.010*"please" + 0.008*"anti" + 0.007*"bosses" + '
  '0.006*"trees"'),
 (3,
  '0.116*"hs2" + 0.014*"borisjohnson" + 0.013*"amp" + 0.013*"project" + '
  '0.011*"money" + 0.011*"s" + 0.010*"nt" + 0.009*"need" + 0.009*"work" + '
  '0.009*"time"'),
 (4,
  '0.039*"hs2" + 0.034*"stophs2" + 0.016*"workers" + 0.012*"petition" + '
  '0.009*"via" + 0.008*"sites" + 0.008*"take" + 0.006*"destruction" + '
  '0.006*"ever" + 0.006*"scrapped"')]


In [18]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tweets, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3110218160892245


In [19]:

#Creating Topic Distance Visualization
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, id2word)