In [1]:
import spacy.cli
spacy.cli.download("en_core_web_md")

import pandas as pd
import string
import spacy
import nltk
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from nltk.corpus import stopwords
import en_core_web_md
nltk.download('wordnet')
nltk.download('stopwords')

Collecting en-core-web-md==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  from pandas.core import (
[nltk_data] Downloading package wordnet to /Users/pgalli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pgalli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data = pd.read_csv('twitter_data_full.csv')

In [4]:
nlp = en_core_web_md.load(disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    '''This function converts terms to their base forms.'''
    output = []
    for sent in texts:
        doc = nlp(str(sent))
        output.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return output


text_list = data['clean_post'].tolist()
tokenized_text = lemmatization(text_list)

In [5]:
dictionary = corpora.Dictionary(tokenized_text)
if len(dictionary) > 0:
    doc_term_matrix = [dictionary.doc2bow(text) for text in tokenized_text]
else:
    doc_term_matrix = []

## 10 Topics

In [6]:
if doc_term_matrix:
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=10,
        random_state=100,
        chunksize=1000,
        passes=50,
        iterations=100
    )
    print(lda_model.print_topics())
else:
    print("Document term matrix is empty, cannot build LDA model.")

[(0, '0.057*"year" + 0.049*"day" + 0.032*"today" + 0.017*"old" + 0.016*"name" + 0.014*"hour" + 0.014*"nice" + 0.013*"lot" + 0.013*"talk" + 0.012*"stupid"'), (1, '0.079*"thank" + 0.034*"happy" + 0.028*"much" + 0.021*"right" + 0.018*"week" + 0.014*"cool" + 0.013*"birthday" + 0.012*"little" + 0.012*"gopayt" + 0.012*"strong"'), (2, '0.039*"award" + 0.018*"tonight" + 0.016*"word" + 0.016*"ill" + 0.014*"season" + 0.012*"part" + 0.012*"show" + 0.011*"sad" + 0.011*"dream" + 0.011*"change"'), (3, '0.042*"twitter" + 0.028*"u" + 0.015*"one" + 0.014*"business" + 0.014*"least" + 0.012*"bed" + 0.012*"dad" + 0.011*"hope" + 0.010*"brain" + 0.010*"sweet"'), (4, '0.050*"heart" + 0.047*"depression" + 0.027*"real" + 0.018*"vote" + 0.016*"kid" + 0.014*"mom" + 0.013*"movie" + 0.011*"light" + 0.011*"care" + 0.009*"free"'), (5, '0.022*"trump" + 0.019*"family" + 0.019*"night" + 0.017*"return" + 0.016*"last" + 0.016*"morning" + 0.013*"sleep" + 0.013*"pbb" + 0.013*"problem" + 0.012*"home"'), (6, '0.086*"people" 

In [7]:
total_docs = len(doc_term_matrix)
if total_docs > 0:
    print('\nPerplexity:', lda_model.log_perplexity(
        doc_term_matrix, total_docs=total_docs))
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=tokenized_text,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence:', coherence_lda)
else:
    print("No documents to evaluate coherence or perplexity.")


Perplexity: -10.208149988491838
Coherence: 0.5353578244090614


In [9]:
if total_docs > 0:
    pyLDAvis.enable_notebook()
    vis_data = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
    vis_data
    pyLDAvis.save_html(vis_data, '10_twitter_lda_visualization.html')
else:
    print("No documents for visualization.")

## 20 Topics

In [10]:
if doc_term_matrix:
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=20,
        random_state=100,
        chunksize=1000,
        passes=50,
        iterations=100
    )
    print(lda_model.print_topics())
else:
    print("Document term matrix is empty, cannot build LDA model.")

[(0, '0.070*"week" + 0.052*"sleep" + 0.051*"woman" + 0.041*"second" + 0.041*"law" + 0.027*"office" + 0.027*"death" + 0.021*"awesome" + 0.016*"disorder" + 0.013*"sign"'), (1, '0.155*"friend" + 0.062*"nice" + 0.041*"idea" + 0.035*"school" + 0.035*"rt" + 0.029*"bitch" + 0.022*"self" + 0.019*"autism" + 0.014*"american" + 0.013*"option"'), (2, '0.169*"thing" + 0.064*"big" + 0.037*"part" + 0.034*"business" + 0.028*"top" + 0.027*"eye" + 0.022*"war" + 0.020*"easy" + 0.019*"side" + 0.017*"list"'), (3, '0.064*"well" + 0.051*"ill" + 0.039*"question" + 0.037*"point" + 0.036*"funny" + 0.034*"help" + 0.033*"story" + 0.030*"place" + 0.026*"line" + 0.022*"sale"'), (4, '0.062*"baby" + 0.046*"mind" + 0.040*"boy" + 0.039*"fan" + 0.038*"door" + 0.033*"head" + 0.023*"difference" + 0.019*"style" + 0.019*"suicide" + 0.017*"less"'), (5, '0.088*"fuck" + 0.076*"shit" + 0.041*"one" + 0.031*"bed" + 0.024*"tired" + 0.024*"anxiety" + 0.020*"skin" + 0.019*"happiness" + 0.018*"post" + 0.017*"election"'), (6, '0.133*"

In [11]:
total_docs = len(doc_term_matrix)
if total_docs > 0:
    print('\nPerplexity:', lda_model.log_perplexity(
        doc_term_matrix, total_docs=total_docs))
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=tokenized_text,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence:', coherence_lda)
else:
    print("No documents to evaluate coherence or perplexity.")


Perplexity: -20.51739341115975
Coherence: 0.6281397952546397


In [12]:
if total_docs > 0:
    pyLDAvis.enable_notebook()
    vis_data = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
    vis_data
    pyLDAvis.save_html(vis_data, '20_twitter_lda_visualization.html')
else:
    print("No documents for visualization.")

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


## 30 Topics

In [13]:
if doc_term_matrix:
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=30,
        random_state=100,
        chunksize=1000,
        passes=50,
        iterations=100
    )
    print(lda_model.print_topics())
else:
    print("Document term matrix is empty, cannot build LDA model.")

[(11, '0.159*"depression" + 0.092*"real" + 0.072*"big" + 0.064*"amp" + 0.053*"tomorrow" + 0.041*"long" + 0.031*"free" + 0.026*"number" + 0.024*"anxiety" + 0.024*"account"'), (12, '0.126*"twitter" + 0.061*"lot" + 0.051*"black" + 0.044*"dream" + 0.040*"high" + 0.037*"rt" + 0.034*"song" + 0.031*"small" + 0.030*"sweet" + 0.028*"death"'), (4, '0.100*"baby" + 0.076*"part" + 0.073*"show" + 0.046*"social" + 0.040*"medium" + 0.020*"daily" + 0.017*"normal" + 0.012*"emotional" + 0.010*"ride" + 0.010*"pink"'), (14, '0.067*"service" + 0.059*"positive" + 0.055*"body" + 0.033*"rest" + 0.031*"daddy" + 0.027*"thinking" + 0.025*"report" + 0.024*"personal" + 0.024*"quick" + 0.023*"move"'), (18, '0.166*"girl" + 0.094*"nice" + 0.088*"talk" + 0.054*"end" + 0.035*"sex" + 0.015*"professional" + 0.014*"gorgeous" + 0.014*"bag" + 0.011*"bunch" + 0.011*"awful"'), (7, '0.087*"return" + 0.058*"job" + 0.051*"help" + 0.036*"war" + 0.034*"tired" + 0.033*"h" + 0.033*"session" + 0.025*"step" + 0.025*"rich" + 0.024*"pret

In [14]:
total_docs = len(doc_term_matrix)
if total_docs > 0:
    print('\nPerplexity:', lda_model.log_perplexity(
        doc_term_matrix, total_docs=total_docs))
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=tokenized_text,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence:', coherence_lda)
else:
    print("No documents to evaluate coherence or perplexity.")


Perplexity: -27.074841283548224
Coherence: 0.646720444535249


In [15]:
if total_docs > 0:
    pyLDAvis.enable_notebook()
    vis_data = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
    vis_data
    pyLDAvis.save_html(vis_data, '30_twitter_lda_visualization.html')
else:
    print("No documents for visualization.")