## gensim - LDA

### Imports

In [78]:
# Experimenting with gensim
from gensim import models, corpora
from nlp import *

### Load Data and Preprocessing

In [79]:
data_file = "../Dataset/twitter.csv"

init_nltk_downloads()
df = load_twitter_csv(data_file)


df['tweet'] = df['tweet'].apply(furnish)

### Vectorisation - using Bag-of-Words

In [80]:
# Associate each word with a unique integer ID
vocabulary = corpora.Dictionary([tweet.split() for tweet in df['tweet']])
vocabulary.filter_extremes(no_below=5, no_above=0.5)

# Using the vocabulary as feature labels, create bag-of-word vectors for tweets
bow_corpus = [vocabulary.doc2bow(tweet.split()) for tweet in df['tweet']]

### Vectorisation 2 - using TF-IDF

In [81]:
# TF-IDF model transforms bow vector space to TF-IDF vector space

# TRAINS the model on the corpus
tfidf = models.TfidfModel(bow_corpus)

# APPLY the transformation
# NOTE: only creates model around the corpus - actual transformation are done
# on the fly during iteration
tfidf_corpus = tfidf[bow_corpus]

### Vectoriser 3 - Latent Semantic Indexing (LSI)

In [82]:
# train model for transformation
lsi = models.LsiModel(tfidf_corpus, id2word=vocabulary, num_topics=8)
# Create wrapper over the tfidf wrapper
lsi_corpus = lsi[tfidf_corpus]

#lsi.print_topics(8)

---

### Similarity Query
- Compare how similar a document is to another document
- Usage: Search engines

- `gensim.similarities` uses **cosine measure** - return similarity in range [-1, 1]

In [83]:
from gensim import similarities

# Query - how similar is query_doc to every documents in the corpus?
query_doc = "very cool bots on twitter very nice".split()
query_bow = vocabulary.doc2bow(query_doc)

#### Similarity with TF-IDF

In [84]:
# Index the TF-IDF vector
index = similarities.SparseMatrixSimilarity(tfidf_corpus, len(tfidf_corpus[:]))

# This creates an array with similarity score range from [-1, 1]
sims = index[tfidf[query_bow]]

#### Similarity with LSI

In [85]:
# Index the LSI vector
# NOTE: MatrixSimilarity is only appropriate when WHOLE vector fits into RAM
#     Use Similarity class instead if not (it uses Sparse and MatrixSimilarity)
index = similarities.MatrixSimilarity(lsi_corpus)

sims = index[lsi[query_bow]]

#### Printing Similar Documents

In [86]:
for doc_pos, doc_score in sorted(
    enumerate(sims), key=lambda item: item[1], reverse=True
):
    print(doc_score, df.iloc[doc_pos]['tweet'])

0.9988595 bring light quit comment thread care
0.99882126 b people right peace regardless agree maybe personal responsibility
0.99868613 wow dine incompetent wish incompetent richest man world incompetence btw free speech maybe people like
0.99861723 people stupid
0.99853307 dont people understand lie mar stuff mainly controll government
0.99835944 lmao someone bright
0.99820864 ban hour share clip w u nasa earth guess nasa like beating ben lmao
0.9979204 please help alex jones keep ban alive freedom press infowars network feed live
0.99748576 salary table rich people
0.99745125 much expensive potentially huge fine maybe even criminal charge ban babylon bee satire
0.99742377 apology ban people hunter story proven true
0.9974085 seem realize already free speech warrior country actually matter
0.9973895 agree vote former host celebrity apprentice white house realize worry anyone hose right
0.99731886 let look people bookmark already thought like let see save later
0.9972837 greg people m

---

### LDA

In [None]:
import logging

for handler in logging.root.handlers[:]:
   logging.root.removeHandler(handler)
   
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

In [None]:
from gensim.models import LdaModel

temp = vocabulary[0]  # Load the dictionary
id2word = vocabulary.id2token

model = LdaModel(
    corpus=bow_corpus,
    id2word=id2word,
    chunksize=3000,
    iterations=100,  # How often to loop over each document
    num_topics=10,
    passes=20,   # How often to train model on entire corpus
    eval_every=1,
)

In [93]:
top_topics = model.top_topics(bow_corpus)

# Sum of topic coherence of all topics / num_topics
avg_topic_coherence = sum([t[1] for t in top_topics]) / 10
print(f'Average topic coherence: {avg_topic_coherence}')

2023-01-31 11:29:40,686 : DEBUG : Setting topics to those of the model: LdaModel<num_terms=2202, num_topics=10, decay=0.5, chunksize=3000>
2023-01-31 11:29:40,716 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2023-01-31 11:29:40,723 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2023-01-31 11:29:40,738 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2023-01-31 11:29:40,754 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2023-01-31 11:29:40,769 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2023-01-31 11:29:40,782 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2023-01-31 11:29:40,793 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2023-01-31 11:29:40,803 : INFO : CorpusAccumulator accumulated stats from 8000 documents


Average topic coherence: -5.959005927998815
