# LDA Topic Modeling with gemsim: GDrive or wget

# Setup and Configuration

## Configure Jupyter Notebook

In [None]:
## Configure Jupyter Notebook

# Ignore warnings

import warnings
warnings.filterwarnings('ignore')

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
from IPython.display import Image
from ipywidgets import widgets, interactive

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## [INPUT] Connect Google gDrive to this Jupyter Notebook

In [None]:
# [INPUT REQUIRED]: Authorize access to Google gDrive via popup windows

# Connect this Notebook to your permanent Google Drive
#   so all generated output is saved to permanent storage there

try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("Attempting to attach your Google gDrive to this Colab Jupyter Notebook")
  drive.mount('/gdrive')
else:
  print("Your Google gDrive is attached to this Colab Jupyter Notebook")

Attempting to attach your Google gDrive to this Colab Jupyter Notebook
Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# What is in your GDrive root directory?

!ls 

 atepc_inference.result.json
 checkpoints
 checkpoints.json
'Copy of IPHS 200 Final.pptx'
'Copy of tm_Introduction to topic modeling_20220512.ipynb'
 data.csv
'PyBasa Data Cleanup.ipynb'
'tm_lda_Topic Modeling_iphs2021.ipynb'
'[Working] Youtuber Topic Modeling.ipynb'


In [None]:
# CUSTOMIZE: /gdrive/MyDrive/(path to your project subdir)

%cd /gdrive/MyDrive/"Spring 2022"/"IPHS 300.00"/"Final Project"

/gdrive/MyDrive/Spring 2022/IPHS 300.00/Final Project


In [None]:
# Make sure your cHANGE dIRECTORY command worked and the project directory exists

!pwd

/gdrive/MyDrive/Spring 2022/IPHS 300.00/Final Project


In [None]:
# Verify subdir content (arguments after '-' aLL, lONG, tIME, rEVERSE ORDER)
!ls -altr

# Compare this CLI (Command Line Interface) with what you see in 
#   (a) your Google Colab left margin folder icon view (out of sync: 'sample_data')
#   (b) your browser view via Google GDrive (same content)

total 36458
drwx------ 2 root root     4096 May 13 00:52  checkpoints
-rw------- 1 root root     6322 May 13 15:06  checkpoints.json
-rw------- 1 root root      315 May 13 15:06  atepc_inference.result.json
-rw------- 1 root root   125013 May 13 15:07 'PyBasa Data Cleanup.ipynb'
-rw------- 1 root root   836441 May 13 15:22 'Copy of IPHS 200 Final.pptx'
-rw------- 1 root root   366575 May 13 15:41 'Copy of tm_Introduction to topic modeling_20220512.ipynb'
-rw------- 1 root root  3582437 May 13 16:49 'tm_lda_Topic Modeling_iphs2021.ipynb'
-rw------- 1 root root 31126200 May 13 16:50  data.csv
-rw------- 1 root root  1283926 May 13 18:29 '[Working] Youtuber Topic Modeling.ipynb'


# Data

You have (2) ways to get data in this tutorial, but if you're following
this tutorial just to learn about LDA I encourage you to consider picking a
corpus on a subject that you are familiar with. Qualitatively evaluating the
output of an LDA model is challenging and can require you to understand the
subject matter of your corpus (depending on your goal with the model).

Reference to Compare::

    The NeurIPS corpus contains 1740 documents, and not particularly long ones.

    `website <http://www.cs.nyu.edu/~roweis/data.html>`
    
    So keep in mind that this tutorial is not geared towards efficiency, and be
    careful before applying the code to a large dataset.


## Option (a): Put Text Datafile in your GDrive project directory

If you have a Text Datafile ready to use, just copy it into your GDrive folder that is the project directory for this LDA exercise (listed below)

In [None]:
!ls *.csv

data.csv


In [None]:
yt_data_df = pd.read_csv("data.csv") 

In [None]:
yt_data_df.head()

Unnamed: 0,Id,Channel,Subscribers,Title,CC,URL,Released,Views,Category,Transcript,Length
0,FozCkl1xj-w,JRE Clips,6.28M subscribers,Former CIA Agent Breaks Down Jeffrey Epstein Case,0,https://www.youtube.com/watch?v=FozCkl1xj-w,2 years ago,7.9M views,Blog,the Joe Rogan experience well how about the ot...,13:32
1,RN8yoi-e2yc,Mythical Kitchen,1.9M subscribers,$420 Pizza Hut Stuffed Crust Pizza | Fancy Fas...,1,https://www.youtube.com/watch?v=RN8yoi-e2yc,,2.7M views,Food,"- Oh, that's dirty.\r\n- Wow! - Whoa.\r\n- You...",24:26
2,IugcIAAZJ2M,Munchies,4.59M subscribers,The Iconic $1 Pizza Slice of NYC | Street Food...,0,https://www.youtube.com/watch?v=IugcIAAZJ2M,2 years ago,11M views,Food,if you want good pizza come to st marks it's t...,7:51
3,JiEO6F8i0eU,Parks and Recreation,282K subscribers,Ron Swanson: The Papa of Pawnee | Parks and Re...,0,https://www.youtube.com/watch?v=JiEO6F8i0eU,3 years ago,2.3M views,"Entertainment,Comedy",April where have you been over two phone calls...,10:06
4,1T4XMNN4bNM,Vsauce,17.4M subscribers,What's The Most Dangerous Place on Earth?,1,https://www.youtube.com/watch?v=1T4XMNN4bNM,9 years ago,21M views,Science,"Hey, Vsauce. Michael here. 93% of all the\r\nh...",9:29


In [None]:
yt_tech_df1 = yt_data_df[yt_data_df.Category == "Tech"]
yt_tech_df2 = yt_data_df[yt_data_df.Category == "Tech,Comedy"]
yt_tech_df3 = yt_data_df[yt_data_df.Category == "Tech,News"]
yt_tech_df4 = yt_data_df[yt_data_df.Category == "Tech,Informative"]

yt_tech_df = pd.concat([yt_tech_df1, yt_tech_df2, yt_tech_df3, yt_tech_df4], axis=0)
# yt_tech_df.head(100)
yt_tech_df.shape

(280, 11)

In [None]:
# yt_tech_df.Category.unique()

In [None]:
"""
print("Blog Count:" + str(yt_tech_df['Category'].value_counts()["Blog"]))
print("Food Count:" + str(yt_tech_df['Category'].value_counts()["Food"]))
print("Entertainment,Comedy Count:" + str(yt_tech_df['Category'].value_counts()["Entertainment,Comedy"]))
print("Science Count:" + str(yt_tech_df['Category'].value_counts()["Science"]))
print("Entertainment Count:" + str(yt_tech_df['Category'].value_counts()["Entertainment"]))
print("News Count:" + str(yt_tech_df['Category'].value_counts()["News"]))
print("VideoGames Count:" + str(yt_tech_df['Category'].value_counts()["VideoGames"]))
print("Blog,Comedy Count:" + str(yt_tech_df['Category'].value_counts()["Blog,Comedy"]))
print("Comedy,Entertainment Count:" + str(yt_tech_df['Category'].value_counts()["Comedy,Entertainment"]))
print("Blog,Science Count:" + str(yt_tech_df['Category'].value_counts()["Blog,Science"]))
print("Tech Count:" + str(yt_tech_df['Category'].value_counts()["Tech"]))
print("Tech,Comedy Count:" + str(yt_tech_df['Category'].value_counts()["Tech,Comedy"]))
print("Automobile,Comedy Count:" + str(yt_tech_df['Category'].value_counts()["Automobile,Comedy"]))
print("Informative Count:" + str(yt_tech_df['Category'].value_counts()["Informative"]))
print("Tech,News Count:" + str(yt_tech_df['Category'].value_counts()["Tech,News"]))
print("Automobile Count:" + str(yt_tech_df['Category'].value_counts()["Automobile"]))
print("Tech,Informative Count:" + str(yt_tech_df['Category'].value_counts()["Tech,Informative"]))
print("Food,Entertainment Count:" + str(yt_tech_df['Category'].value_counts()["Food,Entertainment"]))
print("Blog,Entertainment Count:" + str(yt_tech_df['Category'].value_counts()["Blog,Entertainment"]))
print("Entertainment,Blog Count:" + str(yt_tech_df['Category'].value_counts()["Entertainment,Blog"]))
print("Comedy Count:" + str(yt_tech_df['Category'].value_counts()["Comedy"]))
"""

'\nprint("Blog Count:" + str(yt_tech_df[\'Category\'].value_counts()["Blog"]))\nprint("Food Count:" + str(yt_tech_df[\'Category\'].value_counts()["Food"]))\nprint("Entertainment,Comedy Count:" + str(yt_tech_df[\'Category\'].value_counts()["Entertainment,Comedy"]))\nprint("Science Count:" + str(yt_tech_df[\'Category\'].value_counts()["Science"]))\nprint("Entertainment Count:" + str(yt_tech_df[\'Category\'].value_counts()["Entertainment"]))\nprint("News Count:" + str(yt_tech_df[\'Category\'].value_counts()["News"]))\nprint("VideoGames Count:" + str(yt_tech_df[\'Category\'].value_counts()["VideoGames"]))\nprint("Blog,Comedy Count:" + str(yt_tech_df[\'Category\'].value_counts()["Blog,Comedy"]))\nprint("Comedy,Entertainment Count:" + str(yt_tech_df[\'Category\'].value_counts()["Comedy,Entertainment"]))\nprint("Blog,Science Count:" + str(yt_tech_df[\'Category\'].value_counts()["Blog,Science"]))\nprint("Tech Count:" + str(yt_tech_df[\'Category\'].value_counts()["Tech"]))\nprint("Tech,Comedy C

In [None]:
yt_tech_df.head()
yt_tech_df.info()

Unnamed: 0,Id,Channel,Subscribers,Title,CC,URL,Released,Views,Category,Transcript,Length
16,16q8_32M03k,Austin Evans,5.24M subscribers,The Ultimate PlayStation Comparison,1,https://www.youtube.com/watch?v=16q8_32M03k,3 years ago,6M views,Tech,"- Hey guys, this is Austin, and welcome to the...",15:12
23,OX31kZbAXsA,Linus Tech Tips,14.3M subscribers,Does High FPS make you a better gamer? Ft. Shr...,0,https://www.youtube.com/watch?v=OX31kZbAXsA,2 years ago,9.5M views,Tech,you guys loved our last video using a high-spe...,36:12
25,94b2pr9Prog,Hardware Canucks,1.71M subscribers,SteelSeries Arctis 5 - the BEST $99 Gaming Hea...,0,https://www.youtube.com/watch?v=94b2pr9Prog,4 years ago,1.4M views,Tech,[Music] guys the arctas headset from SteelSeri...,10:59
26,R8rmfD9Y5-c,Web Dev Simplified,817K subscribers,8 Must Know JavaScript Array Methods,0,https://www.youtube.com/watch?v=R8rmfD9Y5-c,3 years ago,667K views,Tech,arrays are one of the most common things that ...,10:05
35,5pP8TLwO_Ks,Austin Evans,5.24M subscribers,This Will Kill Your Computer,1,https://www.youtube.com/watch?v=5pP8TLwO_Ks,5 years ago,9.7M views,Tech,"- I'm nervous. Hey guys, this is Austin, and t...",6:00


<class 'pandas.core.frame.DataFrame'>
Int64Index: 280 entries, 16 to 2443
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Id           280 non-null    object
 1   Channel      280 non-null    object
 2   Subscribers  280 non-null    object
 3   Title        280 non-null    object
 4   CC           280 non-null    int64 
 5   URL          280 non-null    object
 6   Released     223 non-null    object
 7   Views        280 non-null    object
 8   Category     280 non-null    object
 9   Transcript   280 non-null    object
 10  Length       280 non-null    object
dtypes: int64(1), object(10)
memory usage: 26.2+ KB


In [None]:
yt_tech_str = '\n\n'.join(yt_tech_df['Transcript'].tolist())

In [None]:
yt_tech_str[:1000]

'- Hey guys, this is Austin, and welcome to the ultimate PlayStation comparison. And of course, how else could we begin, but with the original PlayStation One? In 1994, the world got this,\r\nthe original PlayStation. Now what\'s interesting is, is that this actually\r\nalmost didn\'t even happen. Originally, Sony was\r\ndeveloping the PlayStation as an add-on for the Super Nintendo. But after Nintendo bailed on the deal, they figured, "Hey, we\'ve\r\nalready put the work in, we might as well make\r\nour own game console." And thus, the original\r\nPlayStation was born. It\'s hard to overstate just\r\nhow good of an idea that was. The PlayStation was the\r\nvery first game console to ever hit 100 million sales, and after being sold for ten years, the games continue to come\r\nout for the PlayStation all the way up until 2006. You know, when the PlayStation 3 came out. Now, I have a special\r\nattachment to the PlayStation, as this was my very first game console. However, it actually\r\

*italicized text*
# LDA Model

Introduces Gensim's LDA model and demonstrates its use on the NIPS corpus.


The purpose of this tutorial is to demonstrate how to train and tune an LDA model.

In this tutorial we will:

* Load input data.
* Pre-process that data.
* Transform documents into bag-of-words vectors.
* Train an LDA model.

This tutorial will **not**:

* Explain how Latent Dirichlet Allocation works
* Explain how the LDA model performs inference
* Teach you all the parameters and options for Gensim's LDA implementation

If you are not familiar with the LDA model or how to use it in Gensim, I (Olavur Mortensen)
suggest you read up on that before continuing with this tutorial. Basic
understanding of the LDA model should suffice. Examples:

* `Introduction to Latent Dirichlet Allocation <http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation>`_
* Gensim tutorial: `sphx_glr_auto_examples_core_run_topics_and_transformations.py`
* Gensim's LDA model API docs: :py:class:`gensim.models.LdaModel`

I would also encourage you to consider each step when applying the model to
your data, instead of just blindly applying my solution. The different steps
will depend on your data and possibly your goal with the model.





## Read Textfile

In [None]:
len(yt_tech_str)

3519767

In [None]:
print(yt_tech_str[:1000])

- Hey guys, this is Austin, and welcome to the ultimate PlayStation comparison. And of course, how else could we begin, but with the original PlayStation One? In 1994, the world got this,
the original PlayStation. Now what's interesting is, is that this actually
almost didn't even happen. Originally, Sony was
developing the PlayStation as an add-on for the Super Nintendo. But after Nintendo bailed on the deal, they figured, "Hey, we've
already put the work in, we might as well make
our own game console." And thus, the original
PlayStation was born. It's hard to overstate just
how good of an idea that was. The PlayStation was the
very first game console to ever hit 100 million sales, and after being sold for ten years, the games continue to come
out for the PlayStation all the way up until 2006. You know, when the PlayStation 3 came out. Now, I have a special
attachment to the PlayStation, as this was my very first game console. However, it actually
wasn't the only version. T

## Split the Book/Corpus into Paragraphs/Documents

In [None]:
book_parags_ls = yt_tech_str.split('\n\n')
book_parags_ls[:3]

['- Hey guys, this is Austin, and welcome to the ultimate PlayStation comparison. And of course, how else could we begin, but with the original PlayStation One? In 1994, the world got this,\r\nthe original PlayStation. Now what\'s interesting is, is that this actually\r\nalmost didn\'t even happen. Originally, Sony was\r\ndeveloping the PlayStation as an add-on for the Super Nintendo. But after Nintendo bailed on the deal, they figured, "Hey, we\'ve\r\nalready put the work in, we might as well make\r\nour own game console." And thus, the original\r\nPlayStation was born. It\'s hard to overstate just\r\nhow good of an idea that was. The PlayStation was the\r\nvery first game console to ever hit 100 million sales, and after being sold for ten years, the games continue to come\r\nout for the PlayStation all the way up until 2006. You know, when the PlayStation 3 came out. Now, I have a special\r\nattachment to the PlayStation, as this was my very first game console. However, it actually\r

In [None]:
len(book_parags_ls)

280

In [None]:
book_parags_ls[100]
print('\n')
len(book_parags_ls[100])







13860

In [None]:
#Delete any paragraphs shorter than MIN_LEN_PARAG

MIN_LEN_TOKEN = 2
MIN_LEN_PARAG = 5
MIN_LEN_DOC = 1000

# Delete any paragraphs shorter than MIN_LEN_PARAG
#book_parags_ls = [x for x in book_parags_ls if len(x) > MIN_LEN_PARAG]

# Trim any leading/trailing/multiple embedded whitespaces
#book_parags_ls = [' '.join(x.split()) for x in book_parags_ls]

#len(book_parags_ls)
#len(book_parags_ls)

In [None]:
# Agglomerate paragraphs into Documents of MIN_LEN_DOC=1000 chars

parag_ct = len(book_parags_ls)

doc_now_str = ''
doc_now_len = 0
docs_ls = []

for i in range(parag_ct):
  # print(f'Processing Paragraph #{i}')
  parag_now_str = book_parags_ls[i]
  doc_now_str += parag_now_str
  doc_now_len += len(parag_now_str)
  if doc_now_len > MIN_LEN_DOC:
    docs_ls.append(doc_now_str)
    doc_now_str = ''
    doc_now_len = 0

docs_ls[-1] += doc_now_str

print(f'There are now {len(docs_ls)} Documents of {MIN_LEN_DOC} chars or more')

There are now 267 Documents of 1000 chars or more


In [None]:
# View the first 5 docs

docs_ls[:5]

['- Hey guys, this is Austin, and welcome to the ultimate PlayStation comparison. And of course, how else could we begin, but with the original PlayStation One? In 1994, the world got this,\r\nthe original PlayStation. Now what\'s interesting is, is that this actually\r\nalmost didn\'t even happen. Originally, Sony was\r\ndeveloping the PlayStation as an add-on for the Super Nintendo. But after Nintendo bailed on the deal, they figured, "Hey, we\'ve\r\nalready put the work in, we might as well make\r\nour own game console." And thus, the original\r\nPlayStation was born. It\'s hard to overstate just\r\nhow good of an idea that was. The PlayStation was the\r\nvery first game console to ever hit 100 million sales, and after being sold for ten years, the games continue to come\r\nout for the PlayStation all the way up until 2006. You know, when the PlayStation 3 came out. Now, I have a special\r\nattachment to the PlayStation, as this was my very first game console. However, it actually\r

In [None]:
# View the first 500 chars in the 50th Document

docs_ls[49][:500]

"the best way to convince your friends you're a good web developer is to put fancy animations on your homepage and the best way to do that is to reverse engineer other websites like this animated loading sequence on the gatsby home page open chrome dev tools then hit ctrl p to bring up the animation panel it automatically records every css animation on the page allowing you to visualize and modify the keyframes now double-click to highlight an element in the dom if necessary you can right-click a"

In [None]:
# View the last 100 chars in the last Document

docs_ls[-1][-100:]

" possible you all rock\xa0\xa0 that's all for now thanks so much for\xa0\r\nwatching and i'll see you next time"

In [None]:
len(docs_ls)

267

## Pre-process and vectorize the documents

As part of preprocessing, we will:

* Tokenize (split the documents into tokens).
* Lemmatize the tokens.
* Compute bigrams.
* Compute a bag-of-words representation of the data.

First we tokenize the text using a regular expression tokenizer from NLTK. We
remove numeric tokens and tokens that are only a single character, as they
don't tend to be useful, and the dataset contains a lot of them.

.. Important::

   This tutorial uses the nltk library for preprocessing, although you can
   replace it with something else if you want.




In [None]:
type(docs_ls)

list

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# STOPWORDS = STOPWORDS + ["bazinga", "woohoo"]

In [None]:
'the' in STOPWORDS

True

In [None]:
len(yt_tech_str)

3519767

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

yt_token_ls = []

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
tokens_ls = tokenizer.tokenize(yt_tech_str)
# tokenize(YTTechTest)
for i, atoken in enumerate(tokens_ls):
  atoken_clean = atoken.strip().lower()
  if atoken_clean in STOPWORDS:
    # print(f'Skipping Token #{i}: [{atoken}]')
    continue
  else:
    yt_token_ls.append(atoken_clean)  # Convert to lowercase.

# Remove numbers, but not words that contain numbers.
# docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
# docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [None]:
# Alternative approach to remove STOPWORDS

tokenizer = RegexpTokenizer(r'\w+')
tokens_ls = tokenizer.tokenize(yt_tech_str)

tokens_set = set(tokens_ls)
stopwords_set = set(STOPWORDS)

tokens_nostop_set = tokens_set.difference(stopwords_set)

print(f'Original Token Set: {len(tokens_set)}')

print(f'STOPWORD Token Set: {len(stopwords_set)}')

print(f'Original-STOPWORD Token Set: {len(tokens_nostop_set)}')

Original Token Set: 18340
STOPWORD Token Set: 179
Original-STOPWORD Token Set: 18192


In [None]:
yt_token_ls = list(tokens_nostop_set)

yt_token_ls

['charming',
 'memes',
 'Earlier',
 'comprehend',
 'PlayStations',
 'committing',
 'realistically',
 'terrified',
 'rxjs',
 'Phillips',
 'directives',
 'treadle',
 'guilty',
 'hospitals',
 'walked',
 'relieved',
 'legitimized',
 'enter',
 'cautious',
 'positive',
 'bosses',
 'binge',
 'showdown',
 'aperture',
 'teslas',
 'infringed',
 'appreciation',
 'Preview',
 'Kate',
 'NH',
 'centimeters',
 'sees',
 'hog',
 'huh',
 'Citizen',
 'matched',
 'magnificent',
 'key',
 'redo',
 'valley',
 'copy',
 'Five',
 'crypto',
 'boat',
 'address',
 '502s',
 'chuckle',
 'Factory',
 'accessible360',
 'Anyways',
 'spoil',
 'mucked',
 'unchecked',
 'chi',
 'fart',
 'OL',
 'doings',
 'bathroom',
 'PowerPoint',
 'JWT',
 'curved',
 'bonus',
 'halfway',
 'brightly',
 'tv',
 'Story',
 'YouTubers',
 'html5',
 'mush',
 'anna',
 'jpg',
 'shields',
 'medical',
 'irreverent',
 'frostbite',
 'steps',
 'ore',
 'gimbal',
 '84',
 'digits',
 'tap',
 'kyle',
 'pressed',
 'winding',
 'murmurs',
 'analytics',
 'storeroom

In [None]:
min(yt_token_ls)

'0'

In [None]:
'the' in yt_token_ls

False

In [None]:
for i,atoken in enumerate(yt_token_ls):
  if (len(atoken) == 3) & atoken.startswith('th'):
    print(f'Token #{i}: {atoken}')

Token #7503: thx
Token #7825: tho


We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a
stemmer in this case because it produces more readable words. Output that is
easy to read is very desirable in topic modelling.




In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
type(yt_token_ls)

list

In [None]:
yt_token_ls[0]

'charming'

In [None]:
yt_token_ls[:5]

['charming', 'memes', 'Earlier', 'comprehend', 'PlayStations']

In [None]:
# Clean

docs_clean_ls = []

for asent in yt_token_ls:
  asent_clean = asent.strip().lower()
  asent_clean_ls = []
  for aword in asent_clean.split():
    if (aword.isalpha()) & (not(aword in STOPWORDS)):
      asent_clean_ls.append(aword)
  asent_clean_str = ' '.join(asent_clean_ls)
  docs_clean_ls.append(asent_clean_str)

docs_clean_ls[:5]


['charming', 'memes', 'earlier', 'comprehend', 'playstations']

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
%%time

# NOTE: 0m24s @03:48 on 20220228 Colab Pro 

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# docs = [lemmatizer.lemmatize(token) for token in doc for doc in docs]

docs_lemma_ls = []
for i, adoc in enumerate(docs_clean_ls):
  adoc_lemma_ls = [lemmatizer.lemmatize(x).lower() for x in adoc.split()]
  adoc_lemma_str = ' '.join(adoc_lemma_ls)
  docs_lemma_ls.append(adoc_lemma_str)

# Verify
docs_lemma_ls[:5]

CPU times: user 280 ms, sys: 13.1 ms, total: 293 ms
Wall time: 293 ms


In [None]:
docs_lemma_ls[:20]

['charming',
 'meme',
 'earlier',
 'comprehend',
 'playstations',
 'committing',
 'realistically',
 'terrified',
 'rxjs',
 'phillips',
 'directive',
 'treadle',
 'guilty',
 'hospital',
 'walked',
 'relieved',
 'legitimized',
 'enter',
 'cautious',
 'positive']

In [None]:
# docs = [lemmatizer.lemmatize(token) for token in doc for doc in docs]

We find bigrams in the documents. Bigrams are sets of two adjacent words.
Using bigrams we can get phrases like "machine_learning" in our output
(spaces are replaced with underscores); without bigrams we would only get
"machine" and "learning".

Note that in the code below, we find bigrams and then add them to the
original data, because we would like to keep the words "machine" and
"learning" as well as the bigram "machine_learning".

.. Important::
    Computing n-grams of large dataset can be very computationally
    and memory intensive.




In [None]:
type(docs_lemma_ls)

list

In [None]:
len(docs_lemma_ls)

18192

In [None]:
docs_lemma_ls[0]

'charming'

In [None]:
docs_lemma_ls[10]

'directive'

In [None]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs_lemma_ls, min_count=20)
for idx in range(len(docs_lemma_ls)):
  for token in bigram[docs_lemma_ls[idx]]:
    if '_' in token:
      # Token is a bigram, add to document.
      docs_lemma_ls[idx] = docs_lemma_ls[idx] + token

2022-05-13 21:41:55,928 : INFO : collecting all words and their counts
2022-05-13 21:41:55,934 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-05-13 21:41:56,035 : INFO : PROGRESS: at sentence #10000, processed 63065 words and 597 word types
2022-05-13 21:41:56,129 : INFO : collected 626 word types from a corpus of 114943 words (unigram + bigrams) and 18192 sentences
2022-05-13 21:41:56,132 : INFO : using 626 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [None]:
top_n = len(docs_lemma_ls)

for i in range(top_n):
  print(f'doc #{i}: {docs_lemma_ls[i][-20:]}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
doc #13192: endeavor
doc #13193: gt
doc #13194: cooling
doc #13195: planet
doc #13196: squeezing
doc #13197: controversy
doc #13198: jamal
doc #13199: yep
doc #13200: lego
doc #13201: transform
doc #13202: berg
doc #13203: unaware
doc #13204: stand
doc #13205: kill
doc #13206: unlike
doc #13207: invoice
doc #13208: mentor
doc #13209: fry
doc #13210: recyclable
doc #13211: clearance
doc #13212: twice
doc #13213: timed
doc #13214: analogy
doc #13215: true
doc #13216: enclosure
doc #13217: library
doc #13218: real
doc #13219: exposed
doc #13220: packin
doc #13221: sonic
doc #13222: purse
doc #13223: motherboards
doc #13224: original
doc #13225: greaseproof
doc #13226: teased
doc #13227: tea
doc #13228: george
doc #13229: eclipse
doc #13230: 
doc #13231: lens
doc #13232: orange
doc #13233: mister
doc #13234: 
doc #13235: mm
doc #13236: philip
doc #13237: discrimination
doc #13238: agency
doc #13239: adjustment
doc #13240: spe

We remove rare words and common words based on their *document frequency*.
Below we remove words that appear in less than 20 documents or in more than
50% of the documents. Consider trying to remove words only based on their
frequency, or maybe combining that with this approach.




In [None]:
docs_lemma_ls[:5]

['charming', 'meme', 'earlier', 'comprehend', 'playstations']

In [None]:
# Remove rare and common tokens.

from gensim import corpora
from gensim.utils import simple_preprocess
dictionary = corpora.Dictionary()
# from gensim.corpora import Dictionary

docs_tokenized = [simple_preprocess(doc) for doc in docs_lemma_ls]
corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in docs_tokenized]
# Create a dictionary representation of the documents.
# dictionary = Dictionary(docs_lemma_ls)
print(corpus)
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
# dictionary.filter_extremes(no_below=20, no_above=0.5)

[[(0, 1)], [(1, 1)], [(2, 1)], [(3, 1)], [(4, 1)], [(5, 1)], [(6, 1)], [(7, 1)], [(8, 1)], [(9, 1)], [(10, 1)], [(11, 1)], [(12, 1)], [(13, 1)], [(14, 1)], [(15, 1)], [(16, 1)], [(17, 1)], [(18, 1)], [(19, 1)], [(20, 1)], [(21, 1)], [(22, 1)], [(23, 1)], [(24, 1)], [(25, 1)], [(26, 1)], [(27, 1)], [(28, 1)], [(29, 1)], [(30, 1)], [(31, 1)], [(32, 1)], [(33, 1)], [(34, 1)], [(35, 1)], [(36, 1)], [(37, 1)], [(38, 1)], [(39, 1)], [(40, 1)], [(41, 1)], [(42, 1)], [(43, 1)], [(44, 1)], [], [(45, 1)], [(46, 1)], [], [(47, 1)], [(48, 1)], [(49, 1)], [(50, 1)], [(51, 1)], [(52, 1)], [(53, 1)], [(54, 1)], [(55, 1)], [(56, 1)], [(57, 1)], [(58, 1)], [(59, 1)], [(60, 1)], [(61, 1)], [(62, 1)], [(63, 1)], [(64, 1)], [], [(65, 1)], [(66, 1)], [(67, 1)], [(68, 1)], [(69, 1)], [(70, 1)], [(71, 1)], [(72, 1)], [(73, 1)], [(74, 1)], [], [(75, 1)], [(76, 1)], [(77, 1)], [(78, 1)], [(79, 1)], [(80, 1)], [(81, 1)], [(82, 1)], [(83, 1)], [(84, 1)], [(85, 1)], [(86, 1)], [(87, 1)], [(88, 1)], [(89, 1)], [(9

In [None]:
id_words = [[(dictionary[id], count) for id, count in line] for line in corpus]
print(id_words)



Finally, we transform the documents to a vectorized form. We simply compute
the frequency of each word, including the bigrams.




In [None]:
# Bag-of-words representation of the documents.
# corpus = [dictionary.doc2bow(doc) for doc in docs_lemma_ls]

Let's see how many tokens and documents we have to train on.




In [None]:
type(corpus)

list

In [None]:
# Orig 1740

len(corpus)

18192

In [None]:
type(corpus[0])

list

In [None]:
corpus[0][:10]

[(0, 1)]

In [None]:
print('Number of unique tokens: %d' % len(dictionary))  # Orig 1864
print('Number of documents: %d' % len(corpus))          # Orig 1740

Number of unique tokens: 13452
Number of documents: 18192


## Training

We are ready to train the LDA model. We will first discuss how to set some of
the training parameters.

First of all, the elephant in the room: how many topics do I need? There is
really no easy answer for this, it will depend on both your data and your
application. I have used 10 topics here because I wanted to have a few topics
that I could interpret and "label", and because that turned out to give me
reasonably good results. You might not need to interpret all your topics, so
you could use a large number of topics, for example 100.

``chunksize`` controls how many documents are processed at a time in the
training algorithm. Increasing chunksize will speed up training, at least as
long as the chunk of documents easily fit into memory. I've set ``chunksize =
2000``, which is more than the amount of documents, so I process all the
data in one go. Chunksize can however influence the quality of the model, as
discussed in Hoffman and co-authors [2], but the difference was not
substantial in this case.

``passes`` controls how often we train the model on the entire corpus.
Another word for passes might be "epochs". ``iterations`` is somewhat
technical, but essentially it controls how often we repeat a particular loop
over each document. It is important to set the number of "passes" and
"iterations" high enough.

I suggest the following way to choose iterations and passes. First, enable
logging (as described in many Gensim tutorials), and set ``eval_every = 1``
in ``LdaModel``. When training the model look for a line in the log that
looks something like this::

   2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations

If you set ``passes = 20`` you will see this line 20 times. Make sure that by
the final passes, most of the documents have converged. So you want to choose
both passes and iterations to be high enough for this to happen.

We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat
technical, but essentially we are automatically learning two parameters in
the model that we usually would have to specify explicitly.




In [None]:
%%time

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 7
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2022-05-13 21:45:37,066 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2022-05-13 21:45:37,075 : INFO : using serial LDA version on this node
2022-05-13 21:45:37,092 : INFO : running online (multi-pass) LDA training, 7 topics, 20 passes over the supplied corpus of 18192 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2022-05-13 21:45:37,095 : INFO : PROGRESS: pass 0, at document #2000/18192
2022-05-13 21:45:37,380 : INFO : optimized alpha [0.14420363, 0.1435189, 0.14253022, 0.14417566, 0.14492077, 0.14287047, 0.14369063]
2022-05-13 21:45:37,383 : INFO : merging changes from 2000 documents into a model of 18192 documents
2022-05-13 21:45:37,403 : INFO : topic #2 (0.143): 0.004*"security" + 0.004*"back" + 0.004*"delta" + 0.004*"story" + 0.002*"sam" + 0.002*"plasticy" + 0.002*"raytheon" + 0.002*"tool" + 0.

CPU times: user 43.4 s, sys: 829 ms, total: 44.2 s
Wall time: 45.8 s


We can compute the topic coherence of each topic. Below we display the
average topic coherence and print the topics in order of topic coherence.

Note that we use the "Umass" topic coherence measure here (see
:py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently
obtained an implementation of the "AKSW" topic coherence measure (see
accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).

If you are familiar with the subject of the articles in this dataset, you can
see that the topics below make a lot of sense. However, they are not without
flaws. We can see that there is substantial overlap between some topics,
others are hard to interpret, and most of them have at least some terms that
seem out of place. If you were able to do better, feel free to share your
methods on the blog at http://rare-technologies.com/lda-training-tips/ !




In [None]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2022-05-13 21:46:22,851 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2022-05-13 21:46:22,856 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2022-05-13 21:46:22,860 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2022-05-13 21:46:22,866 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2022-05-13 21:46:22,870 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2022-05-13 21:46:22,875 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2022-05-13 21:46:22,880 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2022-05-13 21:46:22,884 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2022-05-13 21:46:22,889 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2022-05-13 21:46:22,893 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2022-05-13 21:46:22,898 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2022-05-13 21:46:22

Average topic coherence: -18.4290.
[([(0.0039659394, 'quantity'),
   (0.0039659394, 'text'),
   (0.0037875967, 'block'),
   (0.003728955, 'dropping'),
   (0.003728955, 'justing'),
   (0.003728955, 'reminded'),
   (0.003728955, 'lightspeed'),
   (0.003728955, 'deceiving'),
   (0.003728955, 'limbo'),
   (0.003728955, 'bonding'),
   (0.003728955, 'fling'),
   (0.003728955, 'manufactured'),
   (0.003728955, 'imager'),
   (0.003728955, 'deferring'),
   (0.003728955, 'andreessen'),
   (0.003728955, 'subsequent'),
   (0.003728955, 'recompiling'),
   (0.003728955, 'beset'),
   (0.003728955, 'singular'),
   (0.003728955, 'kindle')],
  -18.019283624939476),
 ([(0.003955675, 'eh'),
   (0.0039556744, 'addition'),
   (0.0039556744, 'facebook'),
   (0.003897968, 'load'),
   (0.0038276557, 'gasp'),
   (0.0038276555, 'capacitor'),
   (0.0038276555, 'footer'),
   (0.0037287099, 'faq'),
   (0.003670982, 'electricity'),
   (0.0036709816, 'crimp'),
   (0.0036709816, 'reluctant'),
   (0.0036709816, 'mortar

# Visualize

In [None]:
!pip install pyldavis



In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [None]:
# vis_data = gensimvis.prepare(lda, corpus, dictionary)
vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.display(vis_data)