# Baseline LDA model
This notebook gives an overview of how to train an LDA model from the Reddit data.

The input is a joined submissions and comments dataframe, as produced by `notebooks/bagOfWords_preprocessing_databricks.ipynb`.

In [11]:
import ihop.utils

# Location of the joined submissions/comments data read by this notebook.
JOINED_DATA_PATH = "../data/bagOfWords/2021-05_to_2021-06_joined_submissions_comments_5percentTopUsersExcludedFromComments_02102022.parquet"
# Number of rows kept from the loaded data for this walkthrough.
SAMPLE_ROWS = 200

# Spark session obtained through the project helper, named "baseline lda".
spark = ihop.utils.get_spark_session("baseline lda")

# Load the joined data and keep only the first SAMPLE_ROWS rows.
input_data = spark.read.load(JOINED_DATA_PATH).limit(SAMPLE_ROWS)


Spark configuration:
[('spark.app.id', 'local-1645464934358'), ('spark.executor.id', 'driver'), ('spark.driver.port', '40065'), ('spark.app.name', 'baseline lda'), ('spark.driver.memory', '8G'), ('spark.driver.host', '192.168.0.11'), ('spark.sql.warehouse.dir', 'file:/home/virginia/Documents/CenterForDataScience/ZuckermanProj/IHOP/notebooks/spark-warehouse'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.executor.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.startTime', '1645464932414'), ('spark.driver.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native')]


                                                                                

In [12]:
# Preview the first five rows of the joined submissions/comments dataframe.
input_data.show(5)



+---------+----------+-----------+------+-----+--------------------+--------------------+--------------------+-----------+------------------+-----------+----------+--------------+---------+---------------+--------------------+--------------------+--------------------------+
|subreddit|    author|created_utc|    id|score|            selftext|               title|                 url|fullname_id|comments_subreddit|comments_id| parent_id|comments_score|  link_id|comments_author|                body|comments_created_utc|time_to_comment_in_seconds|
+---------+----------+-----------+------+-----+--------------------+--------------------+--------------------+-----------+------------------+-----------+----------+--------------+---------+---------------+--------------------+--------------------+--------------------------+
| Market76|Apostle-II| 1619827212|n26p5s|    2|Don’t use 5.56 an...|H: 30k 50cal and ...|https://www.reddi...|  t3_n26p5s|          Market76|    gwhjcmt| t3_n26p5s|           

                                                                                

In [13]:
# Number of rows in the loaded sample (the recorded output of this cell is 200).
input_data.count()

                                                                                

200

# Train a simple LDA model using Gensim


In [18]:
from datetime import datetime
import ihop.clustering as ic
import ihop.text_processing as itp

# Build the corpus from the joined dataframe; all comments belonging to a
# submission are collected together with it.
# Filter by time stamp before this step if only part of the data is wanted.
corpus = itp.SparkCorpus.init_from_joined_dataframe(input_data)

# The pipeline tokenizes each document, builds the id-to-word index and
# vectorizes the documents. Minimum/maximum document frequency and minimum
# term frequency are configured here and passed to Spark CountVectorizer.
# NOTE(review): the saved output of this cell is
# "TypeError: __init__() got an unexpected keyword argument 'maxDF'" --
# confirm the keyword names SparkTextPreprocessingPipeline accepts in the
# installed ihop version before relying on this call.
pipeline = itp.SparkTextPreprocessingPipeline(
    'document_text',
    maxDF=0.95,
    minDF=0.05,
)
transformed = pipeline.fit_transform(corpus.document_dataframe)
vectorized_corpus = itp.SparkCorpus(transformed)


TypeError: __init__() got an unexpected keyword argument 'maxDF'

In [None]:
# Fetch the id -> word index built by the fitted preprocessing pipeline.
index = pipeline.get_id_to_word()
print("INDEX DETAILS:")
print("Vocab size:", len(index))
# Preview at most the first ten ids. Bounding the loop by the vocabulary size
# keeps the lookup from failing when document-frequency filtering leaves
# fewer than ten terms in the index.
for k in range(min(10, len(index))):
    print(k, index[k])

INDEX DETAILS:
Vocab size: 2214
0 a
1 the
2 i
3 que
4 to
5 and
6 de
7 la
8 y
9 no


In [None]:
# Display the transformed documents; the recorded output has the columns
# id, document_text, tokenized and vectorized.
vectorized_corpus.document_dataframe.show()



+------+--------------------+--------------------+--------------------+
|    id|       document_text|           tokenized|          vectorized|
+------+--------------------+--------------------+--------------------+
|n26uxd|Cum visit and cha...|[cum, visit, and,...|(2214,[5,10,13,26...|
|n26p5s|H: 30k 50cal and ...|[h, 30k, 50cal, a...|(2214,[0,1,2,4,5,...|
|n2725q|hey black jews  Nice|[hey, black, jews...|(2214,[180,432,44...|
|n27gmk|Magazine markings...|[magazine, markin...|(2214,[0,1,2,4,5,...|
|n272yp|Bulma - lonely mi...|[bulma, lonely, m...|(2214,[0,14,20,38...|
|n26rtn|What is always be...|[what, is, always...|(2214,[0,1,2,4,5,...|
|n26unv|Lost cat in the S...|[lost, cat, in, t...|(2214,[0,1,4,5,10...|
|n26z8u|GUYS THEY ARE STI...|[guys, they, are,...|(2214,[0,1,2,9,10...|
|n274xr|Wardens pushing i...|[wardens, pushing...|(2214,[0,1,2,4,5,...|
|n272kp|This is my last r...|[this, is, my, la...|(2214,[14,20,24,1...|
|n276vv|This week we rele...|[this, week, we, ...|(2214,[0,1,2,4

                                                                                

In [None]:
# Iterator over the corpus's vectorized column; it is handed to the LDA model
# as its training corpus in the next code cell.
# NOTE(review): whether this iterator can be consumed more than once is not
# visible here -- confirm before reusing it across several models.
corpus_iterator = vectorized_corpus.get_vectorized_column_iterator()



In [None]:
# Set up a 10-topic Gensim LDA model named "sample_lda" over the vectorized
# corpus, using the id-to-word index for the vocabulary.
lda_model = ic.GensimLDAModel(
    corpus_iterator,
    "sample_lda",
    index,
    num_topics=10,
)

# Record when training begins so the Gensim log timestamps can be compared
# against it.
training_start = datetime.now()
print("Starting training at", training_start)
lda_model.train()

gensim.models.ldamodel : 2022-02-21 12:37:34,818 : INFO : using asymmetric alpha [0.20349778, 0.15460682, 0.124657474, 0.10442834, 0.08984803, 0.07884031, 0.070235424, 0.06332404, 0.057651002, 0.052910853]
gensim.models.ldamodel : 2022-02-21 12:37:34,821 : INFO : using symmetric eta at 0.1
gensim.models.ldamodel : 2022-02-21 12:37:34,822 : INFO : using serial LDA version on this node


Starting training at 2022-02-21 12:37:34.831240


gensim.models.ldamulticore : 2022-02-21 12:37:37,284 : INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 38 documents, updating every 6000 documents, evaluating every ~38 documents, iterating 1000x with a convergence threshold of 0.001000
gensim.models.ldamulticore : 2022-02-21 12:37:37,290 : INFO : training LDA model using 3 processes
gensim.models.ldamulticore : 2022-02-21 12:37:38,046 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #38/38, outstanding queue size 1
gensim.models.ldamodel : 2022-02-21 12:37:38,779 : INFO : topic #9 (0.053): 0.035*"que" + 0.027*"de" + 0.023*"la" + 0.021*"y" + 0.019*"a" + 0.016*"el" + 0.015*"en" + 0.014*"no" + 0.014*"es" + 0.013*"vida"
gensim.models.ldamodel : 2022-02-21 12:37:38,783 : INFO : topic #8 (0.058): 0.000*"to" + 0.000*"the" + 0.000*"and" + 0.000*"is" + 0.000*"i" + 0.000*"a" + 0.000*"with" + 0.000*"it" + 0.000*"jews" + 0.000*"that"
gensim.models.ldamodel : 2022-02-21 12:37:38,795 : INFO : topi

In [None]:
# Tabular summary of the trained topics; the recorded output has one row per
# topic with the columns topic_id and top_terms.
lda_model.get_top_words_as_dataframe()

Unnamed: 0,topic_id,top_terms
0,0,que la de a y es lo no vida en el te me una se...
1,1,i the to a and of that in this my for is you w...
2,2,a i me the is please us for feedback thanks to...
3,3,que de la y en no a el es vida te lo una un to...
4,4,the and you to scripts with can for these toy ...
5,5,and the a to i is que for it in la de me of th...
6,6,you the of a for i it your to mags do in if an...
7,7,melatonin sending virtual lt;/3 in lt;3 hrs 32...
8,8,i a to and the is in nice just have with it th...
9,9,que de la y a el en no es vida lo un una si te...


In [None]:
# Top words per topic; the recorded output is a list of
# (topic_id, [(word, score), ...]) pairs.
lda_model.get_top_words()

[(0,
  [('que', 0.033466827),
   ('la', 0.021855572),
   ('de', 0.01814427),
   ('a', 0.017571624),
   ('y', 0.015609357),
   ('es', 0.0156065235),
   ('lo', 0.013540798),
   ('no', 0.013000362),
   ('vida', 0.012253153),
   ('en', 0.012138069),
   ('el', 0.0115247555),
   ('te', 0.010641693),
   ('me', 0.010159906),
   ('una', 0.008095273),
   ('sentido', 0.007598698),
   ('un', 0.0074862987),
   ('pero', 0.007056793),
   ('mi', 0.0063557494),
   ('por', 0.0061084973),
   ('si', 0.0056476262)]),
 (1,
  [('i', 0.038976073),
   ('the', 0.03282222),
   ('to', 0.028178835),
   ('a', 0.022003897),
   ('and', 0.020558286),
   ('of', 0.01441212),
   ('that', 0.010915627),
   ('in', 0.01048467),
   ('this', 0.009951461),
   ('my', 0.009497926),
   ('for', 0.0092457),
   ('is', 0.008750445),
   ('you', 0.008525668),
   ('was', 0.00812873),
   ('just', 0.00785081),
   ('have', 0.0075670634),
   ('it', 0.0075492915),
   ('be', 0.006701196),
   ('with', 0.0059494213),
   ('me', 0.005636731)]),
 (