# Baseline LDA model
This notebook gives an overview of how to train an LDA model from the Reddit data.

The input is a joined submissions and commends dataframe as produced by `notebooks/bagOfWords_preprocessing_databricks.ipynb`

In [1]:
import ihop.utils
spark = ihop.utils.get_spark_session("baseline lda")

input_data = spark.read.load(
    "../data/bagOfWords/2021-05_to_2021-06_joined_submissions_comments_5percentTopUsersExcludedFromComments_02102022.parquet").limit(200)


22/02/22 13:43:27 WARN Utils: Your hostname, Kurt resolves to a loopback address: 127.0.1.1; using 192.168.0.11 instead (on interface wlp4s0)
22/02/22 13:43:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/22 13:43:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/22 13:43:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Spark configuration:
[('spark.app.startTime', '1645555408216'), ('spark.driver.port', '35221'), ('spark.app.id', 'local-1645555410249'), ('spark.executor.id', 'driver'), ('spark.app.name', 'baseline lda'), ('spark.driver.memory', '8G'), ('spark.driver.host', '192.168.0.11'), ('spark.sql.warehouse.dir', 'file:/home/virginia/Documents/CenterForDataScience/ZuckermanProj/IHOP/notebooks/spark-warehouse'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.executor.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native'), ('spark.ui.showConsoleProgress', 'true'), ('spark.driver.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native')]


                                                                                

In [2]:
input_data.show(5)

                                                                                

+---------+----------+-----------+------+-----+--------------------+--------------------+--------------------+-----------+------------------+-----------+----------+--------------+---------+---------------+--------------------+--------------------+--------------------------+
|subreddit|    author|created_utc|    id|score|            selftext|               title|                 url|fullname_id|comments_subreddit|comments_id| parent_id|comments_score|  link_id|comments_author|                body|comments_created_utc|time_to_comment_in_seconds|
+---------+----------+-----------+------+-----+--------------------+--------------------+--------------------+-----------+------------------+-----------+----------+--------------+---------+---------------+--------------------+--------------------+--------------------------+
| Market76|Apostle-II| 1619827212|n26p5s|    2|Don’t use 5.56 an...|H: 30k 50cal and ...|https://www.reddi...|  t3_n26p5s|          Market76|    gwhjcmt| t3_n26p5s|           

In [3]:
input_data.count()

                                                                                

200

# Train a simple LDA model using Gensim


In [4]:
from datetime import datetime
import ihop.clustering as ic
import ihop.text_processing as itp

# Read in the joined data, collecting all the comments for each submission
# Any desired filtering by time stamps can be done here
corpus = itp.SparkCorpus.init_from_joined_dataframe(input_data)

# Tokenize the document, then create an id to word index and vectorize each document
# This is where you would set minimum and maximum document frequency and minimum term frequency, passed to Spark CountVectorizer
pipeline = itp.SparkTextPreprocessingPipeline('document_text')
transformed = pipeline.fit_transform(corpus.document_dataframe)
vectorized_corpus = itp.SparkCorpus(transformed)


                                                                                

In [5]:
index = pipeline.get_id_to_word()
print("INDEX DETAILS:")
print("Vocab size:", len(index))
for k in range(10):
    print(k, index[k])

INDEX DETAILS:
Vocab size: 11
0 a
1 the
2 i
3 to
4 and
5 of
6 for
7 is
8 you
9 in


In [6]:
vectorized_corpus.document_dataframe.show()

                                                                                

+------+----------------+--------------------+--------------------+--------------------+
|    id|       subreddit|       document_text|           tokenized|          vectorized|
+------+----------------+--------------------+--------------------+--------------------+
|n26uxd|onlyfansgirls101|Cum visit and cha...|[cum, visit, and,...|(11,[4,5],[1.0,1.0])|
|n26p5s|        Market76|H: 30k 50cal and ...|[h, 30k, 50cal, a...|(11,[0,1,2,3,4,5,...|
|n2725q|PvZGardenWarfare|hey black jews  Nice|[hey, black, jews...|          (11,[],[])|
|n27gmk|            ar15|Magazine markings...|[magazine, markin...|(11,[0,1,2,3,4,5,...|
|n272yp|          rule34|Bulma - lonely mi...|[bulma, lonely, m...|(11,[0,7,10],[1.0...|
|n26rtn|       AskReddit|What is always be...|[what, is, always...|(11,[0,1,2,3,4,5,...|
|n26unv|        Columbus|Lost cat in the S...|[lost, cat, in, t...|(11,[0,1,3,4,5,6,...|
|n26z8u|      deathgrips|GUYS THEY ARE STI...|[guys, they, are,...|(11,[0,1,2,5,7,10...|
|n274xr|     foxholeg

In [7]:
corpus_iterator = vectorized_corpus.get_vectorized_column_iterator()

In [8]:
lda_model = ic.GensimLDAModel(corpus_iterator, "sample_lda", index, num_topics=10)

print("Starting training at", datetime.now())
lda_model.train()

gensim.models.ldamodel : 2022-02-22 13:45:16,254 : INFO : using asymmetric alpha [0.20349778, 0.15460682, 0.124657474, 0.10442834, 0.08984803, 0.07884031, 0.070235424, 0.06332404, 0.057651002, 0.052910853]
gensim.models.ldamodel : 2022-02-22 13:45:16,256 : INFO : using symmetric eta at 0.1
gensim.models.ldamodel : 2022-02-22 13:45:16,257 : INFO : using serial LDA version on this node


Starting training at 2022-02-22 13:45:16.265508


gensim.models.ldamulticore : 2022-02-22 13:45:17,926 : INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 38 documents, updating every 6000 documents, evaluating every ~38 documents, iterating 1000x with a convergence threshold of 0.001000
gensim.models.ldamulticore : 2022-02-22 13:45:17,929 : INFO : training LDA model using 3 processes
gensim.models.ldamulticore : 2022-02-22 13:45:48,816 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #38/38, outstanding queue size 1
gensim.models.ldamodel : 2022-02-22 13:45:49,124 : INFO : topic #9 (0.053): 0.235*"i" + 0.145*"the" + 0.130*"and" + 0.116*"a" + 0.112*"to" + 0.058*"is" + 0.054*"for" + 0.047*"of" + 0.047*"in" + 0.029*"you"
gensim.models.ldamodel : 2022-02-22 13:45:49,126 : INFO : topic #8 (0.058): 0.091*"a" + 0.091*"the" + 0.091*"to" + 0.091*"i" + 0.091*"and" + 0.091*"you" + 0.091*"for" + 0.091*"of" + 0.091*"this" + 0.091*"is"
gensim.models.ldamodel : 2022-02-22 13:45:49,128 : INFO : topic

In [9]:
lda_model.get_top_words_as_dataframe()

Unnamed: 0,topic_id,top_terms
0,0,and to the is you i in a of for this
1,1,the a for i to you and this in of is
2,2,a to and the you i for this of in is
3,3,a to the and of you this for i is in
4,4,a i to the of and this for you in is
5,5,a to the and i this you of for in is
6,6,the to and this for of you a i in is
7,7,a the to and i you for of is in this
8,8,a the to i and you for of this is in
9,9,i the and a to is for of in you this


In [10]:
lda_model.get_top_words()

[(0,
  [('and', 0.14881942),
   ('to', 0.14447534),
   ('the', 0.13576007),
   ('is', 0.09643146),
   ('you', 0.087731265),
   ('i', 0.08338675),
   ('in', 0.0746347),
   ('a', 0.07032274),
   ('of', 0.06153272),
   ('for', 0.05283381),
   ('this', 0.044071767)]),
 (1,
  [('the', 0.17007343),
   ('a', 0.17001452),
   ('for', 0.16994862),
   ('i', 0.12872131),
   ('to', 0.12867348),
   ('you', 0.08697742),
   ('and', 0.045754578),
   ('this', 0.045746256),
   ('in', 0.045607455),
   ('of', 0.004241557),
   ('is', 0.0042413757)]),
 (2,
  [('a', 0.09143674),
   ('to', 0.0912339),
   ('and', 0.09104566),
   ('the', 0.0909628),
   ('you', 0.09093136),
   ('i', 0.090870544),
   ('for', 0.09074566),
   ('this', 0.090735145),
   ('of', 0.090728745),
   ('in', 0.090699814),
   ('is', 0.09060963)]),
 (3,
  [('a', 0.09099901),
   ('to', 0.09097387),
   ('the', 0.09094788),
   ('and', 0.09091422),
   ('of', 0.090911366),
   ('you', 0.09089516),
   ('this', 0.09089499),
   ('for', 0.09088799),
   (

In [11]:
print(lda_model.get_metrics())

{'Coherence': -0.2828817524014652}


In [12]:
lda_model.get_term_topics('i')

[(0, 0.08139257),
 (1, 0.11091771),
 (4, 0.14634702),
 (6, 0.0539086),
 (9, 0.23356293)]

In [13]:
lda_model.get_parameters()

{'model_name': 'sample_lda',
 'num_topics': 10,
 'alpha': [0.20349778,
  0.15460682,
  0.124657474,
  0.10442834,
  0.08984803,
  0.07884031,
  0.070235424,
  0.06332404,
  0.057651002,
  0.052910853],
 'eta': [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
 'decay': 0.5,
 'offset': 1.0,
 'iterations': 1000,
 'random_state_seed': <function RandomState.seed>}

In [14]:
id_corpus_iter = vectorized_corpus.get_vectorized_column_iterator(use_id_col=True)
doc_topics = lda_model.get_topic_assigments(id_corpus_iter)
print(len(doc_topics))
doc_topics

38


{'n26uxd': [(9, 0.017636975),
  (8, 0.019217027),
  (7, 0.021108042),
  (6, 0.023412019),
  (5, 0.026280139),
  (4, 0.029951965),
  (3, 0.034809504),
  (2, 0.041552596),
  (1, 0.051575977),
  (0, 0.73445576)],
 'n26p5s': [(0, 0.965333)],
 'n2725q': [(9, 0.052910846),
  (8, 0.057650995),
  (7, 0.063324034),
  (6, 0.07023542),
  (5, 0.0788403),
  (4, 0.08984802),
  (3, 0.10442833),
  (2, 0.12465746),
  (1, 0.1546068),
  (0, 0.20349775)],
 'n27gmk': [(0, 0.29489407), (4, 0.6767942)],
 'n272yp': [(9, 0.013227739),
  (8, 0.014412778),
  (7, 0.01583104),
  (6, 0.017559022),
  (5, 0.019710114),
  (3, 0.026107145),
  (2, 0.031164479),
  (1, 0.03885591),
  (4, 0.28107932),
  (0, 0.54205245)],
 'n26rtn': [(0, 0.4723073), (6, 0.4986023)],
 'n26unv': [(0, 0.9715137)],
 'n26z8u': [(3, 0.010442853),
  (2, 0.012465781),
  (1, 0.01552637),
  (4, 0.33853054),
  (0, 0.5907382)],
 'n274xr': [(6, 0.9687534)],
 'n272kp': [(9, 0.010582193),
  (8, 0.011530223),
  (7, 0.012664834),
  (6, 0.014047332),
  (5, 0

In [15]:
subreddits = corpus.document_dataframe.select('id', 'subreddit').toPandas()
topics_df = lda_model.get_cluster_results_as_df(vocab_col_name="id", join_df = subreddits)
topics_df

                                                                                

Unnamed: 0,id,sample_lda,subreddit
0,n26uxd,9,onlyfansgirls101
1,n26p5s,0,Market76
2,n2725q,9,PvZGardenWarfare
3,n27gmk,0,ar15
4,n272yp,9,rule34
5,n26rtn,0,AskReddit
6,n26unv,0,Columbus
7,n26z8u,3,deathgrips
8,n274xr,6,foxholegame
9,n272kp,9,funny
