# Baseline LDA model
This notebook gives an overview of how to train an LDA model from the Reddit data.

The input is a joined submissions and commends dataframe as produced by `notebooks/bagOfWords_preprocessing_databricks.ipynb`

In [1]:
import ihop.utils
import logging

# TODO Logging should be configurable, but for now just turn it on for Gensim
logging.basicConfig(
    format='%(name)s : %(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

spark = ihop.utils.get_spark_session("baseline lda")

22/02/26 15:57:49 WARN Utils: Your hostname, Kurt resolves to a loopback address: 127.0.1.1; using 192.168.0.11 instead (on interface wlp4s0)
22/02/26 15:57:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/26 15:57:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/26 15:57:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Spark configuration:
[('spark.app.id', 'local-1645909073755'), ('spark.app.startTime', '1645909071006'), ('spark.executor.id', 'driver'), ('spark.app.name', 'baseline lda'), ('spark.driver.memory', '8G'), ('spark.driver.host', '192.168.0.11'), ('spark.driver.port', '44947'), ('spark.sql.warehouse.dir', 'file:/home/virginia/Documents/CenterForDataScience/ZuckermanProj/IHOP/notebooks/spark-warehouse'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.executor.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native'), ('spark.ui.showConsoleProgress', 'true'), ('spark.driver.extraLibraryPath', '/home/virginia/hadoop-3.3.1/lib/native')]


In [2]:
input_data = spark.read.load(
    "../data/bagOfWords/2021-05_to_2021-06_joined_submissions_comments_5percentTopUsersExcludedFromComments_02102022.parquet").limit(500)

                                                                                

In [3]:
input_data.write.parquet("tmp_data")

                                                                                

In [4]:
input_data.count()

                                                                                

500

# Train a simple LDA model using Gensim


In [5]:
import ihop.clustering as ic
import ihop.text_processing as itp

lda_output_dir = "tmp_lda_model"

vectorized_corpus, pipeline = itp.prep_spark_corpus(spark, "tmp_data", output_dir=lda_output_dir)
lda_model = ic.main("lda", vectorized_corpus.get_vectorized_column_iterator(), pipeline.get_id_to_word(), lda_output_dir, {'num_topics':10})



numexpr.utils : 2022-02-26 15:59:04,322 : INFO : NumExpr defaulting to 4 threads.
ihop.text_processing : 2022-02-26 15:59:05,417 : INFO : Prepping corpus for LDA with parameters: {'spark': <pyspark.sql.session.SparkSession object at 0x7fae3091f940>, 'input_corpus_path': 'tmp_data', 'min_time_delta': 3, 'max_time_delta': 259200, 'min_doc_frequency': 0.05, 'max_doc_frequency': 0.95, 'output_dir': 'tmp_lda_model', 'corpus_output_name': 'vectorized_corpus.parquet'}
gensim.models.ldamodel : 2022-02-26 15:59:25,828 : INFO : using asymmetric alpha [0.20349778, 0.15460682, 0.124657474, 0.10442834, 0.08984803, 0.07884031, 0.070235424, 0.06332404, 0.057651002, 0.052910853]
gensim.models.ldamodel : 2022-02-26 15:59:25,832 : INFO : using symmetric eta at 0.1
gensim.models.ldamodel : 2022-02-26 15:59:25,834 : INFO : using serial LDA version on this node
ihop.clustering : 2022-02-26 15:59:25,837 : INFO : Training model lda
gensim.models.ldamulticore : 2022-02-26 15:59:27,403 : INFO : running online 

Performance metrics: {'Coherence': -3.917943434471193}


ihop.clustering : 2022-02-26 15:59:35,661 : INFO : Saving topic keywords to CSV tmp_lda_model/words.csv


In [6]:
lda_model.get_top_words_as_dataframe()

Unnamed: 0,topic_id,top_terms
0,0,to a my this just you is in me not one really ...
1,1,a the and to of it i no you is with in on me f...
2,2,i'm the it in post a you for see i of that but...
3,3,the is and to of a it in was i that my with us...
4,4,for having real best far looks good so a my to...
5,5,i and to the a my that it you in is so of was ...
6,6,the to i and a you it of that for is in this m...
7,7,the to just it's be 6 about of no that's as wa...
8,8,i and a the to it for just in all be of we tha...
9,9,and your the i you big what for to that that’s...


In [7]:
print(lda_model.get_metrics())

{'Coherence': -3.917943434471193}


In [8]:
lda_model.get_term_topics('i')

[(5, 0.05769345),
 (6, 0.04360495),
 (8, 0.039964624),
 (1, 0.025803326),
 (3, 0.018116003),
 (9, 0.0172091)]

In [9]:
lda_model.get_parameters()

{'model_name': 'lda',
 'num_topics': 10,
 'decay': 0.5,
 'offset': 1.0,
 'iterations': 1000}

In [10]:
id_corpus_iter = vectorized_corpus.get_vectorized_column_iterator(use_id_col=True)
doc_topics = lda_model.get_topic_assignments(id_corpus_iter)
print(len(doc_topics))
doc_topics

99


{'n27gmk': [(1, 0.9893782)],
 'n26pnc': [(4, 0.011231296),
  (3, 0.013056133),
  (2, 0.015589058),
  (1, 0.01940883),
  (0, 0.02571095),
  (9, 0.88124675)],
 'n2847x': [(5, 0.9974772)],
 'n27p5v': [(0, 0.011374701), (6, 0.9482025)],
 'n28a3x': [(1, 0.011984165), (0, 0.9386227)],
 'n280bz': [(0, 0.011614121), (6, 0.9479444)],
 'n27qbg': [(1, 0.47262216), (5, 0.49075827)],
 'n26zl0': [(6, 0.9970454)],
 'n26rob': [(0, 0.014666959), (1, 0.93947643)],
 'n2796a': [(7, 0.010554121),
  (6, 0.011706071),
  (5, 0.013140466),
  (4, 0.014974889),
  (3, 0.017410312),
  (2, 0.020794688),
  (0, 0.03526347),
  (1, 0.85772884)],
 'n270nt': [(6, 0.98781824)],
 'n27se1': [(5, 0.9921953)],
 'n27lbf': [(6, 0.010035667),
  (5, 0.011265067),
  (4, 0.012837737),
  (3, 0.014926754),
  (2, 0.017821835),
  (0, 0.029579964),
  (1, 0.8786877)],
 'n26rtn': [(6, 0.9858348)],
 'n28bfz': [(8, 0.98342234)],
 'n274qm': [(0, 0.011042604), (1, 0.9551651)],
 'n26unv': [(6, 0.98974276)],
 'n26qdl': [(1, 0.011973957), (0, 0.

In [11]:
subreddits = vectorized_corpus.document_dataframe.select('id', 'subreddit').toPandas()
topics_df = lda_model.get_cluster_results_as_df(doc_col_name="id", join_df = subreddits)
topics_df

                                                                                

Unnamed: 0,id,lda,probability,subreddit
0,n27gmk,1,0.989378,ar15
1,n26pnc,4,0.011231,osugame
2,n26pnc,3,0.013056,osugame
3,n26pnc,2,0.015589,osugame
4,n26pnc,1,0.019408,osugame
...,...,...,...,...
227,n27pbs,0,0.040896,ethtrader
228,n27pbs,8,0.811070,ethtrader
229,n28az2,5,0.995976,classicwow
230,n27qgk,8,0.984219,RandomActsOfMuffDive
