In [2]:
import os
import nltk

# https://stackoverflow.com/questions/51390676/how-to-visualize-pyspark-mls-lda-or-other-clustering

nltk.download('stopwords')
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.sql import SparkSession, functions
from pyspark.ml.feature import Word2Vec,CountVectorizer,Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.functions import col, udf, countDistinct
from pyspark.sql.types import IntegerType,ArrayType,StringType


spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

data_path = "gs://6893finalpruv/2.csv" # Data path for csv file
spark_df = spark.read.csv(data_path, inferSchema = True, header=True) # checking the csv file
spark_df.show()

# Topic Modelling on Title (Potentially do it on description if possible)
node = "Title"
# Get title data, filter out empty nodes
title_data = spark_df.select(node).filter(functions.col(node).isNotNull())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Title|          Title Link|              Author|               Score|        Num Comments|           Subreddit|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TIL the Vice Pres...|https://publicapo...|          douggold11|               22719|                1308|       todayilearned|
|What’s your toxic...|                 N/A|         KingOfLoser|               16369|               10396|           AskReddit|
|Herschel Walker v...|https://www.busin...|        Singing_Wolf|               13530|                1280|            politics|
|Qatari security t...|https://v.redd.it...|    Rollo_Tomasi3000|               85855|                4588|    nextfuckinglevel|
|North Carolina po...|https://v.redd.it...|        FarmSuch5021|               28059|                360

In [3]:
df2 = spark_df.select(countDistinct("Subreddit"))
topic_num = df2.first()[0]

In [4]:
tokenizer = Tokenizer(inputCol="Title", outputCol="words")
tokenized = tokenizer.transform(spark_df)
remover = StopWordsRemover(stopWords=stopwords.words('english'), inputCol="words", outputCol="filtered")
result = remover.transform(tokenized)
result.select("filtered").show()

+--------------------+
|            filtered|
+--------------------+
|[til, vice, presi...|
|[what’s, toxic, t...|
|[herschel, walker...|
|[qatari, security...|
|[north, carolina,...|
|[boy, deaf, hears...|
|[declining, globa...|
|[title, isn’t, ev...|
|[guy, tries, catc...|
|[what's, somethin...|
|[joe, biden,, ola...|
|[n.y., disbars, l...|
|[jon, stewart, da...|
|["kevin, durant:,...|
|[qatari, security...|
|[late..., 60, yea...|
|[local, microcent...|
|[3.20, balance, m...|
|[woman, named, el...|
|      [jack, trades]|
+--------------------+
only showing top 20 rows



In [5]:
cv = CountVectorizer(inputCol="filtered", outputCol="features")
cvModel = cv.fit(result)
cvResult = cvModel.transform(result)

In [6]:
lda = LDA(maxIter=20, k = 10)
ldaModel = lda.fit(cvResult)
transformed = ldaModel.transform(cvResult).select("topicDistribution")
transformed.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topicDistribution                                                                                                                                                                                                  |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.007801260590798447,0.008120082929823148,0.007668871639982281,0.9296939115693913,0.007557623830125605,0.007786233189854585,0.008280167862423703,0.007709895070471677,0.007966411323441799,0.007415541993687494]  |
|[0.013430319850476932,0.013980151286126464,0.8788569040844371,0.013308568429154665,0.013010617267165014,0.01340441273919526,0.01425549564653407

In [7]:
ll = ldaModel.logLikelihood(cvResult)
lp = ldaModel.logPerplexity(cvResult)
print("ll: ", ll)
print("lp: ", lp)

ll:  -49755.4549735826
lp:  14.384346624337265


In [8]:
vocab = cvModel.vocabulary
topics = ldaModel.describeTopics()
topics_rdd = topics.rdd

topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
took
away
take
%
work
great
every
14
queen
gotta
*************************
topic: 1
*************************
people
it’s
leave
like
let
finished
trump's
still
don’t
wicked
*************************
topic: 2
*************************
life
covid-19
6
may
consecutive
first
last
killing
injuring
waukesha
*************************
topic: 3
*************************
enough
states
mistake
party
united
random
center
grown
philadelphia
post
*************************
topic: 4
*************************
experience
amendment?
motorcycle
right
100
scale
then...
tries
advantage
taking
*************************
topic: 5
*************************
cup
world
egg_irl
vs
open
watch
mick
chappelle
issues
gordon
*************************
topic: 6
*************************
one
new
local
next
got
character
get
people
almost
house
*************************
topic: 7
*************************
hits
made
longer
white
episode
move
went
black
rtx
4090
*************************
topi

In [15]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit
topics.show()
dist = ldaModel.transform(cvResult)
dist.show()
def ith_(v, i):
    try:
        return float(v[i])
    except ValueError:
        return None
    
ith = udf(ith_, DoubleType())
df = dist.select(["Title"] + [ith("topicDistribution", lit(i)).alias('topic_'+str(i)) for i in range(10)] )
df.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[17, 37, 31, 86, ...|[0.00299084005034...|
|    1|[2, 6, 13, 3, 68,...|[0.00369118202394...|
|    2|[57, 331, 47, 416...|[0.00179393636526...|
|    3|[98, 44, 432, 78,...|[0.00211770316232...|
|    4|[396, 1798, 2427,...|[0.00152531622398...|
|    5|[14, 7, 908, 21, ...|[0.00240239604253...|
|    6|[0, 1, 77, 18, 8,...|[0.00434384712126...|
|    7|[105, 10, 74, 95,...|[0.00189531789125...|
|    8|[40, 119, 141, 12...|[0.00264990718536...|
|    9|[277, 250, 333, 1...|[0.00131767430631...|
+-----+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Title|          Title Link|              Author|               Score|  

In [21]:
import pandas as pd
import numpy as np
df_p = dist.select('topicDistribution').toPandas()
df_p1 = df_p.topicDistribution.apply(lambda x:np.array(x))
df_p2 = pd.DataFrame(df_p1.tolist()).apply(lambda x:x.argmax(),axis=1)
df_p3 = df_p2.reset_index()
df_p3.columns = ['doc','topic']
df2_p = dist.select('Title').toPandas()
print(df_p3)
print(pd.concat([df2_p, df_p3], axis=1))

     doc  topic
0      0      3
1      1      2
2      2      6
3      3      5
4      4      2
..   ...    ...
494  494      2
495  495      8
496  496      2
497  497      6
498  498      6

[499 rows x 2 columns]
                                                 Title  doc  topic
0    TIL the Vice President of the United States mi...    0      3
1        What’s your toxic trait that you’re proud of?    1      2
2    Herschel Walker veers off speech to rant about...    2      6
3    Qatari security threaten to destory camera dur...    3      5
4    North Carolina police officer holding his K-9 ...    4      2
..                                                 ...  ...    ...
494                     Drill charges kill shields now  494      2
495                  So quick question, are we a cult?  495      8
496  Darrel Brooks sentenced to 6 life sentences, 7...  496      2
497    The new $1 coin released through the mint today  497      6
498  Design character Frankentein inspired (thi