In [2]:
import os
import nltk

# https://stackoverflow.com/questions/51390676/how-to-visualize-pyspark-mls-lda-or-other-clustering

nltk.download('stopwords')
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.sql import SparkSession, functions
from pyspark.ml.feature import Word2Vec,CountVectorizer,Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType,ArrayType,StringType


spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

data_path = "gs://6893finalpruv/2.csv" # Data path for csv file
spark_df = spark.read.csv(data_path, inferSchema = True, header=True) # checking the csv file
spark_df.show()

# Topic Modelling on Title (Potentially do it on description if possible)
node = "Title"
# Get title data, filter out empty nodes
title_data = spark_df.select(node).filter(functions.col(node).isNotNull())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Title|          Title Link|              Author|               Score|        Num Comments|           Subreddit|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TIL the Vice Pres...|https://publicapo...|          douggold11|               22719|                1308|       todayilearned|
|What’s your toxic...|                 N/A|         KingOfLoser|               16369|               10396|           AskReddit|
|Herschel Walker v...|https://www.busin...|        Singing_Wolf|               13530|                1280|            politics|
|Qatari security t...|https://v.redd.it...|    Rollo_Tomasi3000|               85855|                4588|    nextfuckinglevel|
|North Carolina po...|https://v.redd.it...|        FarmSuch5021|               28059|                360

In [13]:
from pyspark.sql.functions import countDistinct
df2 = spark_df.select(countDistinct("Subreddit"))
topics = df2.first()[0]

In [3]:
tokenizer = Tokenizer(inputCol="Title", outputCol="words")
tokenized = tokenizer.transform(spark_df)
remover = StopWordsRemover(stopWords=stopwords.words('english'), inputCol="words", outputCol="filtered")
result = remover.transform(tokenized)
result.select("filtered").show()

+--------------------+
|            filtered|
+--------------------+
|[til, vice, presi...|
|[what’s, toxic, t...|
|[herschel, walker...|
|[qatari, security...|
|[north, carolina,...|
|[boy, deaf, hears...|
|[declining, globa...|
|[title, isn’t, ev...|
|[guy, tries, catc...|
|[what's, somethin...|
|[joe, biden,, ola...|
|[n.y., disbars, l...|
|[jon, stewart, da...|
|["kevin, durant:,...|
|[qatari, security...|
|[late..., 60, yea...|
|[local, microcent...|
|[3.20, balance, m...|
|[woman, named, el...|
|      [jack, trades]|
+--------------------+
only showing top 20 rows



In [5]:
cv = CountVectorizer(inputCol="filtered", outputCol="features")
cvModel = cv.fit(result)
cvResult = cvModel.transform(result)

In [18]:
lda = LDA(maxIter=20, k = 10)
ldaModel = lda.fit(cvResult)
transformed = ldaModel.transform(cvResult).select("topicDistribution")
transformed.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topicDistribution                                                                                                                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.007878400800887522,0.007830576177434635,0.00835470853177807,0.9298195113975268,0.007532009031698612,0.007857037848053503,0.007713179211811139,0.007599726548981892,0.0076548709978702155,0.007759979453957673]     |
|[0.013566497395055612,0.01348384518046326,0.8805761948502596,0.012961576850997636,0.012969623367257425,0.013530114912814868,0.01328

In [19]:
ll = ldaModel.logLikelihood(cvResult)
lp = ldaModel.logPerplexity(cvResult)
print("ll: ", ll)
print("lp: ", lp)

ll:  -49528.10066245525
lp:  14.318618289232509


In [20]:
vocab = cvModel.vocabulary
topics = ldaModel.describeTopics()
topics_rdd = topics.rdd

topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
finished
musk
%
vs
two
long
round
next
trump's
took
*************************
topic: 1
*************************
i’m
one
years
game
every
late.
make
promoting
away
8
*************************
topic: 2
*************************
like
new
possible
office
take
really
getting
released
ice
close
*************************
topic: 3
*************************
shooting
states
united
gemma
,
memorial
3
band
making
map
*************************
topic: 4
*************************
nnn
it’s
horny
participant
low
least
pay
beats
willingness
food,
*************************
topic: 5
*************************
it’s
would
cell
taking
cat
body
this?
brain
party
this.
*************************
topic: 6
*************************
world
cup
disrespectful
report
made
likely
longer
camera
tv
qatari
*************************
topic: 7
*************************
question
home
mom
try
8-year-old
half
mile
son
making
jailed
*************************
topic: 8
*************************
p