In [125]:
import os
import nltk

# https://stackoverflow.com/questions/51390676/how-to-visualize-pyspark-mls-lda-or-other-clustering

# Make sure the NLTK stop-word corpus is available locally; the English list
# from it is passed to StopWordsRemover further down.
nltk.download('stopwords')
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.sql import SparkSession, functions
from pyspark.ml.feature import Word2Vec,CountVectorizer,Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.functions import col, udf, countDistinct, regexp_replace
from pyspark.sql.types import IntegerType,ArrayType,StringType


# Reuse the active Spark session if there is one, otherwise start a new one
# with the application name and config option given here.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Calendar month names, in order.
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Which monthly dump to process.
month = "December"
year = "2010"

# Relative "/<year>/<month>.csv" suffix shared by the input and output locations.
csv_path = "/{}/{}.csv".format(year, month)

# Data path for csv file
data_path = "../data{}".format(csv_path)

# Topic Modelling on Title (Potentially do it on description if possible)
node = "Title"

# Read the monthly CSV (header row, inferred column types) and strip every
# double-quote character from the Title column.
spark_df = (
    spark.read.csv(data_path, inferSchema=True, header=True)
    .withColumn('Title', regexp_replace('Title', '"', ''))
)

# Title column only, with rows whose title is null removed.
title_data = spark_df.select(node).filter(functions.col(node).isNotNull())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alirahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [126]:
# Number of distinct subreddits in this month's data.
# NOTE(review): topic_num is not referenced by the LDA setup in the visible
# cells (k is hard-coded there) - confirm whether it was meant to drive k.
df2 = spark_df.agg(countDistinct("Subreddit"))
topic_num = df2.head()[0]

In [127]:
# Tokenizer lower-cases and splits the raw string, so a null Title would make
# the job fail at execution time. Drop those rows here (keeping every other
# column) instead of feeding the unfiltered frame into the pipeline.
titled_df = spark_df.filter(col("Title").isNotNull())

# Split each title into lower-cased whitespace-separated tokens.
tokenizer = Tokenizer(inputCol="Title", outputCol="words")
tokenized = tokenizer.transform(titled_df)

# Remove English stop words (NLTK list) from the token arrays.
remover = StopWordsRemover(stopWords=stopwords.words('english'), inputCol="words", outputCol="filtered")
result = remover.transform(tokenized)

In [128]:
# Learn a vocabulary from the stop-word-filtered tokens and turn every
# document into a term-count vector stored in the "features" column.
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
)
cvModel = cv.fit(result)
cvResult = cvModel.transform(result)

In [129]:
# Fit a 10-topic LDA model on the count vectors. A fixed seed makes the
# learned topics (and every file derived from them) reproducible across
# kernel restarts; without it each run yields different topics.
lda = LDA(maxIter=20, k=10, seed=42)
ldaModel = lda.fit(cvResult)

# Per-document topic mixture only (one vector per row).
transformed = ldaModel.transform(cvResult).select("topicDistribution")

In [130]:
import csv

# Vocabulary learned by the CountVectorizer; describeTopics() refers to terms
# by their index into this list.
vocab = cvModel.vocabulary
topics = ldaModel.describeTopics()

# Collect the (small) topic description once and derive both views from it,
# instead of launching a separate job per column.
topic_rows = topics.collect()
topics_words = [[vocab[idx] for idx in row['termIndices']] for row in topic_rows]
topic_weights = [row['termWeights'] for row in topic_rows]

file_path = "../processed_data" + csv_path
# Create the per-year output folder if needed so open() does not fail.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# newline='' is what the csv module expects of the file object; without it
# extra blank lines can appear on some platforms.
with open(file_path, 'w', newline='', encoding='utf-8') as file:
    # Column names now match the order in which values are written. The header
    # previously read term/probability/topic while rows were written as
    # topic/term/probability, so every column was mislabelled. Column
    # positions are unchanged.
    header = ["topic", "term", "probability"]
    writer = csv.writer(file)
    writer.writerow(header)
    for idx, (words, weights) in enumerate(zip(topics_words, topic_weights)):
        for word, weight in zip(words, weights):
            writer.writerow([idx, word, weight])


In [131]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit
# Apply the fitted LDA model to the count vectors; unlike `transformed`, this
# keeps all input columns alongside the added "topicDistribution" column.
dist = ldaModel.transform(cvResult)
def ith_(v, i):
    """Return element ``i`` of vector ``v`` as a Python float.

    Args:
        v: An indexable sequence of numbers (here, a topic-distribution vector).
        i: Zero-based position to extract.

    Returns:
        ``float(v[i])``, or ``None`` when the element cannot be produced:
        ``v`` is ``None`` / not indexable, ``i`` is out of range, or the value
        is not convertible to float.
    """
    try:
        return float(v[i])
    # IndexError: i beyond the vector length; TypeError: v is None (null row);
    # ValueError: element not convertible. Only ValueError was caught before,
    # so the first two would propagate as exceptions.
    except (IndexError, TypeError, ValueError):
        return None
    
# Expose the element accessor as a Spark UDF that yields a double.
ith = udf(ith_, DoubleType())

# Take the column count from the estimator's k rather than repeating the
# literal, so the per-topic columns stay in sync if k is changed.
num_topics = lda.getK()
topic_columns = [
    ith("topicDistribution", lit(i)).alias('topic_' + str(i))
    for i in range(num_topics)
]

# One row per document: its title plus one probability column per topic.
df = dist.select(["Title"] + topic_columns)

In [132]:
import pandas as pd
import numpy as np

# Pull titles and topic distributions to the driver in ONE collect so each
# title is paired with its own distribution by construction. Two separate
# toPandas() calls recompute the pipeline twice and rely on both jobs
# returning rows in the same order.
doc_topics_pd = dist.select('Title', 'topicDistribution').toPandas()

# Dominant topic = index of the largest probability in each distribution.
dominant_topic = doc_topics_pd['topicDistribution'].apply(
    lambda vec: int(np.argmax(vec.toArray()))
)

# Same output layout as before: Title, doc (row position), topic.
final_df = pd.DataFrame({
    'Title': doc_topics_pd['Title'],
    'doc': doc_topics_pd.index,
    'topic': dominant_topic,
})

topic_path = "../document_topics" + csv_path
# Create the per-year output folder if needed so to_csv() does not fail.
os.makedirs(os.path.dirname(topic_path), exist_ok=True)
final_df.to_csv(topic_path, index=False)

22/12/12 00:30:43 WARN DAGScheduler: Broadcasting large task binary with size 1701.6 KiB
