In [18]:
import os
import nltk

# https://stackoverflow.com/questions/51390676/how-to-visualize-pyspark-mls-lda-or-other-clustering

nltk.download('stopwords')
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.sql import SparkSession, functions
from pyspark.ml.feature import Word2Vec,CountVectorizer,Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.functions import col, udf, countDistinct, regexp_replace
from pyspark.sql.types import IntegerType,ArrayType,StringType
import pandas as pd
import numpy as np
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit
import csv

def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or ``None`` if unavailable.

    Args:
        v: An indexable sequence or vector of numeric values. May be ``None``.
        i: Zero-based position to read from ``v``.

    Returns:
        ``float(v[i])``, or ``None`` when ``v`` is ``None``, ``i`` is out of
        range, or the element cannot be converted to a float.
    """
    try:
        return float(v[i])
    # IndexError: i is past the end of v.  TypeError: v (or v[i]) is None.
    # ValueError: the element is not a numeric value.
    except (ValueError, IndexError, TypeError):
        return None

# Build (or fetch the already-running) Spark session used below to read the
# monthly CSV files.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Narrow either list to a single entry (e.g. ["November"], ["2010"]) for a
# quick test run.
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
years = ["2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"]

NUM_TOPICS = 10      # k passed to LDA
LDA_MAX_ITER = 20    # maxIter passed to LDA
TITLE_COLUMN = "Title"
# Characters stripped from titles before tokenizing:  " - . , ? ! / \ ' :
# One character class replaces the ten chained regexp_replace calls; removing
# a set of single characters gives the same result in one pass.
TITLE_PUNCTUATION = r"""["\-.,?!/\\':]"""
ENGLISH_STOPWORDS = stopwords.words('english')


def _output_path(root, year, filename):
    """Return ``root/year/filename``, creating ``root/year`` if it is missing."""
    year_dir = os.path.join(root, year)
    os.makedirs(year_dir, exist_ok=True)
    return os.path.join(year_dir, filename)


def _load_titles(year, month):
    """Read ``../data/<year>/<month>.csv`` and return rows with a cleaned, non-null Title."""
    data_path = os.path.join("../data", year, month + ".csv")
    posts = spark.read.csv(data_path, inferSchema=True, header=True)
    posts = posts.withColumn(
        TITLE_COLUMN, regexp_replace(TITLE_COLUMN, TITLE_PUNCTUATION, ''))
    # The original script built this filter but never used its result, so rows
    # with a missing title were still passed on to the tokenizer.
    return posts.filter(col(TITLE_COLUMN).isNotNull())


def _fit_topics(posts):
    """Tokenize titles, drop stop words, count terms and fit an LDA model.

    Returns:
        A ``(vocabulary, lda_model, distribution_df)`` tuple, where
        ``distribution_df`` is ``posts`` with the pipeline's intermediate
        columns plus the ``topicDistribution`` column added by the LDA model.
    """
    tokenized = Tokenizer(inputCol=TITLE_COLUMN, outputCol="words").transform(posts)
    remover = StopWordsRemover(
        stopWords=ENGLISH_STOPWORDS, inputCol="words", outputCol="filtered")
    filtered = remover.transform(tokenized)

    cv_model = CountVectorizer(inputCol="filtered", outputCol="features").fit(filtered)
    vectorized = cv_model.transform(filtered)

    lda_model = LDA(maxIter=LDA_MAX_ITER, k=NUM_TOPICS).fit(vectorized)
    return cv_model.vocabulary, lda_model, lda_model.transform(vectorized)


def _topic_term_rows(vocab, lda_model):
    """Return ``[term, probability, topic]`` rows for each topic's top terms."""
    rows = []
    # Collect once and resolve term indices on the driver, instead of running
    # two separate RDD jobs over the same describeTopics() result.
    for topic_idx, topic in enumerate(lda_model.describeTopics().collect()):
        for term_idx, weight in zip(topic['termIndices'], topic['termWeights']):
            rows.append([vocab[term_idx], weight, topic_idx])
    return rows


def _write_topic_terms(rows, year, month):
    """Write the topic/term rows to ``../processed_data/<year>/<month>.csv`` and ``.json``."""
    header = ["term", "probability", "topic"]
    csv_file = _output_path("../processed_data", year, month + ".csv")
    # newline='' is what the csv module requires so it controls line endings.
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

    # Build the JSON from the in-memory rows rather than re-reading the CSV:
    # pd.read_csv would turn terms such as "", "null" or "nan" into NaN.
    json_file = _output_path("../processed_data", year, month + ".json")
    pd.DataFrame(rows, columns=header).to_json(json_file, orient="table")


def _write_document_topics(dist, year, month):
    """Write each title with its most probable topic to ``../document_topics/<year>/<month>.csv``."""
    # Fetch both columns in a single collect so every title stays paired with
    # its own distribution; two separate toPandas() calls joined by position
    # do not guarantee that.
    docs = dist.select(TITLE_COLUMN, "topicDistribution").toPandas()
    dominant = [int(np.argmax(vec.toArray())) for vec in docs["topicDistribution"]]
    final_df = pd.DataFrame({
        TITLE_COLUMN: docs[TITLE_COLUMN],
        "doc": range(len(docs)),   # running document number, as before
        "topic": dominant,
    })
    topic_path = _output_path("../document_topics", year, month + ".csv")
    final_df.to_csv(topic_path, index=False)


# Topic modelling on Title, one independent LDA model per (year, month) file.
for year in years:
    for month in months:
        posts = _load_titles(year, month)
        vocab, lda_model, dist = _fit_topics(posts)
        _write_topic_terms(_topic_term_rows(vocab, lda_model), year, month)
        _write_document_topics(dist, year, month)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alirahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


22/12/13 00:09:46 WARN DAGScheduler: Broadcasting large task binary with size 1473.8 KiB


[Stage 778:>                                                        (0 + 1) / 1]                                                                                

22/12/13 00:09:50 WARN DAGScheduler: Broadcasting large task binary with size 1383.3 KiB
22/12/13 00:09:55 WARN DAGScheduler: Broadcasting large task binary with size 1483.4 KiB


[Stage 894:>                                                        (0 + 1) / 1]                                                                                

22/12/13 00:10:00 WARN DAGScheduler: Broadcasting large task binary with size 1425.1 KiB


[Stage 952:>                                                        (0 + 1) / 1]                                                                                

22/12/13 00:10:04 WARN DAGScheduler: Broadcasting large task binary with size 1432.8 KiB


[Stage 1010:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:10:09 WARN DAGScheduler: Broadcasting large task binary with size 1406.8 KiB
22/12/13 00:10:13 WARN DAGScheduler: Broadcasting large task binary with size 1432.1 KiB
22/12/13 00:10:18 WARN DAGScheduler: Broadcasting large task binary with size 1397.8 KiB


[Stage 1184:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:10:22 WARN DAGScheduler: Broadcasting large task binary with size 1355.4 KiB
22/12/13 00:10:27 WARN DAGScheduler: Broadcasting large task binary with size 1367.3 KiB


[Stage 1300:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:10:31 WARN DAGScheduler: Broadcasting large task binary with size 1305.8 KiB


[Stage 1358:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:10:36 WARN DAGScheduler: Broadcasting large task binary with size 1299.3 KiB


[Stage 1416:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:10:40 WARN DAGScheduler: Broadcasting large task binary with size 1277.5 KiB
22/12/13 00:10:44 WARN DAGScheduler: Broadcasting large task binary with size 1189.0 KiB
22/12/13 00:10:48 WARN DAGScheduler: Broadcasting large task binary with size 1314.0 KiB
22/12/13 00:10:52 WARN DAGScheduler: Broadcasting large task binary with size 1239.8 KiB
22/12/13 00:10:56 WARN DAGScheduler: Broadcasting large task binary with size 1257.5 KiB
22/12/13 00:11:00 WARN DAGScheduler: Broadcasting large task binary with size 1177.1 KiB
22/12/13 00:11:04 WARN DAGScheduler: Broadcasting large task binary with size 1221.9 KiB
22/12/13 00:11:08 WARN DAGScheduler: Broadcasting large task binary with size 1184.2 KiB
22/12/13 00:11:12 WARN DAGScheduler: Broadcasting large task binary with size 1114.2 KiB
22/12/13 00:11:16 WARN DAGScheduler: Broadcasting large task binary with size 1122.8 KiB
22/12/13 00:11:20 WARN DAGScheduler: Broadcasting large task binary with size 1114.2 KiB
22/12/13 00:11:23 WAR

[Stage 4084:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:13:40 WARN DAGScheduler: Broadcasting large task binary with size 1219.8 KiB
22/12/13 00:13:44 WARN DAGScheduler: Broadcasting large task binary with size 1230.0 KiB
22/12/13 00:13:49 WARN DAGScheduler: Broadcasting large task binary with size 1280.5 KiB


[Stage 4258:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:13:53 WARN DAGScheduler: Broadcasting large task binary with size 1225.5 KiB
22/12/13 00:13:57 WARN DAGScheduler: Broadcasting large task binary with size 1324.2 KiB


[Stage 4374:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:14:01 WARN DAGScheduler: Broadcasting large task binary with size 1265.0 KiB
22/12/13 00:14:06 WARN DAGScheduler: Broadcasting large task binary with size 1297.6 KiB
22/12/13 00:14:10 WARN DAGScheduler: Broadcasting large task binary with size 1262.0 KiB
22/12/13 00:14:14 WARN DAGScheduler: Broadcasting large task binary with size 1308.9 KiB
22/12/13 00:14:18 WARN DAGScheduler: Broadcasting large task binary with size 1336.8 KiB


[Stage 4664:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:14:22 WARN DAGScheduler: Broadcasting large task binary with size 1322.3 KiB
22/12/13 00:14:26 WARN DAGScheduler: Broadcasting large task binary with size 1310.2 KiB


[Stage 4780:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:14:30 WARN DAGScheduler: Broadcasting large task binary with size 1292.9 KiB


[Stage 4838:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:14:35 WARN DAGScheduler: Broadcasting large task binary with size 1336.8 KiB
22/12/13 00:14:39 WARN DAGScheduler: Broadcasting large task binary with size 1336.8 KiB
22/12/13 00:14:43 WARN DAGScheduler: Broadcasting large task binary with size 1308.7 KiB
22/12/13 00:14:47 WARN DAGScheduler: Broadcasting large task binary with size 1336.2 KiB
22/12/13 00:14:51 WARN DAGScheduler: Broadcasting large task binary with size 1352.6 KiB
22/12/13 00:14:55 WARN DAGScheduler: Broadcasting large task binary with size 1334.8 KiB
22/12/13 00:14:59 WARN DAGScheduler: Broadcasting large task binary with size 1304.6 KiB
22/12/13 00:15:03 WARN DAGScheduler: Broadcasting large task binary with size 1296.3 KiB
22/12/13 00:15:07 WARN DAGScheduler: Broadcasting large task binary with size 1325.2 KiB


[Stage 5360:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:15:11 WARN DAGScheduler: Broadcasting large task binary with size 1290.8 KiB
22/12/13 00:15:15 WARN DAGScheduler: Broadcasting large task binary with size 1281.8 KiB
22/12/13 00:15:19 WARN DAGScheduler: Broadcasting large task binary with size 1245.2 KiB
22/12/13 00:15:23 WARN DAGScheduler: Broadcasting large task binary with size 1270.4 KiB
22/12/13 00:15:27 WARN DAGScheduler: Broadcasting large task binary with size 1279.5 KiB
22/12/13 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 1202.1 KiB
22/12/13 00:15:35 WARN DAGScheduler: Broadcasting large task binary with size 1298.8 KiB


[Stage 5766:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:15:39 WARN DAGScheduler: Broadcasting large task binary with size 1257.6 KiB
22/12/13 00:15:43 WARN DAGScheduler: Broadcasting large task binary with size 1290.0 KiB
22/12/13 00:15:47 WARN DAGScheduler: Broadcasting large task binary with size 1262.7 KiB


[Stage 5940:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:15:52 WARN DAGScheduler: Broadcasting large task binary with size 1307.0 KiB
22/12/13 00:15:56 WARN DAGScheduler: Broadcasting large task binary with size 1285.8 KiB
22/12/13 00:16:00 WARN DAGScheduler: Broadcasting large task binary with size 1278.2 KiB
22/12/13 00:16:04 WARN DAGScheduler: Broadcasting large task binary with size 1311.9 KiB
22/12/13 00:16:08 WARN DAGScheduler: Broadcasting large task binary with size 1259.8 KiB
22/12/13 00:16:12 WARN DAGScheduler: Broadcasting large task binary with size 1289.6 KiB
22/12/13 00:16:16 WARN DAGScheduler: Broadcasting large task binary with size 1325.7 KiB
22/12/13 00:16:19 WARN DAGScheduler: Broadcasting large task binary with size 1207.2 KiB
22/12/13 00:16:23 WARN DAGScheduler: Broadcasting large task binary with size 1266.9 KiB
22/12/13 00:16:27 WARN DAGScheduler: Broadcasting large task binary with size 1249.5 KiB
22/12/13 00:16:31 WARN DAGScheduler: Broadcasting large task binary with size 1260.7 KiB
22/12/13 00:16:35 WAR