In [26]:
import os
import nltk

# https://stackoverflow.com/questions/51390676/how-to-visualize-pyspark-mls-lda-or-other-clustering

nltk.download('stopwords')
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.sql import SparkSession, functions
from pyspark.ml.feature import Word2Vec,CountVectorizer,Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.functions import col, udf, countDistinct, regexp_replace
from pyspark.sql.types import IntegerType,ArrayType,StringType
import pandas as pd
import numpy as np
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, expr
import csv

def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or ``None`` if unavailable.

    Intended to be wrapped as a Spark UDF, so a bad row must yield a null
    instead of raising and failing the whole task.

    Args:
        v: Indexable container of numeric values (e.g. a vector or list).
        i: Position of the element to extract.

    Returns:
        ``float(v[i])``, or ``None`` when ``v`` is not indexable (e.g.
        ``None``), ``i`` is out of range, or the element cannot be
        converted to a float.
    """
    try:
        return float(v[i])
    except (ValueError, IndexError, TypeError):
        # ValueError: element is not numeric; IndexError: i out of range;
        # TypeError: v is None / not subscriptable or element is not float-able.
        return None

# Obtain the SparkSession used by the rest of this script via
# builder.getOrCreate(), with the application name and config option below.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# For every ../data/<year>/<month>.csv file: clean the "Title" column, fit a
# CountVectorizer + LDA model on the titles, then write
#   ../processed_data/<year>/<month>.csv   - top terms per topic with weights
#   ../processed_data/<year>/<month>.json  - the same table as JSON
#   ../document_topics/<year>/<month>.csv  - dominant topic for each title
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
years = ["2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"]
# Tip: narrow `months` / `years` to a single entry for a quick test run.

# Number of LDA topics; also the number of per-topic columns built in `df`.
NUM_TOPICS = 10

# Java-regex character class matching every character stripped from titles:
#   "  -  .  ,  ?  !  /  \  '  :
# Removing them in one pass yields the same text as removing them one by one.
# A raw string avoids the invalid Python escape sequences ('\.', '\!', ...).
PUNCTUATION_PATTERN = r"""["\-.,?!/\\':]"""

# Topic Modelling on Title (Potentially do it on description if possible)
node = "Title"

# Loop-invariant helpers: the stop-word list and the element-extraction UDF.
english_stopwords = stopwords.words('english')
ith = udf(ith_, DoubleType())

for year in years:
    for month in months:
        csv_path = "/" + year + "/" + month + ".csv"
        json_path = "/" + year + "/" + month + ".json"
        data_path = "../data" + csv_path  # Data path for csv file

        spark_df = spark.read.csv(data_path, inferSchema=True, header=True)
        spark_df = spark_df.withColumn(node, regexp_replace(node, PUNCTUATION_PATTERN, ''))

        # Get title data, filter out empty nodes. The tokenizer below runs on
        # this filtered frame so it never receives a null title.
        title_data = spark_df.select(node).filter(functions.col(node).isNotNull())

        # NOTE(review): topic_num (distinct subreddit count) is not used below;
        # LDA is fitted with NUM_TOPICS. Confirm whether k should track it.
        df2 = spark_df.select(countDistinct("Subreddit"))
        topic_num = df2.first()[0]

        tokenizer = Tokenizer(inputCol=node, outputCol="words")
        tokenized = tokenizer.transform(title_data)
        # Drop empty-string tokens from the tokenizer output.
        tokenized = tokenized.withColumn("words", expr("filter(words, elem -> elem != '')"))

        remover = StopWordsRemover(stopWords=english_stopwords, inputCol="words", outputCol="filtered")
        result = remover.transform(tokenized)

        cv = CountVectorizer(inputCol="filtered", outputCol="features")
        cvModel = cv.fit(result)
        cvResult = cvModel.transform(result)

        lda = LDA(maxIter=20, k=NUM_TOPICS)
        ldaModel = lda.fit(cvResult)
        # Per-document topic distribution; computed once and reused below.
        dist = ldaModel.transform(cvResult)
        transformed = dist.select("topicDistribution")

        vocab = cvModel.vocabulary
        topics = ldaModel.describeTopics()
        # Collect the topic rows once so that words and weights come from the
        # same rows, then map term indices to vocabulary words on the driver.
        topic_rows = topics.collect()
        topics_words = [[vocab[term_idx] for term_idx in row['termIndices']] for row in topic_rows]
        topic_weights = [row['termWeights'] for row in topic_rows]

        # --- per-topic term table (CSV, then JSON) ---
        file_path = "../processed_data" + csv_path
        os.makedirs("../processed_data/" + year, exist_ok=True)
        # newline='' is what the csv module expects; utf-8 matches the default
        # encoding pd.read_csv uses to read the file back.
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["term", "probability", "topic"])
            for idx, (words, weights) in enumerate(zip(topics_words, topic_weights)):
                writer.writerows([word, weight, idx] for word, weight in zip(words, weights))

        # keep_default_na=False keeps terms such as "null" or "nan" as text
        # instead of letting pandas turn them into missing values.
        json_df = pd.read_csv(file_path, keep_default_na=False)
        json_df.to_json("../processed_data" + json_path, orient="table")

        # Lazy per-topic probability columns (topic_0 .. topic_<k-1>); built
        # but not consumed within this loop.
        df = dist.select([node] + [ith("topicDistribution", lit(i)).alias('topic_' + str(i)) for i in range(NUM_TOPICS)])

        # --- dominant topic per document ---
        # Title and distribution are fetched in a single collect so each title
        # is paired with its own distribution.
        dist_pd = dist.select(node, 'topicDistribution').toPandas()
        dominant_topic = [int(np.argmax(np.array(vec))) for vec in dist_pd['topicDistribution']]
        final_df = pd.DataFrame({
            node: dist_pd[node],
            'doc': range(len(dist_pd)),
            'topic': dominant_topic,
        })

        topic_path = "../document_topics" + csv_path
        os.makedirs("../document_topics/" + year, exist_ok=True)
        final_df.to_csv(topic_path, index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alirahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


22/12/13 00:40:08 WARN DAGScheduler: Broadcasting large task binary with size 1448.6 KiB


[Stage 8100:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:40:12 WARN DAGScheduler: Broadcasting large task binary with size 1358.2 KiB


[Stage 8158:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:40:16 WARN DAGScheduler: Broadcasting large task binary with size 1458.2 KiB
22/12/13 00:40:20 WARN DAGScheduler: Broadcasting large task binary with size 1400.0 KiB
22/12/13 00:40:25 WARN DAGScheduler: Broadcasting large task binary with size 1407.6 KiB
22/12/13 00:40:29 WARN DAGScheduler: Broadcasting large task binary with size 1381.7 KiB
22/12/13 00:40:33 WARN DAGScheduler: Broadcasting large task binary with size 1406.9 KiB


[Stage 8448:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:40:37 WARN DAGScheduler: Broadcasting large task binary with size 1372.7 KiB


[Stage 8506:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:40:41 WARN DAGScheduler: Broadcasting large task binary with size 1330.3 KiB


[Stage 8564:>                                                       (0 + 1) / 1]                                                                                

22/12/13 00:40:45 WARN DAGScheduler: Broadcasting large task binary with size 1342.2 KiB
22/12/13 00:40:49 WARN DAGScheduler: Broadcasting large task binary with size 1280.6 KiB
22/12/13 00:40:53 WARN DAGScheduler: Broadcasting large task binary with size 1274.1 KiB
22/12/13 00:40:57 WARN DAGScheduler: Broadcasting large task binary with size 1252.3 KiB
22/12/13 00:41:01 WARN DAGScheduler: Broadcasting large task binary with size 1163.9 KiB
22/12/13 00:41:05 WARN DAGScheduler: Broadcasting large task binary with size 1288.8 KiB
22/12/13 00:41:09 WARN DAGScheduler: Broadcasting large task binary with size 1214.6 KiB
22/12/13 00:41:12 WARN DAGScheduler: Broadcasting large task binary with size 1232.3 KiB
22/12/13 00:41:16 WARN DAGScheduler: Broadcasting large task binary with size 1152.0 KiB
22/12/13 00:41:20 WARN DAGScheduler: Broadcasting large task binary with size 1196.7 KiB
22/12/13 00:41:23 WARN DAGScheduler: Broadcasting large task binary with size 1159.0 KiB
22/12/13 00:41:27 WAR

[Stage 10536:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:42:50 WARN DAGScheduler: Broadcasting large task binary with size 1153.3 KiB
22/12/13 00:42:54 WARN DAGScheduler: Broadcasting large task binary with size 1145.0 KiB
22/12/13 00:42:57 WARN DAGScheduler: Broadcasting large task binary with size 1163.7 KiB
22/12/13 00:43:01 WARN DAGScheduler: Broadcasting large task binary with size 1133.5 KiB
22/12/13 00:43:05 WARN DAGScheduler: Broadcasting large task binary with size 1127.3 KiB
22/12/13 00:43:09 WARN DAGScheduler: Broadcasting large task binary with size 1168.9 KiB
22/12/13 00:43:12 WARN DAGScheduler: Broadcasting large task binary with size 1085.7 KiB
22/12/13 00:43:16 WARN DAGScheduler: Broadcasting large task binary with size 1165.1 KiB
22/12/13 00:43:20 WARN DAGScheduler: Broadcasting large task binary with size 1148.3 KiB
22/12/13 00:43:23 WARN DAGScheduler: Broadcasting large task binary with size 1177.8 KiB
22/12/13 00:43:27 WARN DAGScheduler: Broadcasting large task binary with size 1167.7 KiB
22/12/13 00:43:31 WAR

[Stage 11406:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:43:46 WARN DAGScheduler: Broadcasting large task binary with size 1194.6 KiB
22/12/13 00:43:50 WARN DAGScheduler: Broadcasting large task binary with size 1204.9 KiB
22/12/13 00:43:54 WARN DAGScheduler: Broadcasting large task binary with size 1255.4 KiB
22/12/13 00:43:58 WARN DAGScheduler: Broadcasting large task binary with size 1200.3 KiB
22/12/13 00:44:02 WARN DAGScheduler: Broadcasting large task binary with size 1299.1 KiB


[Stage 11696:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:44:06 WARN DAGScheduler: Broadcasting large task binary with size 1239.8 KiB
22/12/13 00:44:10 WARN DAGScheduler: Broadcasting large task binary with size 1272.4 KiB
22/12/13 00:44:13 WARN DAGScheduler: Broadcasting large task binary with size 1236.8 KiB


[Stage 11870:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:44:17 WARN DAGScheduler: Broadcasting large task binary with size 1283.7 KiB
22/12/13 00:44:21 WARN DAGScheduler: Broadcasting large task binary with size 1311.6 KiB
22/12/13 00:44:25 WARN DAGScheduler: Broadcasting large task binary with size 1297.1 KiB
22/12/13 00:44:29 WARN DAGScheduler: Broadcasting large task binary with size 1285.1 KiB
22/12/13 00:44:33 WARN DAGScheduler: Broadcasting large task binary with size 1267.7 KiB
22/12/13 00:44:37 WARN DAGScheduler: Broadcasting large task binary with size 1311.7 KiB


[Stage 12218:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:44:41 WARN DAGScheduler: Broadcasting large task binary with size 1311.7 KiB
22/12/13 00:44:45 WARN DAGScheduler: Broadcasting large task binary with size 1283.5 KiB


[Stage 12334:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:44:49 WARN DAGScheduler: Broadcasting large task binary with size 1311.0 KiB
22/12/13 00:44:53 WARN DAGScheduler: Broadcasting large task binary with size 1327.4 KiB


[Stage 12450:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:44:57 WARN DAGScheduler: Broadcasting large task binary with size 1309.6 KiB


[Stage 12508:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:45:01 WARN DAGScheduler: Broadcasting large task binary with size 1279.5 KiB
22/12/13 00:45:05 WARN DAGScheduler: Broadcasting large task binary with size 1271.2 KiB
22/12/13 00:45:09 WARN DAGScheduler: Broadcasting large task binary with size 1300.1 KiB
22/12/13 00:45:13 WARN DAGScheduler: Broadcasting large task binary with size 1265.6 KiB
22/12/13 00:45:17 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
22/12/13 00:45:21 WARN DAGScheduler: Broadcasting large task binary with size 1220.1 KiB
22/12/13 00:45:24 WARN DAGScheduler: Broadcasting large task binary with size 1245.3 KiB
22/12/13 00:45:28 WARN DAGScheduler: Broadcasting large task binary with size 1254.3 KiB
22/12/13 00:45:32 WARN DAGScheduler: Broadcasting large task binary with size 1177.0 KiB
22/12/13 00:45:36 WARN DAGScheduler: Broadcasting large task binary with size 1273.6 KiB
22/12/13 00:45:39 WARN DAGScheduler: Broadcasting large task binary with size 1232.4 KiB


[Stage 13146:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:45:44 WARN DAGScheduler: Broadcasting large task binary with size 1264.8 KiB


[Stage 13204:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1237.5 KiB
22/12/13 00:45:51 WARN DAGScheduler: Broadcasting large task binary with size 1281.9 KiB
22/12/13 00:45:55 WARN DAGScheduler: Broadcasting large task binary with size 1260.6 KiB
22/12/13 00:45:59 WARN DAGScheduler: Broadcasting large task binary with size 1253.1 KiB
22/12/13 00:46:03 WARN DAGScheduler: Broadcasting large task binary with size 1286.7 KiB


[Stage 13494:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:46:07 WARN DAGScheduler: Broadcasting large task binary with size 1234.6 KiB
22/12/13 00:46:11 WARN DAGScheduler: Broadcasting large task binary with size 1264.4 KiB
22/12/13 00:46:14 WARN DAGScheduler: Broadcasting large task binary with size 1300.6 KiB


[Stage 13668:>                                                      (0 + 1) / 1]                                                                                

22/12/13 00:46:18 WARN DAGScheduler: Broadcasting large task binary with size 1182.0 KiB
22/12/13 00:46:22 WARN DAGScheduler: Broadcasting large task binary with size 1241.8 KiB
22/12/13 00:46:26 WARN DAGScheduler: Broadcasting large task binary with size 1224.3 KiB
22/12/13 00:46:30 WARN DAGScheduler: Broadcasting large task binary with size 1235.5 KiB
22/12/13 00:46:34 WARN DAGScheduler: Broadcasting large task binary with size 1221.9 KiB
22/12/13 00:46:37 WARN DAGScheduler: Broadcasting large task binary with size 1240.8 KiB
22/12/13 00:46:41 WARN DAGScheduler: Broadcasting large task binary with size 1225.2 KiB
22/12/13 00:46:45 WARN DAGScheduler: Broadcasting large task binary with size 1191.1 KiB
22/12/13 00:46:49 WARN DAGScheduler: Broadcasting large task binary with size 1224.0 KiB
22/12/13 00:46:53 WARN DAGScheduler: Broadcasting large task binary with size 1180.4 KiB
22/12/13 00:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1188.3 KiB
22/12/13 00:47:00 WAR