In [179]:
import json

In [180]:
import findspark
findspark.init()

In [181]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, udf, lit
spark = SparkSession.builder.appName('read JSON files').getOrCreate()

In [182]:
# json_df=spark.read.option("inferSchema","true") \
#                 .option("header","true") \
#                 .option("sep",",") \
#                 .json("Mar*.json")

In [183]:
#json_df=spark.read.json("tweets.txt")

In [184]:
json_df=spark.read.json("Apr_tweets*.json")

In [185]:
# number of files
json_df.count()

97

In [186]:
# show the schema
json_df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)
 |-- includes: struct (nullable = true)
 |    |-- places: array (nullable = true)
 |    |    |

In [187]:
# show the schema for tweets
json_df.select('data').printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)



In [188]:
# convert array to dict
data_df=json_df.select('data').withColumn('data', explode('data').alias('data'))

In [189]:
data_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- author_id: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |-- place_id: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- public_metrics: struct (nullable = true)
 |    |    |-- like_count: long (nullable = true)
 |    |    |-- quote_count: long (nullable = true)
 |    |    |-- reply_count: long (nullable = true)
 |    |    |-- retweet_count: long (nullable = true)
 |    |-- text: string (nullable = true)



In [190]:
# number of records
data_df.count()

47978

In [191]:
data_df=data_df.select('data.author_id',
                       'data.created_at',
                       'data.geo.place_id',
                       'data.id',
                       'data.public_metrics',
                       'data.text')

In [192]:
data_df.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- public_metrics: struct (nullable = true)
 |    |-- like_count: long (nullable = true)
 |    |-- quote_count: long (nullable = true)
 |    |-- reply_count: long (nullable = true)
 |    |-- retweet_count: long (nullable = true)
 |-- text: string (nullable = true)



In [193]:
# select the place content
place_df=json_df.select('includes.places')

In [194]:
# show the schema fo place
place_df.printSchema()

root
 |-- places: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- country_code: string (nullable = true)
 |    |    |-- full_name: string (nullable = true)
 |    |    |-- id: string (nullable = true)



In [195]:
# select the user content
user_df=json_df.select('includes.users')

In [196]:
# show the schema fo user
user_df.printSchema()

root
 |-- users: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- location: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- username: string (nullable = true)



In [197]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [198]:
def read_nested_json(df):
    column_list = []

    for column_name in df.schema.names:
        print("Outside isinstance loop: " + column_name)
        # Checking column type is ArrayType
        if isinstance(df.schema[column_name].dataType, ArrayType):
            print("Inside isinstance loop of ArrayType: " + column_name)
            df = df.withColumn(column_name, explode(column_name).alias(column_name))
            column_list.append(column_name)

        elif isinstance(df.schema[column_name].dataType, StructType):
            print("Inside isinstance loop of StructType: " + column_name)
            for field in df.schema[column_name].dataType.fields:
                column_list.append(col(column_name + "." + field.name).alias(column_name + "_" + field.name))
        else:
            column_list.append(column_name)

    # Selecting columns using column_list from dataframe: df
    df = df.select(column_list)
    return df

In [199]:
def flatten_nested_json(df):
    read_nested_json_flag = True
    while read_nested_json_flag:
        print("Reading Nested JSON File ... ")
        df = read_nested_json(df)
        df.show(100, False)
        read_nested_json_flag = False

        for column_name in df.schema.names:
            if isinstance(df.schema[column_name].dataType, ArrayType):
              read_nested_json_flag = True
            elif isinstance(df.schema[column_name].dataType, StructType):
              read_nested_json_flag = True
    return df

In [214]:
data_df=flatten_nested_json(data_df)
data_df.show(10, False)

Reading Nested JSON File ... 
Outside isinstance loop: author_id
Outside isinstance loop: created_at
Outside isinstance loop: place_id
Outside isinstance loop: id
Outside isinstance loop: public_metrics_like_count
Outside isinstance loop: public_metrics_quote_count
Outside isinstance loop: public_metrics_reply_count
Outside isinstance loop: public_metrics_retweet_count
Outside isinstance loop: text
+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metric

In [215]:
data_df.show(10, False)

+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metrics_like_count|public_metrics_quote_count|public_metrics_reply_count|public_metrics_retweet_count|text                                                                                                                                                                                                                                                                                          |
+-------------------+-------------------

In [216]:
# check the number of record in data_df dataframe
data_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(*) FROM df_data").show()

+--------+
|count(1)|
+--------+
|   47978|
+--------+



In [217]:
spark.sql("SELECT count(DISTINCT id) FROM df_data").show()

+------------------+
|count(DISTINCT id)|
+------------------+
|             47978|
+------------------+



In [218]:
# flatten the neseted json to one record on row
place_df=flatten_nested_json(place_df)
place_df.show(500, False)

Reading Nested JSON File ... 
Outside isinstance loop: places_country
Outside isinstance loop: places_country_code
Outside isinstance loop: places_full_name
Outside isinstance loop: places_id
+--------------+-------------------+--------------------------------------------------+----------------+
|places_country|places_country_code|places_full_name                                  |places_id       |
+--------------+-------------------+--------------------------------------------------+----------------+
|Singapore     |SG                 |North-East Region, Singapore                      |5f1f473ed6455f55|
|Singapore     |SG                 |Central Region, Singapore                         |58a4c3a0d54e1400|
|Singapore     |SG                 |East Region, Singapore                            |6635b2fcebd13c64|
|Singapore     |SG                 |Singapore                                         |2509b9adc1fedfd2|
|Singapore     |SG                 |North Region, Singapore              

+--------------+-------------------+--------------------------------------------------+----------------+
|places_country|places_country_code|places_full_name                                  |places_id       |
+--------------+-------------------+--------------------------------------------------+----------------+
|Singapore     |SG                 |North-East Region, Singapore                      |5f1f473ed6455f55|
|Singapore     |SG                 |Central Region, Singapore                         |58a4c3a0d54e1400|
|Singapore     |SG                 |East Region, Singapore                            |6635b2fcebd13c64|
|Singapore     |SG                 |Singapore                                         |2509b9adc1fedfd2|
|Singapore     |SG                 |North Region, Singapore                           |14d9532bd696d8cb|
|Singapore     |SG                 |West Region, Singapore                            |0b37664066a8962a|
|Singapore     |SG                 |Lazarus Island     

In [219]:
# show the tweet location
place_df.show(500, False)

+--------------+-------------------+--------------------------------------------------+----------------+
|places_country|places_country_code|places_full_name                                  |places_id       |
+--------------+-------------------+--------------------------------------------------+----------------+
|Singapore     |SG                 |North-East Region, Singapore                      |5f1f473ed6455f55|
|Singapore     |SG                 |Central Region, Singapore                         |58a4c3a0d54e1400|
|Singapore     |SG                 |East Region, Singapore                            |6635b2fcebd13c64|
|Singapore     |SG                 |Singapore                                         |2509b9adc1fedfd2|
|Singapore     |SG                 |North Region, Singapore                           |14d9532bd696d8cb|
|Singapore     |SG                 |West Region, Singapore                            |0b37664066a8962a|
|Singapore     |SG                 |Lazarus Island     

In [220]:
# check the number of record in place_df dataframe
place_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(*) FROM df_data").show()

+--------+
|count(1)|
+--------+
|    1502|
+--------+



In [221]:
# flatten the neseted json to one record on row
user_df=flatten_nested_json(user_df)

Reading Nested JSON File ... 
Outside isinstance loop: users_id
Outside isinstance loop: users_location
Outside isinstance loop: users_name
Outside isinstance loop: users_username
+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|743

In [222]:
# show the tweet user
user_df.show(100, False)

+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|74392791           |Central Region, Singapore     |Abu Zimal                       |chat2deen      |
|6988732            |Singapore                     |Daphne Maia                   

In [223]:
# check the number of record in user_df dataframe
user_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(DISTINCT users_id) FROM df_data").show()

+------------------------+
|count(DISTINCT users_id)|
+------------------------+
|                    4374|
+------------------------+



In [224]:
spark.sql("SELECT users_name,users_location FROM df_data where users_location is not null").show(500,False)

+-----------------------------------------------------+------------------------------------+
|users_name                                           |users_location                      |
+-----------------------------------------------------+------------------------------------+
|skztrees                                             |stayville / chan's room             |
|Я.                                                   |A Galaxy Far Far Away...            |
|Lexelle de Charmaine                                 |Singapore                           |
|HninOo05                                             |Myanmar                             |
|SGFirstAid                                           |Singapore                           |
|Abu Zimal                                            |Central Region, Singapore           |
|Daphne Maia                                          |Singapore                           |
|bakchormeeboy                                        |London/Singapor

In [225]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from gensim.models import LsiModel

In [226]:
# get the tweet text 
raw_tweets = data_df.select('id','text')

In [None]:
raw_tweets.show(10,False)

In [None]:
raw_tweets.printSchema()

In [None]:
import re
from pyspark.sql.functions import udf
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@â'
def remove_links(tweet):
    tweet = re.sub(r'http\S+', '', tweet) 
    tweet = re.sub(r'bit.ly/\S+', '', tweet) 
    tweet = tweet.strip('[link]') 
    return tweet
def remove_users(tweet):
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    return tweet
def remove_punctuation(tweet):
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) 
    return tweet
def remove_number(tweet):
    tweet = re.sub('([0-9]+)', '', tweet) 
    return tweet
def remove_hashtag(tweet):
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    return tweet

In [None]:
remove_links=udf(remove_links)
remove_users=udf(remove_users)
remove_punctuation=udf(remove_punctuation)
remove_number=udf(remove_number)

In [None]:
raw_tweets=raw_tweets.withColumn('processed_text', remove_links(raw_tweets['text']))

In [None]:
raw_tweets=raw_tweets.withColumn('processed_text', remove_users(raw_tweets['processed_text']))

In [None]:
raw_tweets=raw_tweets.withColumn('processed_text', remove_punctuation(raw_tweets['processed_text']))

In [None]:
raw_tweets=raw_tweets.withColumn('processed_text', remove_number(raw_tweets['processed_text']))

In [None]:
raw_tweets.select('text','processed_text').show(10,False)

In [None]:
raw_tweets=raw_tweets.withColumn('remove_links', remove_links(raw_tweets['text']))

In [None]:
raw_tweets.select('text').withColumn('text', remove_number('text'))

In [None]:
# Create a tokenizer that Filter away tokens with length < 3, and get rid of symbols like $,#,...
tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(3).setInputCol("processed_text").setOutputCol("tokens")

In [None]:
#tokenizer = RegexTokenizer().setPattern("^[a-zA-Z]+\\b").setMinTokenLength(3).setInputCol("text").setOutputCol("tokens")

In [None]:
# Tokenize tweets
tokenized_tweets = tokenizer.transform(raw_tweets)

In [None]:
tokenized_tweets.select('text','tokens').show(50, False)

In [None]:
# create cutomized extended stop word list
stopwordList = ["singapore","Singapore"]
StopWordsRemover().getStopWords()
stopwordList.extend(StopWordsRemover().getStopWords())
stopwordList = list(set(stopwordList))

In [None]:
# Create a remover that Filter away stop word
remover = StopWordsRemover(stopWords=stopwordList).setInputCol("tokens").setOutputCol("cleaned")

In [None]:
# remove stopwords
cleaned_tweets = remover.transform(tokenized_tweets)

In [None]:
cleaned_tweets.show()

In [None]:
# create a vector of words that at least appeared in two different tweets, and set maximum vocab size to 20000.
vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol("features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
wordVectors = vectorizer.transform(cleaned_tweets)#.select("id", "features")

In [230]:
wordVectors.show()

+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 id|                text|      processed_text|              tokens|             cleaned|            features|
+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...| im sorry if i co...|[sorry, come, out...|[sorry, come, ign...|(18851,[85,328,32...|
|1384064515759693830|@yogezlor By now ...| By now you shoul...|[now, you, should...|[know, fan, base,...|(18851,[12,27,244...|
|1384064394011709446|Forgot about the ...|Forgot about the ...|[forgot, about, t...|[forgot, wire, ta...|(18851,[641,714,7...|
|1384064393017626635|Massive protester...|Massive protester...|[massive, protest...|[massive, protest...|(18851,[0,3,17,56...|
|1384063860852678657|Just posted a pho...|Just posted a pho...|[just, posted, ph...|[posted, photo, f...|(18851

In [None]:
# LDA
# create Latent Dirichlet Allocation model and run it on our data with 10 iteration and 10 topics
lda = LDA(k=10, maxIter=10)

In [None]:
# fit the model on data
ldaModel = lda.fit(wordVectors)

In [229]:
ldaModel

LDA_9171b118f592

In [228]:
ll = ldaModel.logLikelihood(wordVectors)
lp = ldaModel.logPerplexity(wordVectors)

Py4JJavaError: An error occurred while calling o1663.logLikelihood.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 224.0 failed 1 times, most recent failure: Lost task 3.0 in stage 224.0 (TID 2040, localhost, executor driver): java.net.SocketException: Connection reset by peer: socket write error
	at java.net.SocketOutputStream.socketWrite0(Native Method)
	at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
	at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at java.io.FilterOutputStream.write(FilterOutputStream.java:97)
	at org.apache.spark.api.python.PythonRDD$.org$apache$spark$api$python$PythonRDD$$write$1(PythonRDD.scala:212)
	at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:224)
	at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:224)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1143)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1137)
	at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:35)
	at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:35)
	at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:35)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:34)
	at org.apache.spark.mllib.clustering.LocalLDAModel.logLikelihoodBound(LDAModel.scala:340)
	at org.apache.spark.mllib.clustering.LocalLDAModel.logLikelihood(LDAModel.scala:252)
	at org.apache.spark.ml.clustering.LDAModel.logLikelihood(LDA.scala:518)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection reset by peer: socket write error
	at java.net.SocketOutputStream.socketWrite0(Native Method)
	at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
	at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at java.io.FilterOutputStream.write(FilterOutputStream.java:97)
	at org.apache.spark.api.python.PythonRDD$.org$apache$spark$api$python$PythonRDD$$write$1(PythonRDD.scala:212)
	at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:224)
	at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:224)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)


In [227]:
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -3370501.5001549376
The upper bound on perplexity: 9.120232155168205


In [None]:
# extract vocabulary from CountVectorizer
vocab = vectorizer.vocabulary

In [None]:
# create topics based on LDA
lda_topics = ldaModel.describeTopics()
lda_topics.show()

In [None]:
lda_topics.select('termWeights').show(10,False)

In [None]:
lda_topics.select('termIndices').show(10,False)

In [None]:
topics_rdd = lda_topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)