In [203]:
import json
from pyspark.sql.functions import split, explode

In [204]:
import findspark
# NOTE(review): pyspark is already imported in an earlier cell, before this init() call.
# Presumably findspark is meant to make pyspark importable; if so this cell has to run first — confirm.
findspark.init()

In [205]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one under this app name.
spark = (
    SparkSession.builder
    .appName('read JSON files')
    .getOrCreate()
)

In [206]:
# json_df=spark.read.option("inferSchema","true") \
#                 .option("header","true") \
#                 .option("sep",",") \
#                 .json("Mar*.json")

In [207]:
#json_df=spark.read.json("tweets.txt")

In [208]:
# Read every file matching the Apr_tweets*.json glob into a single DataFrame.
# No schema is passed, so Spark infers it from the data.
json_df=spark.read.json("Apr_tweets*.json")

In [209]:
# number of rows in json_df, i.e. top-level JSON records read
# presumably one record per input file, hence "number of files" — TODO confirm
json_df.count()

97

In [210]:
# show the full inferred schema: tweets sit in the `data` array, lookup records under `includes`
json_df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)
 |-- includes: struct (nullable = true)
 |    |-- places: array (nullable = true)
 |    |    |

In [211]:
# show the schema for tweets only (the `data` array column)
json_df.select('data').printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)



In [212]:
# explode the `data` array so that every tweet struct gets its own row
data_df = json_df.select(explode('data').alias('data'))

In [213]:
# after the explode, `data` is a single struct per row instead of an array
data_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- author_id: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |-- place_id: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- public_metrics: struct (nullable = true)
 |    |    |-- like_count: long (nullable = true)
 |    |    |-- quote_count: long (nullable = true)
 |    |    |-- reply_count: long (nullable = true)
 |    |    |-- retweet_count: long (nullable = true)
 |    |-- text: string (nullable = true)



In [214]:
# number of records (one row per exploded element of `data`)
data_df.count()

47978

In [215]:
# Promote the tweet fields out of the `data` struct into top-level columns.
# Of `geo`, only place_id is kept. public_metrics stays a struct for now.
# Gotcha: data_df is overwritten, so re-running this cell fails because the
# `data` column no longer exists on the projected frame.
tweet_fields = [
    'data.author_id',
    'data.created_at',
    'data.geo.place_id',
    'data.id',
    'data.public_metrics',
    'data.text',
]
data_df = data_df.select(*tweet_fields)

In [216]:
# confirm the projected layout; public_metrics is the only column that is still nested
data_df.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- public_metrics: struct (nullable = true)
 |    |-- like_count: long (nullable = true)
 |    |-- quote_count: long (nullable = true)
 |    |-- reply_count: long (nullable = true)
 |    |-- retweet_count: long (nullable = true)
 |-- text: string (nullable = true)



In [217]:
# select the place content (the `places` array under `includes`)
place_df=json_df.select('includes.places')

In [218]:
# show the schema for places
place_df.printSchema()

root
 |-- places: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- country_code: string (nullable = true)
 |    |    |-- full_name: string (nullable = true)
 |    |    |-- id: string (nullable = true)



In [219]:
# select the user content (the `users` array under `includes`)
user_df=json_df.select('includes.users')

In [220]:
# show the schema for users
user_df.printSchema()

root
 |-- users: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- location: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- username: string (nullable = true)



In [221]:
# NOTE(review): wildcard imports. The flattening helpers defined below rely on col, explode,
# ArrayType and StructType coming from these two lines; explicit imports would make that visible.
# pyspark.sql.functions also exports names such as sum/min/max, which shadow the Python builtins
# from this cell onwards — confirm nothing later depends on the builtins.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [222]:
def read_nested_json(df, verbose=True):
    """Flatten one level of nesting in ``df`` and return the resulting DataFrame.

    Every top-level column is handled according to its type:

    * ArrayType  -- exploded in place, so each array element gets its own row
      (``explode`` produces no row for a null or empty array).
    * StructType -- replaced by one column per field, named ``<column>_<field>``.
    * anything else -- kept unchanged.

    Column and field names are backtick-quoted when they are referenced, so a
    name that contains a dot is treated as one identifier instead of being
    parsed as a path into a struct.

    Args:
        df: Spark DataFrame to flatten.
        verbose: when True (the default) print a trace line for every column
            visited and for every array/struct column that gets expanded.

    Returns:
        A new DataFrame with one nesting level removed. Elements of an exploded
        array and fields of an expanded struct may themselves still be nested.
    """

    def _quoted(name):
        # Backticks inside a quoted identifier are escaped by doubling them.
        return "`" + name.replace("`", "``") + "`"

    column_list = []

    # The name list is taken once up front; exploding a column below changes
    # that column's type but never the set of column names.
    for column_name in df.schema.names:
        if verbose:
            print("Outside isinstance loop: " + column_name)
        data_type = df.schema[column_name].dataType

        if isinstance(data_type, ArrayType):
            if verbose:
                print("Inside isinstance loop of ArrayType: " + column_name)
            # withColumn replaces the existing column of the same name.
            df = df.withColumn(column_name, explode(col(_quoted(column_name))))
            column_list.append(col(_quoted(column_name)))

        elif isinstance(data_type, StructType):
            if verbose:
                print("Inside isinstance loop of StructType: " + column_name)
            # A struct without fields contributes no columns and so disappears.
            for field in data_type.fields:
                field_path = _quoted(column_name) + "." + _quoted(field.name)
                column_list.append(col(field_path).alias(column_name + "_" + field.name))

        else:
            column_list.append(col(_quoted(column_name)))

    # Selecting columns using column_list from dataframe: df
    return df.select(column_list)

In [223]:
def flatten_nested_json(df, preview_rows=100):
    """Flatten ``df`` repeatedly until no array or struct columns remain.

    Each pass delegates to ``read_nested_json``, which removes one level of
    nesting. At least one pass is always made, even when ``df`` is already
    flat. Only ArrayType and StructType columns count as nested; other
    container types (for example maps) are left as they are.

    Args:
        df: Spark DataFrame to flatten.
        preview_rows: number of rows to print, untruncated, after every pass.
            Defaults to 100. Pass 0 or None to skip the preview, which also
            avoids running a Spark job per pass just to display rows.

    Returns:
        The flattened DataFrame.
    """

    def _has_nested_columns(frame):
        # any() stops at the first array/struct column it finds.
        return any(
            isinstance(field.dataType, (ArrayType, StructType))
            for field in frame.schema.fields
        )

    while True:
        print("Reading Nested JSON File ... ")
        df = read_nested_json(df)
        if preview_rows:
            df.show(preview_rows, False)
        if not _has_nested_columns(df):
            return df

In [224]:
# Flatten the tweet frame: the public_metrics struct becomes public_metrics_* columns.
data_df = flatten_nested_json(data_df)
# Print up to 500 rows with full, untruncated cell contents.
data_df.show(n=500, truncate=False)

Reading Nested JSON File ... 
Outside isinstance loop: author_id
Outside isinstance loop: created_at
Outside isinstance loop: place_id
Outside isinstance loop: id
Outside isinstance loop: public_metrics
Inside isinstance loop of StructType: public_metrics
Outside isinstance loop: text
+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metrics_like_count|public_metrics_quote_count|public_metrics_reply_count|public_metrics_retweet_count|text                

+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metrics_like_count|public_metrics_quote_count|public_metrics_reply_count|public_metrics_retweet_count|text                                                                                                                                                                                                                                                                                                              

In [225]:
# preview the first 10 flattened tweets without truncating the text column
data_df.show(10, False)

+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metrics_like_count|public_metrics_quote_count|public_metrics_reply_count|public_metrics_retweet_count|text                                                                                                                                                                                                                                                                                          |
+-------------------+-------------------

In [226]:
# check the number of records in the data_df dataframe via SQL
# Gotcha: the view name "df_data" is registered again further down for a different frame,
# so queries against it only reflect the tweets until that happens.
data_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(*) FROM df_data").show()

+--------+
|count(1)|
+--------+
|   47978|
+--------+



In [227]:
# count distinct tweet ids; a result equal to the row count above means no tweet appears twice
# (relies on "df_data" still being the tweet view registered in the previous cell)
spark.sql("SELECT count(DISTINCT id) FROM df_data").show()

+------------------+
|count(DISTINCT id)|
+------------------+
|             47978|
+------------------+



In [228]:
# flatten the nested JSON to one place record per row
place_df=flatten_nested_json(place_df)
place_df.show(500, False)

Reading Nested JSON File ... 
Outside isinstance loop: places
Inside isinstance loop of ArrayType: places
+-------------------------------------------------------------------------------------+
|places                                                                               |
+-------------------------------------------------------------------------------------+
|[Singapore, SG, North-East Region, Singapore, 5f1f473ed6455f55]                      |
|[Singapore, SG, Central Region, Singapore, 58a4c3a0d54e1400]                         |
|[Singapore, SG, East Region, Singapore, 6635b2fcebd13c64]                            |
|[Singapore, SG, Singapore, 2509b9adc1fedfd2]                                         |
|[Singapore, SG, North Region, Singapore, 14d9532bd696d8cb]                           |
|[Singapore, SG, West Region, Singapore, 0b37664066a8962a]                            |
|[Singapore, SG, Lazarus Island, 07d9ed829f484001]                                    |
|[Singapore, S

+--------------+-------------------+--------------------------------------------------+----------------+
|places_country|places_country_code|places_full_name                                  |places_id       |
+--------------+-------------------+--------------------------------------------------+----------------+
|Singapore     |SG                 |North-East Region, Singapore                      |5f1f473ed6455f55|
|Singapore     |SG                 |Central Region, Singapore                         |58a4c3a0d54e1400|
|Singapore     |SG                 |East Region, Singapore                            |6635b2fcebd13c64|
|Singapore     |SG                 |Singapore                                         |2509b9adc1fedfd2|
|Singapore     |SG                 |North Region, Singapore                           |14d9532bd696d8cb|
|Singapore     |SG                 |West Region, Singapore                            |0b37664066a8962a|
|Singapore     |SG                 |Lazarus Island     

In [229]:
# show the tweet locations (up to 500 rows, untruncated)
place_df.show(500, False)

+--------------+-------------------+--------------------------------------------------+----------------+
|places_country|places_country_code|places_full_name                                  |places_id       |
+--------------+-------------------+--------------------------------------------------+----------------+
|Singapore     |SG                 |North-East Region, Singapore                      |5f1f473ed6455f55|
|Singapore     |SG                 |Central Region, Singapore                         |58a4c3a0d54e1400|
|Singapore     |SG                 |East Region, Singapore                            |6635b2fcebd13c64|
|Singapore     |SG                 |Singapore                                         |2509b9adc1fedfd2|
|Singapore     |SG                 |North Region, Singapore                           |14d9532bd696d8cb|
|Singapore     |SG                 |West Region, Singapore                            |0b37664066a8962a|
|Singapore     |SG                 |Lazarus Island     

In [230]:
# check the number of records in the place_df dataframe
# Registered under its own view name so that the tweet view "df_data" is not replaced.
place_df.createOrReplaceTempView("df_place")
spark.sql("SELECT count(*) FROM df_place").show()

+--------+
|count(1)|
+--------+
|    1502|
+--------+



In [231]:
# flatten the nested JSON to one user record per row
user_df=flatten_nested_json(user_df)

Reading Nested JSON File ... 
Outside isinstance loop: users
Inside isinstance loop of ArrayType: users
+--------------------------------------------------------------------------------------+
|users                                                                                 |
+--------------------------------------------------------------------------------------+
|[1278250537972985856, stayville / chan's room, skztrees, skztrees]                    |
|[63608631, A Galaxy Far Far Away..., Я., raguugar]                                    |
|[125625919, Singapore, Lexelle de Charmaine, lexelledextjluv]                         |
|[1357206521407434752, Myanmar, HninOo05, HninOoWai05]                                 |
|[24787145, Singapore, SGFirstAid, sfatc]                                              |
|[74392791, Central Region, Singapore, Abu Zimal, chat2deen]                           |
|[6988732, Singapore, Daphne Maia, daphnemaia]                                         |
|[4814

Reading Nested JSON File ... 
Outside isinstance loop: users
Inside isinstance loop of StructType: users
+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|74392791           |Central Region, Singapore     |Abu Zimal                  

In [232]:
# show the tweet users (first 100 rows, untruncated)
user_df.show(100, False)

+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|74392791           |Central Region, Singapore     |Abu Zimal                       |chat2deen      |
|6988732            |Singapore                     |Daphne Maia                   

In [233]:
# check the number of distinct users in the user_df dataframe
# Gotcha: this re-registers the view name "df_data"; from here on it points at user_df.
user_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(DISTINCT users_id) FROM df_data").show()

+------------------------+
|count(DISTINCT users_id)|
+------------------------+
|                    4374|
+------------------------+



In [234]:
# list name and location for every user row whose location is set
# (relies on "df_data" being the user view registered in the previous cell)
spark.sql("SELECT users_name,users_location FROM df_data where users_location is not null").show(500,False)

+-----------------------------------------------------+------------------------------------+
|users_name                                           |users_location                      |
+-----------------------------------------------------+------------------------------------+
|skztrees                                             |stayville / chan's room             |
|Я.                                                   |A Galaxy Far Far Away...            |
|Lexelle de Charmaine                                 |Singapore                           |
|HninOo05                                             |Myanmar                             |
|SGFirstAid                                           |Singapore                           |
|Abu Zimal                                            |Central Region, Singapore           |
|Daphne Maia                                          |Singapore                           |
|bakchormeeboy                                        |London/Singapor

In [235]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from gensim.models import LsiModel

In [236]:
# get the tweet text, keeping the tweet id alongside it
raw_tweets = data_df.select('id','text')

In [237]:
# preview 10 rows (long text is truncated by show's default settings)
raw_tweets.show(10)

+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|
|1384064515759693830|@yogezlor By now ...|
|1384064394011709446|Forgot about the ...|
|1384064393017626635|Massive protester...|
|1384063860852678657|Just posted a pho...|
|1384063582938140673|@Ibraheema_Ykb Ok...|
|1384063508652851200|@mintea AND KIDS....|
|1384063029076127747|Just posted a pho...|
|1384062369492389892|@SheaSonia I am s...|
|1384062319781502977|In MyinGyan, terr...|
+-------------------+--------------------+
only showing top 10 rows



In [238]:
# Tokenizer for the tweet text: the pattern is the delimiter, so the text is split on
# runs of non-word characters and underscores (dropping symbols such as $ and #),
# and tokens shorter than 3 characters are discarded.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern="[\\W_]+",
    minTokenLength=3,
)

In [239]:
# Tokenize tweets: adds a `tokens` column next to `id` and `text`
tokenized_tweets = tokenizer.transform(raw_tweets)

In [240]:
# preview the tokenizer output with show's default settings
tokenized_tweets.show()

+-------------------+--------------------+--------------------+
|                 id|                text|              tokens|
+-------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|
|1384064393017626635|Massive protester...|[massive, protest...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|
|1384063508652851200|@mintea AND KIDS....|[mintea, and, kid...|
|1384063029076127747|Just posted a pho...|[just, posted, ph...|
|1384062369492389892|@SheaSonia I am s...|[sheasonia, heart...|
|1384062319781502977|In MyinGyan, terr...|[myingyan, terror...|
|1384062091493994496|In Myingyan, Mand...|[myingyan, mandal...|
|1384062077375971333|        Exhausted 😣|         [exhausted]|
|1384061831099019267|@HaroldYeddo Love...

In [241]:
# create a customized, extended stop word list: the StopWordsRemover defaults plus "http"
# (built through a set, so duplicates are removed and the order is arbitrary)
# NOTE(review): "https" is not in the list — confirm whether tokens from https links should be dropped too.
stopwordList = list({"http", *StopWordsRemover().getStopWords()})

In [242]:
# Remover that filters the custom stop word list out of `tokens` and writes the result to `cleaned`
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="cleaned",
    stopWords=stopwordList,
)

In [243]:
# remove stopwords: adds a `cleaned` column holding the tokens that are not stop words
cleaned_tweets = remover.transform(tokenized_tweets)

In [244]:
# preview `tokens` next to `cleaned` to see what the stop word filter removed
cleaned_tweets.show()

+-------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|             cleaned|
+-------------------+--------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|[hhjflrts, sorry,...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|[yogezlor, know, ...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|[forgot, wire, ta...|
|1384064393017626635|Massive protester...|[massive, protest...|[massive, protest...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|[ibraheema, ykb, ...|
|1384063508652851200|@mintea AND KIDS....|[mintea, and, kid...|[mintea, kids, da...|
|1384063029076127747|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|
|1384062369492389892|@SheaSonia I am s...|[sheasonia, heart...|[s

In [245]:
# Learn a vocabulary of at most 20000 terms, keeping only terms that appear in at least
# two different tweets. `vectorizer` is the fitted model, not the estimator.
vectorizer = CountVectorizer(
    inputCol="cleaned",
    outputCol="features",
    vocabSize=20000,
    minDF=2,
).fit(cleaned_tweets)
# Add a `features` column with the sparse term-count vector of every tweet.
wordVectors = vectorizer.transform(cleaned_tweets)#.select("id", "features")

In [246]:
# preview the count vectors in the new `features` column
wordVectors.show()

+-------------------+--------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|             cleaned|            features|
+-------------------+--------------------+--------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|[hhjflrts, sorry,...|(20000,[88,336,33...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|[yogezlor, know, ...|(20000,[16,29,241...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|[forgot, wire, ta...|(20000,[0,1,503,6...|
|1384064393017626635|Massive protester...|[massive, protest...|[massive, protest...|(20000,[0,2,7,21,...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|(20000,[0,1,3,4,5...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|[ibraheema, ykb, ...|(20000,[28,71,98,...|
|1384063508652851200|@mintea AND KIDS....|[min

In [247]:
# LDA
# create a Latent Dirichlet Allocation estimator with 10 topics and at most 25 iterations;
# the seed is fixed so that repeated runs start from the same random state
lda = LDA(k=10, maxIter=25, seed=42)

In [None]:
# fit the model on data (reads the `features` column of wordVectors, LDA's default input column)
ldaModel = lda.fit(wordVectors)

In [None]:
# score the fitted model on the same vectors it was trained on
# ll: log likelihood, lp: log perplexity (both are printed in the next cell)
ll = ldaModel.logLikelihood(wordVectors)
lp = ldaModel.logPerplexity(wordVectors)

In [None]:
# Report both model-fit figures computed in the previous cell.
print(f"The lower bound on the log likelihood of the entire corpus: {ll}")
print(f"The upper bound on perplexity: {lp}")

In [None]:
# extract vocabulary from the fitted CountVectorizer model;
# the term indices reported by LDA are used as positions in this list further down
vocab = vectorizer.vocabulary

In [None]:
# describe the topics found by LDA (one row per topic) and preview them
lda_topics = ldaModel.describeTopics()
lda_topics.show()

In [None]:
# show the untruncated term index lists for up to 10 topics
lda_topics.select('termIndices').show(10,False)

In [None]:
# Translate every topic's term indices into the corresponding vocabulary words.
topics_rdd = lda_topics.rdd
topics_words = (
    topics_rdd
    .map(lambda row: [vocab[term_idx] for term_idx in row['termIndices']])
    .collect()
)

# Print each topic as a block of words framed by separator lines.
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
        print(word)
    print("*"*25)