In [72]:
import json
from pyspark.sql.functions import split, explode

In [73]:
import findspark
findspark.init()

In [74]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('read JSON files').getOrCreate()

In [4]:
json_df=spark.read.option("inferSchema","true") \
                .option("header","true") \
                .option("sep",",") \
                .json("Mar*.json")

In [5]:
json_df=spark.read.json("Apr_tweets1?.json")

In [75]:
json_df=spark.read.json("Apr_tweets*.json")

In [77]:
json_df.show(100)

+--------------------+--------------------+--------------------+
|                data|            includes|                meta|
+--------------------+--------------------+--------------------+
|[[127825053797298...|[[[Singapore, SG,...|[1384064534600507...|
|[[2378489462, 202...|[[[Singapore, SG,...|[1380850453634760...|
|[[873550863121371...|[[[Singapore, SG,...|[1385188172414418...|
|[[135719655363146...|[[[Singapore, SG,...|[1385609869701517...|
|[[135719655363146...|[[[Singapore, SG,...|[1382310090133307...|
|[[824637752574488...|[[[Singapore, SG,...|[1380766986632843...|
|[[111780791676384...|[[[Singapore, SG,...|[1382946091482505...|
|[[3312668113, 202...|[[[Singapore, SG,...|[1387358181194485...|
|[[138151542070472...|[[[Singapore, SG,...|[1381964154198585...|
|[[1269628692, 202...|[[[Singapore, SG,...|[1382242596253175...|
|[[135719655363146...|[[[Singapore, SG,...|[1383742348102758...|
|[[1369590464, 202...|[[[Singapore, SG,...|[1386990751050092...|
|[[2917431175, 202...|[[[

In [78]:
json_df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)
 |-- includes: struct (nullable = true)
 |    |-- places: array (nullable = true)
 |    |    |

In [79]:
json_df.select('data').printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- geo: struct (nullable = true)
 |    |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- place_id: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- public_metrics: struct (nullable = true)
 |    |    |    |-- like_count: long (nullable = true)
 |    |    |    |-- quote_count: long (nullable = true)
 |    |    |    |-- reply_count: long (nullable = true)
 |    |    |    |-- retweet_count: long (nullable = true)
 |    |    |-- text: string (nullable = true)



In [80]:
data_df=json_df.select('data').withColumn('data', explode('data').alias('data'))

In [81]:
data_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- author_id: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- coordinates: struct (nullable = true)
 |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |-- place_id: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- public_metrics: struct (nullable = true)
 |    |    |-- like_count: long (nullable = true)
 |    |    |-- quote_count: long (nullable = true)
 |    |    |-- reply_count: long (nullable = true)
 |    |    |-- retweet_count: long (nullable = true)
 |    |-- text: string (nullable = true)



In [82]:
data_df=data_df.select('data.author_id',
                       'data.created_at',
                       'data.geo.place_id',
                       'data.id',
                       'data.public_metrics',
                       'data.text')

In [83]:
data_df.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- public_metrics: struct (nullable = true)
 |    |-- like_count: long (nullable = true)
 |    |-- quote_count: long (nullable = true)
 |    |-- reply_count: long (nullable = true)
 |    |-- retweet_count: long (nullable = true)
 |-- text: string (nullable = true)



In [84]:
place_df=json_df.select('includes.places')

In [85]:
place_df.printSchema()

root
 |-- places: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- country_code: string (nullable = true)
 |    |    |-- full_name: string (nullable = true)
 |    |    |-- id: string (nullable = true)



In [86]:
user_df=json_df.select('includes.users')

In [87]:
user_df.printSchema()

root
 |-- users: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- location: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- username: string (nullable = true)



In [88]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [89]:
def read_nested_json(df):
    column_list = []

    for column_name in df.schema.names:
        print("Outside isinstance loop: " + column_name)
        # Checking column type is ArrayType
        if isinstance(df.schema[column_name].dataType, ArrayType):
            print("Inside isinstance loop of ArrayType: " + column_name)
            df = df.withColumn(column_name, explode(column_name).alias(column_name))
            column_list.append(column_name)

        elif isinstance(df.schema[column_name].dataType, StructType):
            print("Inside isinstance loop of StructType: " + column_name)
            for field in df.schema[column_name].dataType.fields:
                column_list.append(col(column_name + "." + field.name).alias(column_name + "_" + field.name))
        else:
            column_list.append(column_name)

    # Selecting columns using column_list from dataframe: df
    df = df.select(column_list)
    return df

In [92]:
read_nested_json_flag = True

while read_nested_json_flag:
  print("Reading Nested JSON File ... ")
  data_df = read_nested_json(data_df)
  data_df.show(100, False)
  read_nested_json_flag = False

  for column_name in df.schema.names:
    if isinstance(df.schema[column_name].dataType, ArrayType):
      read_nested_json_flag = True
    elif isinstance(df.schema[column_name].dataType, StructType):
      read_nested_json_flag = True

data_df.show(500, False)

Reading Nested JSON File ... 
Outside isinstance loop: author_id
Outside isinstance loop: created_at
Outside isinstance loop: place_id
Outside isinstance loop: id
Outside isinstance loop: public_metrics_like_count
Outside isinstance loop: public_metrics_quote_count
Outside isinstance loop: public_metrics_reply_count
Outside isinstance loop: public_metrics_retweet_count
Outside isinstance loop: text
+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metric

NameError: name 'df' is not defined

In [93]:
data_df.show(10, False)

+-------------------+------------------------+----------------+-------------------+-------------------------+--------------------------+--------------------------+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author_id          |created_at              |place_id        |id                 |public_metrics_like_count|public_metrics_quote_count|public_metrics_reply_count|public_metrics_retweet_count|text                                                                                                                                                                                                                                                                                          |
+-------------------+-------------------

In [94]:
data_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(*) FROM df_data").show()

+--------+
|count(1)|
+--------+
|   47978|
+--------+



In [95]:
spark.sql("SELECT count(DISTINCT id) FROM df_data").show()

+------------------+
|count(DISTINCT id)|
+------------------+
|             47978|
+------------------+



In [96]:
read_nested_json_flag = True

while read_nested_json_flag:
  print("Reading Nested JSON File ... ")
  place_df = read_nested_json(place_df)
  place_df.show(100, False)
  read_nested_json_flag = False

  for column_name in df.schema.names:
    if isinstance(df.schema[column_name].dataType, ArrayType):
      read_nested_json_flag = True
    elif isinstance(df.schema[column_name].dataType, StructType):
      read_nested_json_flag = True

place_df.show(500, False)

Reading Nested JSON File ... 
Outside isinstance loop: places
Inside isinstance loop of ArrayType: places
+-------------------------------------------------------------------------------------+
|places                                                                               |
+-------------------------------------------------------------------------------------+
|[Singapore, SG, North-East Region, Singapore, 5f1f473ed6455f55]                      |
|[Singapore, SG, Central Region, Singapore, 58a4c3a0d54e1400]                         |
|[Singapore, SG, East Region, Singapore, 6635b2fcebd13c64]                            |
|[Singapore, SG, Singapore, 2509b9adc1fedfd2]                                         |
|[Singapore, SG, North Region, Singapore, 14d9532bd696d8cb]                           |
|[Singapore, SG, West Region, Singapore, 0b37664066a8962a]                            |
|[Singapore, SG, Lazarus Island, 07d9ed829f484001]                                    |
|[Singapore, S

NameError: name 'df' is not defined

In [97]:
place_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(*) FROM df_data").show()

+--------+
|count(1)|
+--------+
|    1502|
+--------+



In [104]:
read_nested_json_flag = True

while read_nested_json_flag:
  print("Reading Nested JSON File ... ")
  user_df = read_nested_json(user_df)
  user_df.show(100, False)
  read_nested_json_flag = False

  for column_name in df.schema.names:
    if isinstance(df.schema[column_name].dataType, ArrayType):
      read_nested_json_flag = True
    elif isinstance(df.schema[column_name].dataType, StructType):
      read_nested_json_flag = True


Reading Nested JSON File ... 
Outside isinstance loop: users
Inside isinstance loop of StructType: users
+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|74392791           |Central Region, Singapore     |Abu Zimal                  

NameError: name 'df' is not defined

In [105]:
user_df.show(100, False)

+-------------------+------------------------------+--------------------------------+---------------+
|users_id           |users_location                |users_name                      |users_username |
+-------------------+------------------------------+--------------------------------+---------------+
|1278250537972985856|stayville / chan's room       |skztrees                        |skztrees       |
|63608631           |A Galaxy Far Far Away...      |Я.                              |raguugar       |
|125625919          |Singapore                     |Lexelle de Charmaine            |lexelledextjluv|
|1357206521407434752|Myanmar                       |HninOo05                        |HninOoWai05    |
|24787145           |Singapore                     |SGFirstAid                      |sfatc          |
|74392791           |Central Region, Singapore     |Abu Zimal                       |chat2deen      |
|6988732            |Singapore                     |Daphne Maia                   

In [106]:
user_df.createOrReplaceTempView("df_data")
spark.sql("SELECT count(DISTINCT users_id) FROM df_data").show()

+------------------------+
|count(DISTINCT users_id)|
+------------------------+
|                    4374|
+------------------------+



In [107]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from gensim.models import LsiModel

In [108]:
# get the tweet text 
raw_tweets = data_df.select('id','text')

In [109]:
raw_tweets.show(10)

+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|
|1384064515759693830|@yogezlor By now ...|
|1384064394011709446|Forgot about the ...|
|1384064393017626635|Massive protester...|
|1384063860852678657|Just posted a pho...|
|1384063582938140673|@Ibraheema_Ykb Ok...|
|1384063508652851200|@mintea AND KIDS....|
|1384063029076127747|Just posted a pho...|
|1384062369492389892|@SheaSonia I am s...|
|1384062319781502977|In MyinGyan, terr...|
+-------------------+--------------------+
only showing top 10 rows



In [110]:
# Create a tokenizer that Filter away tokens with length < 3, and get rid of symbols like $,#,...
tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(3).setInputCol("text").setOutputCol("tokens")

In [111]:
# Tokenize tweets
tokenized_tweets = tokenizer.transform(raw_tweets)
remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

In [112]:
tokenized_tweets.show()

+-------------------+--------------------+--------------------+
|                 id|                text|              tokens|
+-------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|
|1384064393017626635|Massive protester...|[massive, protest...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|
|1384063508652851200|@mintea AND KIDS....|[mintea, and, kid...|
|1384063029076127747|Just posted a pho...|[just, posted, ph...|
|1384062369492389892|@SheaSonia I am s...|[sheasonia, heart...|
|1384062319781502977|In MyinGyan, terr...|[myingyan, terror...|
|1384062091493994496|In Myingyan, Mand...|[myingyan, mandal...|
|1384062077375971333|        Exhausted 😣|         [exhausted]|
|1384061831099019267|@HaroldYeddo Love...

In [113]:
# remove stopwords
cleaned_tweets = remover.transform(tokenized_tweets)

In [114]:
cleaned_tweets.show()

+-------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|             cleaned|
+-------------------+--------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|[hhjflrts, sorry,...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|[yogezlor, know, ...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|[forgot, wire, ta...|
|1384064393017626635|Massive protester...|[massive, protest...|[massive, protest...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|[ibraheema, ykb, ...|
|1384063508652851200|@mintea AND KIDS....|[mintea, and, kid...|[mintea, kids, da...|
|1384063029076127747|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|
|1384062369492389892|@SheaSonia I am s...|[sheasonia, heart...|[s

In [115]:
# create a vector of words that at least appeared in two different tweets, and set maximum vocab size to 20000.
vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol("features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
wordVectors = vectorizer.transform(cleaned_tweets)#.select("id", "features")

In [116]:
wordVectors.show()

+-------------------+--------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|             cleaned|            features|
+-------------------+--------------------+--------------------+--------------------+--------------------+
|1384064534600507392|@hhjflrts im sorr...|[hhjflrts, sorry,...|[hhjflrts, sorry,...|(20000,[88,337,33...|
|1384064515759693830|@yogezlor By now ...|[yogezlor, now, y...|[yogezlor, know, ...|(20000,[16,29,240...|
|1384064394011709446|Forgot about the ...|[forgot, about, t...|[forgot, wire, ta...|(20000,[0,1,505,6...|
|1384064393017626635|Massive protester...|[massive, protest...|[massive, protest...|(20000,[0,2,7,21,...|
|1384063860852678657|Just posted a pho...|[just, posted, ph...|[posted, photo, s...|(20000,[0,1,3,4,5...|
|1384063582938140673|@Ibraheema_Ykb Ok...|[ibraheema, ykb, ...|[ibraheema, ykb, ...|(20000,[28,71,98,...|
|1384063508652851200|@mintea AND KIDS....|[min

In [117]:
# LDA
# create Latent Dirichlet Allocation model and run it on our data with 10 iteration and 10 topics
lda = LDA(k=10, maxIter=25)

In [118]:
# fit the model on data
ldaModel = lda.fit(wordVectors)

In [119]:
ll = ldaModel.logLikelihood(wordVectors)
lp = ldaModel.logPerplexity(wordVectors)

In [120]:
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -3586050.9115604633
The upper bound on perplexity: 8.542345256303555


In [121]:
# extract vocabulary from CountVectorizer
vocab = vectorizer.vocabulary

In [122]:
# create topics based on LDA
lda_topics = ldaModel.describeTopics()
lda_topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[0, 2, 7, 12, 10,...|[0.03993602920883...|
|    1|[5, 6, 19, 57, 37...|[0.05888047890597...|
|    2|[0, 1, 3, 4, 26, ...|[0.10644432705193...|
|    3|[0, 1833, 371, 61...|[0.00241325988843...|
|    4|[1325, 0, 2262, 2...|[0.00198421929755...|
|    5|[580, 2337, 2639,...|[0.00236932821090...|
|    6|[0, 1, 23, 53, 59...|[0.03145577360718...|
|    7|[0, 2, 93, 127, 6...|[0.03963668235757...|
|    8|[0, 2116, 3842, 2...|[0.00210322691202...|
|    9|[0, 2518, 758, 23...|[0.00291004701849...|
+-----+--------------------+--------------------+



In [123]:
lda_topics.select('termIndices').show(10,False)

+-----------------------------------------------------------+
|termIndices                                                |
+-----------------------------------------------------------+
|[0, 2, 7, 12, 10, 15, 14, 9, 8, 22]                        |
|[5, 6, 19, 57, 37, 0, 299, 48, 87, 18]                     |
|[0, 1, 3, 4, 26, 13, 11, 9, 16, 51]                        |
|[0, 1833, 371, 619, 2187, 5, 357, 6, 3522, 3469]           |
|[1325, 0, 2262, 2062, 1010, 68, 2735, 3419, 1717, 26]      |
|[580, 2337, 2639, 2683, 2680, 2675, 2700, 2931, 2897, 2842]|
|[0, 1, 23, 53, 59, 104, 18, 41, 99, 160]                   |
|[0, 2, 93, 127, 64, 14, 106, 130, 132, 168]                |
|[0, 2116, 3842, 2547, 2764, 2731, 2936, 2757, 2852, 2728]  |
|[0, 2518, 758, 2345, 1378, 1, 2013, 13, 4501, 3368]        |
+-----------------------------------------------------------+



In [124]:
topics_rdd = lda_topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
https
whatshappeninginmyanmar
military
terrorists
people
good
junta
amp
like
abducted
*************************
topic: 1
*************************
edward
barber
edwardbarber
mayward
life
https
maymayentrata07
best
live
love
*************************
topic: 2
*************************
https
singapore
posted
photo
thank
day
one
amp
time
part
*************************
topic: 3
*************************
https
robson
official
design
pyc
edward
women
barber
sgd1
carbonfiber
*************************
topic: 4
*************************
goodnight
https
nope
accelerateart
indeed
thanks
parishilton
cryptovoxels
tanhuiyi
thank
*************************
topic: 5
*************************
sat
adnlivefortoday
kharel
solidadntrends
hotxander01
barbarapunzala1
rave
marlenesalsona
randomgames
tess
*************************
topic: 6
*************************
https
singapore
free
zerowaste
bts
twt
love
really
lol
fucking
*************************
topic: 7
***************