In [1]:
from pyspark import *
from pyspark.sql import SQLContext
import json

In [2]:
# if true Enable Arrow-based columnar data transfers for speed.
spark.conf.set("spark.sql.execution.arrow.enabled", "false")

In [3]:
# create Spark context and sql context.
#sc = SparkContext()
sqlContext = SQLContext(sc)

In [4]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [5]:
# load json object testData
with open("data/testData.json", encoding="utf-8") as f:
    d = json.load(f)

In [6]:
# Creates a dataframe; each row is a tweet, each column is a tweet attribute + tweet ID
pdf = json_normalize(data=d['tweets'], record_path='tweet',
                            meta=['tweet_id'])
pdf.head(5)

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 Yup!,Marilyn Edmonds,1060187580392628234
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]","@RVAwonk There’s some really, really great new...",Sue,1060187580145303552
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@Pix_Stixx I love you 💙,James Thomas,1060187580724113414
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",Today my Friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee That doesn’t change the facts that ...,tanner,1060187579885281282


In [7]:
#_____________Pandas Text Cleaning__________________
# Convert to lowercase
pdf['tweet_text'] = pdf['tweet_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
pdf['tweet_text'].head()

0                    @cosmic_coolness @hardhouz13 yup!
1    @rvawonk there’s some really, really great new...
2                              @pix_stixx i love you 💙
3                                   today my friday 💪🏾
4    @biglehsee that doesn’t change the facts that ...
Name: tweet_text, dtype: object

In [8]:
rgx = '[.,]'  # remove , or .
pdf['tweet_text'] = pdf['tweet_text'].str.replace(rgx, '')
pdf['tweet_text'].head()

0                    @cosmic_coolness @hardhouz13 yup!
1    @rvawonk there’s some really really great news...
2                              @pix_stixx i love you 💙
3                                   today my friday 💪🏾
4    @biglehsee that doesn’t change the facts that ...
Name: tweet_text, dtype: object

In [9]:
# Create count numbers of hastags used in each tweet.
pdf['hastags'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
pdf[['tweet_text','hastags']].head()
pdf['hastags'].max()
pdf["hastags"].sum()

862

In [10]:
# Filter by keyword, we can have it search many keywords such as [roadblocking, landslide, poweroutage...]
#Which ever we find most suitable.
keyword = "help"
pdf['keyword'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith(keyword)]))
pdf[['keyword']].max()

keyword    3
dtype: int64

In [11]:
# Split latitude and longitude.
pdf["lat"] = pdf["tweet_location"].apply(lambda x: x[0])
pdf["long"] = pdf["tweet_location"].apply(lambda x: x[1])
pdf

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id,hastags,keyword,lat,long
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 yup!,Marilyn Edmonds,1060187580392628234,0,0,-116.989920,33.729378
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]",@rvawonk there’s some really really great news...,Sue,1060187580145303552,0,0,-74.724323,40.073041
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@pix_stixx i love you 💙,James Thomas,1060187580724113414,0,0,-96.476539,32.092979
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",today my friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976,0,0,-96.982788,32.958255
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee that doesn’t change the facts that ...,tanner,1060187579885281282,0,0,-87.533406,33.196601
5,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",anyone else hope jeremy pruitt leaves ut after...,Hunter Letson,1060187581558767617,0,0,-87.533406,33.196601
6,2018-11-07 15:09:51,"[-87.263022, 33.8583035]",@_queenalexiaa true i respect how you feel but...,LLRIO💯,1060187581294526464,0,1,-87.263022,33.858303
7,2018-11-07 15:09:51,"[-93.2620465, 44.9706105]",happy wednesday! we’re kicking off our morning...,Denamico,1060187581487419394,2,0,-93.262046,44.970610
8,2018-11-07 15:09:51,"[-73.022108, 41.5665785]",busy busy day,Ytcher Marte 🇩🇴,1060187582171172864,0,0,-73.022108,41.566578
9,2018-11-07 15:09:51,"[-121.2431435, 38.6495685]",@shoq @andrewgillum @berniesanders true!!!,Patricia Buchanan,1060187582133202944,0,0,-121.243144,38.649569


In [12]:
# Remove tweet that does not cointain any of the keywords.
#keywordDf = pdf[(pdf['keyword'] >= 1)]
#keywordDf.head()

In [13]:
# Create a Spark DataFrame from a Pandas DataFrame
df = sqlContext.createDataFrame(pdf)

In [14]:
df

DataFrame[tweet_date: string, tweet_location: array<double>, tweet_text: string, tweet_user_name: string, tweet_id: bigint, hastags: bigint, keyword: bigint, lat: double, long: double]

In [15]:
df.select("tweet_id","tweet_date","lat","long","tweet_text").show()

+-------------------+-------------------+------------------+------------------+--------------------+
|           tweet_id|         tweet_date|               lat|              long|          tweet_text|
+-------------------+-------------------+------------------+------------------+--------------------+
|1060187580392628234|2018-11-07 15:09:51|      -116.9899205|33.729378499999996|@cosmic_coolness ...|
|1060187580145303552|2018-11-07 15:09:51|       -74.7243235|40.073040500000005|@rvawonk there’s ...|
|1060187580724113414|2018-11-07 15:09:51|       -96.4765385|         32.092979|@pix_stixx i love...|
|1060187581109886976|2018-11-07 15:09:51|       -96.9827885|         32.958255|today my friday 💪🏾|
|1060187579885281282|2018-11-07 15:09:51|       -87.5334065|        33.1966005|@biglehsee that d...|
|1060187581558767617|2018-11-07 15:09:51|       -87.5334065|        33.1966005|anyone else hope ...|
|1060187581294526464|2018-11-07 15:09:51|        -87.263022|        33.8583035|@_queenalexiaa

In [16]:
# use latitude and longitide as x and y features with k-means
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["lat", "long"], outputCol="features")
new_df = vecAssembler.transform(df)
new_df.select("tweet_id","features","tweet_text").show()

+-------------------+--------------------+--------------------+
|           tweet_id|            features|          tweet_text|
+-------------------+--------------------+--------------------+
|1060187580392628234|[-116.9899205,33....|@cosmic_coolness ...|
|1060187580145303552|[-74.7243235,40.0...|@rvawonk there’s ...|
|1060187580724113414|[-96.4765385,32.0...|@pix_stixx i love...|
|1060187581109886976|[-96.9827885,32.9...|today my friday 💪🏾|
|1060187579885281282|[-87.5334065,33.1...|@biglehsee that d...|
|1060187581558767617|[-87.5334065,33.1...|anyone else hope ...|
|1060187581294526464|[-87.263022,33.85...|@_queenalexiaa tr...|
|1060187581487419394|[-93.2620465,44.9...|happy wednesday! ...|
|1060187582171172864|[-73.022108,41.56...|       busy busy day|
|1060187582133202944|[-121.2431435,38....|@shoq @andrewgill...|
|1060187579679674368|[-75.117998,40.00...|looking forward t...|
|1060187582884126721|[-84.433106,33.76...|fell asleep while...|
|1060187583035125760|[-100.0768885,31....|

In [17]:
# run K-means over longitude and latitude, with k amount of target clusters.
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=16, seed=1) # 16 clusters
model = kmeans.fit(new_df.select('features'))

In [18]:
# show tweets with their associated cluster("prediction") 
transformed = model.transform(new_df)
transformed.select("tweet_id","prediction","lat","long","tweet_text").show()

+-------------------+----------+------------------+------------------+--------------------+
|           tweet_id|prediction|               lat|              long|          tweet_text|
+-------------------+----------+------------------+------------------+--------------------+
|1060187580392628234|         6|      -116.9899205|33.729378499999996|@cosmic_coolness ...|
|1060187580145303552|        11|       -74.7243235|40.073040500000005|@rvawonk there’s ...|
|1060187580724113414|         8|       -96.4765385|         32.092979|@pix_stixx i love...|
|1060187581109886976|         8|       -96.9827885|         32.958255|today my friday 💪🏾|
|1060187579885281282|        12|       -87.5334065|        33.1966005|@biglehsee that d...|
|1060187581558767617|        12|       -87.5334065|        33.1966005|anyone else hope ...|
|1060187581294526464|        12|        -87.263022|        33.8583035|@_queenalexiaa tr...|
|1060187581487419394|         5|       -93.2620465|        44.9706105|happy wednes

In [19]:
# Shows the cluster centers. ("Centroids")
centers = model.clusterCenters()
print("Cluster Centers: ")
for idx,center in enumerate(centers):
    print(idx,center)

Cluster Centers: 
0 [-80.3930815   35.08578494]
1 [-105.37413186   38.992294  ]
2 [-121.55264132   37.81999371]
3 [-70.87457875  43.1822945 ]
4 [-78.76553445  42.57045337]
5 [-88.86159983  42.79483043]
6 [-116.21787624   34.02920732]
7 [-93.22945727  38.08574539]
8 [-96.17656339  33.00276986]
9 [-77.43979503  38.58519806]
10 [-122.23992986   47.08393928]
11 [-74.24344948  40.85643824]
12 [-85.3365662   33.86646726]
13 [-101.60343243   31.68036666]
14 [-84.09548128  40.76566896]
15 [-112.83001803   41.65939656]


In [20]:
transformed.printSchema()

root
 |-- tweet_date: string (nullable = true)
 |-- tweet_location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tweet_text: string (nullable = true)
 |-- tweet_user_name: string (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- hastags: long (nullable = true)
 |-- keyword: long (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [21]:
# Combines all tweets under its associated k-means cluster class.
import pyspark.sql.functions as F
documents = transformed.groupBy('prediction')\
  .agg(F.collect_list('tweet_text').alias("cluster_text")).orderBy("prediction")

In [22]:
documents.show()

+----------+--------------------+
|prediction|        cluster_text|
+----------+--------------------+
|         0|[@colegallagherrr...|
|         1|[nice https://tco...|
|         2|[@shoq @andrewgil...|
|         3|[excited to be on...|
|         4|[@rasnick_kevin y...|
|         5|[happy wednesday!...|
|         6|[@cosmic_coolness...|
|         7|[@betoorourke we ...|
|         8|[@pix_stixx i lov...|
|         9|[@kaossblade gooo...|
|        10|[@rosievideo_ @nu...|
|        11|[@rvawonk there’s...|
|        12|[@biglehsee that ...|
|        13|[girls need love ...|
|        14|[@john_siracusa s...|
|        15|[#talent, no se d...|
+----------+--------------------+



In [23]:
documents.select("cluster_text").first()

Row(cluster_text=['@colegallagherrr i want 5 teams that you think bama has a chance of beating since 2008', 'on the run x young thug &amp; offset 😈', "#democrats still can't believe #hrc lost to #trump now they can't believe with #oprah's support her candidate lost what! how could that have happened! -----i am tired of hollywood expecting everybody to disregard our views and to accept theirs https://tco/4wz29dyo0z", 'we have officially moved!!! i’m super excited to let you know that we moved to our permanent location of body by t fitness in the northlake/ university area of charlotte nc i invite you… https://tco/irbolwswwa', 'the fact that @trinarockstarr x redemption tells everything that i feel 😭 you did that period', "want to work at kelly services? we're #hiring in #millsriver nc! click for details: https://tco/tw6nh3f5n8 #kellyjobs #kellyservices #job #jobs #careerarc", '#justice', '@andrew_messer99 reddish 3rd', '@jessicasquared9 @harrisk1111 @bdckool @always5star @tw2cayc @bill4

In [24]:
# Combine each list of tweets to a string.
documents = documents.withColumn('cluster_text', F.concat_ws(',', 'cluster_text'))
documents.select("cluster_text").first()

Row(cluster_text='@colegallagherrr i want 5 teams that you think bama has a chance of beating since 2008,on the run x young thug &amp; offset 😈,#democrats still can\'t believe #hrc lost to #trump now they can\'t believe with #oprah\'s support her candidate lost what! how could that have happened! -----i am tired of hollywood expecting everybody to disregard our views and to accept theirs https://tco/4wz29dyo0z,we have officially moved!!! i’m super excited to let you know that we moved to our permanent location of body by t fitness in the northlake/ university area of charlotte nc i invite you… https://tco/irbolwswwa,the fact that @trinarockstarr x redemption tells everything that i feel 😭 you did that period,want to work at kelly services? we\'re #hiring in #millsriver nc! click for details: https://tco/tw6nh3f5n8 #kellyjobs #kellyservices #job #jobs #careerarc,#justice,@andrew_messer99 reddish 3rd,@jessicasquared9 @harrisk1111 @bdckool @always5star @tw2cayc @bill44077 @marshallrteague

In [25]:
# Convert the Spark DataFrame back to a Pandas DataFrame.
result_pdf = transformed.select("*").toPandas()

In [26]:
result_pdf.head(5)

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id,hastags,keyword,lat,long,features,prediction
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 yup!,Marilyn Edmonds,1060187580392628234,0,0,-116.98992,33.729378,"[-116.9899205, 33.729378499999996]",6
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]",@rvawonk there’s some really really great news...,Sue,1060187580145303552,0,0,-74.724323,40.073041,"[-74.7243235, 40.073040500000005]",11
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@pix_stixx i love you 💙,James Thomas,1060187580724113414,0,0,-96.476539,32.092979,"[-96.4765385, 32.092979]",8
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",today my friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976,0,0,-96.982788,32.958255,"[-96.9827885, 32.958255]",8
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee that doesn’t change the facts that ...,tanner,1060187579885281282,0,0,-87.533406,33.196601,"[-87.5334065, 33.1966005]",12


In [27]:
# Write pandas dataframe to jsone file
with open('data/testDataCluster.json', 'w') as f:
    f.write(result_pdf.to_json(orient='records'))

In [28]:
sc.stop()

In [29]:
# add "var" to json file output.  
import convert
convert.jjson_convert('data/testDataCluster.json', 'data/testDataClusterVar.json')