In [2]:
from pyspark import *
from pyspark.sql import SQLContext
import json

In [3]:
# if true Enable Arrow-based columnar data transfers for speed.
spark.conf.set("spark.sql.execution.arrow.enabled", "false")

In [4]:
# create Spark context and sql context.
#sc = SparkContext()
sqlContext = SQLContext(sc)

In [5]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [6]:
# load json object testData, 
#with open("data/testData.json", encoding="utf-8") as f:
#    d = json.load(f)

In [7]:
# load this json file if you ran TwitterStream.py first. 
with open("data/json_data.json", encoding="utf-8") as f:
    d = json.load(f)

In [8]:
# Creates a dataframe; each row is a tweet, each column is a tweet attribute + tweet ID
pdf = json_normalize(data=d['tweets'])
pdf.head(5)

Unnamed: 0,tweet_date,tweet_id,tweet_location,tweet_text,tweet_user_name
0,2018-11-26 19:18:34,1067135541336915968,"[-118.36964499999999, 34.0870095]",New work w/ @10magazine_ @SoKothecat in @gucc...,Villani Productions
1,2018-11-26 19:18:35,1067135546227486721,"[-118.129042, 33.8880815]",Scan my #kikcode to chat with me. My username ...,Bethany Clark
2,2018-11-26 19:18:36,1067135552741199873,"[-120.4357191, 34.9530337]","Want to work in #SantaMaria, CA? View our late...",California S-Chain
3,2018-11-26 19:18:37,1067135553613623296,"[-120.47181, 37.316222499999995]",@VeraAdxer_Art @Writer_DG @SamHeughan @caitrio...,🦃Mary E. Anderson🦃
4,2018-11-26 19:18:37,1067135553471041536,"[-119.1778649, 34.2404545]","Through fashion, we inspire women to feel good...",SantaBarbara Sales


In [9]:
#_____________Pandas Text Cleaning__________________
# Convert to lowercase
pdf['tweet_text'] = pdf['tweet_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
pdf['tweet_text'].head()

0    new work w/ @10magazine_ @sokothecat in @gucci...
1    scan my #kikcode to chat with me. my username ...
2    want to work in #santamaria, ca? view our late...
3    @veraadxer_art @writer_dg @samheughan @caitrio...
4    through fashion, we inspire women to feel good...
Name: tweet_text, dtype: object

In [10]:
rgx = '[.,]'  # remove , or .
pdf['tweet_text'] = pdf['tweet_text'].str.replace(rgx, '')
pdf['tweet_text'].head()

0    new work w/ @10magazine_ @sokothecat in @gucci...
1    scan my #kikcode to chat with me my username i...
2    want to work in #santamaria ca? view our lates...
3    @veraadxer_art @writer_dg @samheughan @caitrio...
4    through fashion we inspire women to feel good ...
Name: tweet_text, dtype: object

In [11]:
# Create count numbers of hastags used in each tweet.
pdf['hastags'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
pdf[['tweet_text','hastags']].head()
pdf['hastags'].max()
pdf["hastags"].sum()

980

In [12]:
# Filter by keyword, we can have it search many keywords such as [roadblocking, landslide, poweroutage...]
#Which ever we find most suitable.
keyword = "help"
pdf['keyword'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith(keyword)]))
pdf[['keyword']].max()

keyword    2
dtype: int64

In [13]:
# Split latitude and longitude.
pdf["lat"] = pdf["tweet_location"].apply(lambda x: x[0])
pdf["long"] = pdf["tweet_location"].apply(lambda x: x[1])
pdf

Unnamed: 0,tweet_date,tweet_id,tweet_location,tweet_text,tweet_user_name,hastags,keyword,lat,long
0,2018-11-26 19:18:34,1067135541336915968,"[-118.36964499999999, 34.0870095]",new work w/ @10magazine_ @sokothecat in @gucci...,Villani Productions,15,0,-118.369645,34.087010
1,2018-11-26 19:18:35,1067135546227486721,"[-118.129042, 33.8880815]",scan my #kikcode to chat with me my username i...,Bethany Clark,7,0,-118.129042,33.888081
2,2018-11-26 19:18:36,1067135552741199873,"[-120.4357191, 34.9530337]",want to work in #santamaria ca? view our lates...,California S-Chain,6,0,-120.435719,34.953034
3,2018-11-26 19:18:37,1067135553613623296,"[-120.47181, 37.316222499999995]",@veraadxer_art @writer_dg @samheughan @caitrio...,🦃Mary E. Anderson🦃,0,0,-120.471810,37.316222
4,2018-11-26 19:18:37,1067135553471041536,"[-119.1778649, 34.2404545]",through fashion we inspire women to feel good ...,SantaBarbara Sales,6,0,-119.177865,34.240454
5,2018-11-26 19:18:37,1067135553697570816,"[-118.2436849, 34.0522342]",we're committed to maintaining benefit program...,TMJ-LAX IT PM Jobs,3,1,-118.243685,34.052234
6,2018-11-26 19:18:37,1067135554116968448,"[-116.9880225, 32.85118]",i need to go to disneyland,crys🕸🕷,0,0,-116.988022,32.851180
7,2018-11-26 19:18:37,1067135554880327680,"[-120.713369, 35.5626375]",cyber monday shopping hard right now lol,Vanessa,0,0,-120.713369,35.562638
8,2018-11-26 19:18:37,1067135554477731840,"[-121.4024847, 38.5724111]",can you recommend anyone for this #job? busine...,TMJ-SAC Finance Jobs,5,0,-121.402485,38.572411
9,2018-11-26 19:18:37,1067135555140374529,"[-122.1802812, 37.3813444]",want to work in #paloalto ca? view our latest ...,TMJ-CAP Health Jobs,8,0,-122.180281,37.381344


In [14]:
# Remove tweet that does not cointain any of the keywords.
#keywordDf = pdf[(pdf['keyword'] >= 1)]
#keywordDf.head()

In [15]:
# Create a Spark DataFrame from a Pandas DataFrame
df = sqlContext.createDataFrame(pdf)

In [16]:
df

DataFrame[tweet_date: string, tweet_id: bigint, tweet_location: array<double>, tweet_text: string, tweet_user_name: string, hastags: bigint, keyword: bigint, lat: double, long: double]

In [17]:
df.select("tweet_id","tweet_date","lat","long","tweet_text").show()

+-------------------+-------------------+-------------------+------------------+--------------------+
|           tweet_id|         tweet_date|                lat|              long|          tweet_text|
+-------------------+-------------------+-------------------+------------------+--------------------+
|1067135541336915968|2018-11-26 19:18:34|-118.36964499999999|        34.0870095|new work w/ @10ma...|
|1067135546227486721|2018-11-26 19:18:35|        -118.129042|        33.8880815|scan my #kikcode ...|
|1067135552741199873|2018-11-26 19:18:36|       -120.4357191|        34.9530337|want to work in #...|
|1067135553613623296|2018-11-26 19:18:37|         -120.47181|37.316222499999995|@veraadxer_art @w...|
|1067135553471041536|2018-11-26 19:18:37|       -119.1778649|        34.2404545|through fashion w...|
|1067135553697570816|2018-11-26 19:18:37|       -118.2436849|        34.0522342|we're committed t...|
|1067135554116968448|2018-11-26 19:18:37|       -116.9880225|          32.85118|i 

In [18]:
# use latitude and longitide as x and y features with k-means
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["lat", "long"], outputCol="features")
new_df = vecAssembler.transform(df)
new_df.select("tweet_id","features","tweet_text").show()

+-------------------+--------------------+--------------------+
|           tweet_id|            features|          tweet_text|
+-------------------+--------------------+--------------------+
|1067135541336915968|[-118.36964499999...|new work w/ @10ma...|
|1067135546227486721|[-118.129042,33.8...|scan my #kikcode ...|
|1067135552741199873|[-120.4357191,34....|want to work in #...|
|1067135553613623296|[-120.47181,37.31...|@veraadxer_art @w...|
|1067135553471041536|[-119.1778649,34....|through fashion w...|
|1067135553697570816|[-118.2436849,34....|we're committed t...|
|1067135554116968448|[-116.9880225,32....|i need to go to d...|
|1067135554880327680|[-120.713369,35.5...|cyber monday shop...|
|1067135554477731840|[-121.4024847,38....|can you recommend...|
|1067135555140374529|[-122.1802812,37....|want to work in #...|
|1067135555140374528|[-122.4359785,37....|😂 mike is me and...|
|1067135555614330880|[-115.135165,36.0...|@kfosterstomberg ...|
|1067135557333995521|[-119.2289915,34....

In [19]:
# run K-means over longitude and latitude, with k amount of target clusters.
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=16, seed=1) # 16 clusters
model = kmeans.fit(new_df.select('features'))

In [20]:
# show tweets with their associated cluster("prediction") 
transformed = model.transform(new_df)
transformed.select("tweet_id","prediction","lat","long","tweet_text").show()

+-------------------+----------+-------------------+------------------+--------------------+
|           tweet_id|prediction|                lat|              long|          tweet_text|
+-------------------+----------+-------------------+------------------+--------------------+
|1067135541336915968|         1|-118.36964499999999|        34.0870095|new work w/ @10ma...|
|1067135546227486721|         1|        -118.129042|        33.8880815|scan my #kikcode ...|
|1067135552741199873|         5|       -120.4357191|        34.9530337|want to work in #...|
|1067135553613623296|        13|         -120.47181|37.316222499999995|@veraadxer_art @w...|
|1067135553471041536|         1|       -119.1778649|        34.2404545|through fashion w...|
|1067135553697570816|         1|       -118.2436849|        34.0522342|we're committed t...|
|1067135554116968448|         6|       -116.9880225|          32.85118|i need to go to d...|
|1067135554880327680|         5|        -120.713369|        35.5626375

In [21]:
# Shows the cluster centers. ("Centroids")
centers = model.clusterCenters()
print("Cluster Centers: ")
for idx,center in enumerate(centers):
    print(idx,center)

Cluster Centers: 
0 [-122.31259788   37.86336503]
1 [-118.42236205   34.07961311]
2 [-115.12102799   36.09401648]
3 [-114.1432625   45.4945515]
4 [-120.5834015   44.1454365]
5 [-119.53903689   36.8961365 ]
6 [-116.93831607   33.01456805]
7 [-102.5580375   23.6254185]
8 [-96.3073065  65.8381145]
9 [-117.69539341   33.90665902]
10 [-111.89289145   34.27128046]
11 [-119.69123679   39.40576983]
12 [-121.77650736   39.04607097]
13 [-121.05910909   37.69214509]
14 [-121.94505128   37.26056338]
15 [-116.70582187   39.04277152]


In [22]:
transformed.printSchema()

root
 |-- tweet_date: string (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- tweet_location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tweet_text: string (nullable = true)
 |-- tweet_user_name: string (nullable = true)
 |-- hastags: long (nullable = true)
 |-- keyword: long (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [23]:
# Combines all tweets under its associated k-means cluster class.
import pyspark.sql.functions as F
documents = transformed.groupBy('prediction')\
  .agg(F.collect_list('tweet_text').alias("cluster_text")).orderBy("prediction")

In [24]:
documents.show()

+----------+--------------------+
|prediction|        cluster_text|
+----------+--------------------+
|         0|[😂 mike is me an...|
|         1|[new work w/ @10m...|
|         2|[@kfosterstomberg...|
|         3|[wait till you re...|
|         4|[cutie pie! 👅👍?...|
|         5|[want to work in ...|
|         6|[i need to go to ...|
|         7|[@dodgergeo @theh...|
|         8|[this would most ...|
|         9|[i’m doing a coup...|
|        10|[@jmariec36 me 2 ...|
|        11|[it’s not too lat...|
|        12|[can you recommen...|
|        13|[@veraadxer_art @...|
|        14|[want to work in ...|
|        15|[y’all smell that...|
+----------+--------------------+



In [25]:
documents.select("cluster_text").first()

Row(cluster_text=['😂 mike is me and i am mikeoffice is a no go zone @mikeisaac don’t do it!', "@nerfsqueezer @chrisshermanap that's the legal way to present yourself for asylum at the border on usa soil", '@jefflawson65 @realdonaldtrump as far as i am concerned everything that issues from your mouth scumbag in the white house is a lie❗😠😡😡😡😠😠😠', '@parispittsburgh send complimentary copy to trump', 'awesome! great part is i was fortunate to be on the bench for both of those games!', '“honey it’s after a thanksgiving and the halloween decorations are still out!” “let’s just put some santa hats on them and call it good” @ alameda california https://tco/7d5jq2np9h', 'i’m really hungry and want this to be over soon i’m back inside now and my car is “verified” and they’re going to let me get it smogged later so we’re getting somewhere', 'housemate says i look “hella spiritual today” #onlyinoakland 🤣🤣🤣 going to meet saint amma tonight join us for the atma puja ceremony for world peace at 7pm a

In [26]:
# Combine each list of tweets to a string.
documents = documents.withColumn('cluster_text', F.concat_ws(',', 'cluster_text'))
documents.select("cluster_text").first()

Row(cluster_text="😂 mike is me and i am mikeoffice is a no go zone @mikeisaac don’t do it!,@nerfsqueezer @chrisshermanap that's the legal way to present yourself for asylum at the border on usa soil,@jefflawson65 @realdonaldtrump as far as i am concerned everything that issues from your mouth scumbag in the white house is a lie❗😠😡😡😡😠😠😠,@parispittsburgh send complimentary copy to trump,awesome! great part is i was fortunate to be on the bench for both of those games!,“honey it’s after a thanksgiving and the halloween decorations are still out!” “let’s just put some santa hats on them and call it good” @ alameda california https://tco/7d5jq2np9h,i’m really hungry and want this to be over soon i’m back inside now and my car is “verified” and they’re going to let me get it smogged later so we’re getting somewhere,housemate says i look “hella spiritual today” #onlyinoakland 🤣🤣🤣 going to meet saint amma tonight join us for the atma puja ceremony for world peace at 7pm and then indian dinner 

In [27]:
# Convert the Spark DataFrame back to a Pandas DataFrame.
result_pdf = transformed.select("*").toPandas()

In [28]:
result_pdf.head(5)

Unnamed: 0,tweet_date,tweet_id,tweet_location,tweet_text,tweet_user_name,hastags,keyword,lat,long,features,prediction
0,2018-11-26 19:18:34,1067135541336915968,"[-118.36964499999999, 34.0870095]",new work w/ @10magazine_ @sokothecat in @gucci...,Villani Productions,15,0,-118.369645,34.08701,"[-118.36964499999999, 34.0870095]",1
1,2018-11-26 19:18:35,1067135546227486721,"[-118.129042, 33.8880815]",scan my #kikcode to chat with me my username i...,Bethany Clark,7,0,-118.129042,33.888081,"[-118.129042, 33.8880815]",1
2,2018-11-26 19:18:36,1067135552741199873,"[-120.4357191, 34.9530337]",want to work in #santamaria ca? view our lates...,California S-Chain,6,0,-120.435719,34.953034,"[-120.4357191, 34.9530337]",5
3,2018-11-26 19:18:37,1067135553613623296,"[-120.47181, 37.316222499999995]",@veraadxer_art @writer_dg @samheughan @caitrio...,🦃Mary E. Anderson🦃,0,0,-120.47181,37.316222,"[-120.47181, 37.316222499999995]",13
4,2018-11-26 19:18:37,1067135553471041536,"[-119.1778649, 34.2404545]",through fashion we inspire women to feel good ...,SantaBarbara Sales,6,0,-119.177865,34.240454,"[-119.1778649, 34.2404545]",1


In [29]:
# Write pandas dataframe to jsone file
with open('data/testDataCluster.json', 'w') as f:
    f.write(result_pdf.to_json(orient='records'))

In [30]:
sc.stop()

In [31]:
# add "var tweets = " to json file output.  
import convert
convert.add_firstline('data/testDataCluster.json',"var tweets = ")