In [1]:
# https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark
import findspark
findspark.init()
import pyspark
import os
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import json

In [2]:
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext = SQLContext(sc)

In [4]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [5]:
# load json object testData
with open("data/testData.json", encoding="utf-8") as f:
    d = json.load(f)

In [6]:
# Creates a dataframe; each row is a tweet, each column is a tweet attribute + tweet ID
pdf = json_normalize(data=d['tweets'], record_path='tweet',
                            meta=['tweet_id'])
pdf.head(5)

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 Yup!,Marilyn Edmonds,1060187580392628234
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]","@RVAwonk There’s some really, really great new...",Sue,1060187580145303552
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@Pix_Stixx I love you 💙,James Thomas,1060187580724113414
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",Today my Friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee That doesn’t change the facts that ...,tanner,1060187579885281282


In [7]:
#_____________Pandas Text Cleaning__________________
# Convert to lowercase
pdf['tweet_text'] = pdf['tweet_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
pdf['tweet_text'].head()

0                    @cosmic_coolness @hardhouz13 yup!
1    @rvawonk there’s some really, really great new...
2                              @pix_stixx i love you 💙
3                                   today my friday 💪🏾
4    @biglehsee that doesn’t change the facts that ...
Name: tweet_text, dtype: object

In [8]:
rgx = '[.,]'  # remove , or .
pdf['tweet_text'] = pdf['tweet_text'].str.replace(rgx, '')
pdf['tweet_text'].head()

0                    @cosmic_coolness @hardhouz13 yup!
1    @rvawonk there’s some really really great news...
2                              @pix_stixx i love you 💙
3                                   today my friday 💪🏾
4    @biglehsee that doesn’t change the facts that ...
Name: tweet_text, dtype: object

In [9]:
# Create count numbers of hastags used in each tweet.
pdf['hastags'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
pdf[['tweet_text','hastags']].head()
pdf['hastags'].max()
pdf["hastags"].sum()

862

In [10]:
# Filter by keyword, we can have it search many keywords such as [roadblocking, landslide, poweroutage...]
#Which ever we find most suitable.
keyword = "help"
pdf['keyword'] = pdf['tweet_text'].apply(lambda x: len([x for x in x.split() if x.startswith(keyword)]))
pdf[['keyword']].max()

keyword    3
dtype: int64

In [11]:
pdf["lat"] = pdf["tweet_location"].apply(lambda x: x[0])
pdf["long"] = pdf["tweet_location"].apply(lambda x: x[1])
pdf

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id,hastags,keyword,lat,long
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 yup!,Marilyn Edmonds,1060187580392628234,0,0,-116.989920,33.729378
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]",@rvawonk there’s some really really great news...,Sue,1060187580145303552,0,0,-74.724323,40.073041
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@pix_stixx i love you 💙,James Thomas,1060187580724113414,0,0,-96.476539,32.092979
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",today my friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976,0,0,-96.982788,32.958255
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee that doesn’t change the facts that ...,tanner,1060187579885281282,0,0,-87.533406,33.196601
5,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",anyone else hope jeremy pruitt leaves ut after...,Hunter Letson,1060187581558767617,0,0,-87.533406,33.196601
6,2018-11-07 15:09:51,"[-87.263022, 33.8583035]",@_queenalexiaa true i respect how you feel but...,LLRIO💯,1060187581294526464,0,1,-87.263022,33.858303
7,2018-11-07 15:09:51,"[-93.2620465, 44.9706105]",happy wednesday! we’re kicking off our morning...,Denamico,1060187581487419394,2,0,-93.262046,44.970610
8,2018-11-07 15:09:51,"[-73.022108, 41.5665785]",busy busy day,Ytcher Marte 🇩🇴,1060187582171172864,0,0,-73.022108,41.566578
9,2018-11-07 15:09:51,"[-121.2431435, 38.6495685]",@shoq @andrewgillum @berniesanders true!!!,Patricia Buchanan,1060187582133202944,0,0,-121.243144,38.649569


In [12]:
keywordDf = pdf[(pdf['keyword'] >= 1)]
keywordDf.head()

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id,hastags,keyword,lat,long
6,2018-11-07 15:09:51,"[-87.263022, 33.8583035]",@_queenalexiaa true i respect how you feel but...,LLRIO💯,1060187581294526464,0,1,-87.263022,33.858303
72,2018-11-07 15:09:55,"[-122.299, 38.3048]",trainers brian and melissa gruber with the hel...,ProEquest,1060187599032193026,0,1,-122.299,38.3048
287,2018-11-07 15:10:08,"[-97.74229199999999, 35.5006305]",@niallofficial niall sweetie im so sorry but i...,ry loves luke,1060187653310742529,0,1,-97.742292,35.50063
318,2018-11-07 15:10:10,"[-92.342322, 34.721886]",@briligerent @k_liakos not surprising low info...,evidence based tweeting,1060187661233741827,0,1,-92.342322,34.721886
386,2018-11-07 15:10:14,"[-112.4249755, 33.4128725]",@realpolitidiva i’m still dumbfounded over #sp...,“Big Wave Dave” Lutter,1060187678086295552,1,1,-112.424976,33.412872


In [13]:
#from pyspark.sql.types import *
#mySchema = StructType([StructField("lat", FloatType(), True),
                      #StructField("long", FloatType(), True)])

In [14]:
# Create a Spark DataFrame from a Pandas DataFrame using Arrow
#df = spark.createDataFrame(pdf)
#df = spark.createDataFrame(keywordDf, schema = mySchema)
df = spark.createDataFrame(pdf)

In [15]:
df.collect()

[Row(tweet_date='2018-11-07 15:09:51', tweet_location=[-116.9899205, 33.729378499999996], tweet_text='@cosmic_coolness @hardhouz13 yup!', tweet_user_name='Marilyn Edmonds', tweet_id=1060187580392628234, hastags=0, keyword=0, lat=-116.9899205, long=33.729378499999996),
 Row(tweet_date='2018-11-07 15:09:51', tweet_location=[-74.7243235, 40.073040500000005], tweet_text='@rvawonk there’s some really really great news! as we lick our wounds cause of our high expectations we’ll find many silver linings', tweet_user_name='Sue', tweet_id=1060187580145303552, hastags=0, keyword=0, lat=-74.7243235, long=40.073040500000005),
 Row(tweet_date='2018-11-07 15:09:51', tweet_location=[-96.4765385, 32.092979], tweet_text='@pix_stixx i love you 💙', tweet_user_name='James Thomas', tweet_id=1060187580724113414, hastags=0, keyword=0, lat=-96.4765385, long=32.092979),
 Row(tweet_date='2018-11-07 15:09:51', tweet_location=[-96.9827885, 32.958255], tweet_text='today my friday 💪🏾', tweet_user_name='Robert Jacks

In [16]:
df.select("tweet_id","lat","long","tweet_text").show()

+-------------------+------------------+------------------+--------------------+
|           tweet_id|               lat|              long|          tweet_text|
+-------------------+------------------+------------------+--------------------+
|1060187580392628234|      -116.9899205|33.729378499999996|@cosmic_coolness ...|
|1060187580145303552|       -74.7243235|40.073040500000005|@rvawonk there’s ...|
|1060187580724113414|       -96.4765385|         32.092979|@pix_stixx i love...|
|1060187581109886976|       -96.9827885|         32.958255|today my friday 💪🏾|
|1060187579885281282|       -87.5334065|        33.1966005|@biglehsee that d...|
|1060187581558767617|       -87.5334065|        33.1966005|anyone else hope ...|
|1060187581294526464|        -87.263022|        33.8583035|@_queenalexiaa tr...|
|1060187581487419394|       -93.2620465|        44.9706105|happy wednesday! ...|
|1060187582171172864|        -73.022108|        41.5665785|       busy busy day|
|1060187582133202944|      -12

In [17]:
# https://stackoverflow.com/questions/47585723/kmeans-clustering-in-pyspark
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["lat", "long"], outputCol="features")
new_df = vecAssembler.transform(df)
new_df.select("tweet_id","features","tweet_text").show()

+-------------------+--------------------+--------------------+
|           tweet_id|            features|          tweet_text|
+-------------------+--------------------+--------------------+
|1060187580392628234|[-116.9899205,33....|@cosmic_coolness ...|
|1060187580145303552|[-74.7243235,40.0...|@rvawonk there’s ...|
|1060187580724113414|[-96.4765385,32.0...|@pix_stixx i love...|
|1060187581109886976|[-96.9827885,32.9...|today my friday 💪🏾|
|1060187579885281282|[-87.5334065,33.1...|@biglehsee that d...|
|1060187581558767617|[-87.5334065,33.1...|anyone else hope ...|
|1060187581294526464|[-87.263022,33.85...|@_queenalexiaa tr...|
|1060187581487419394|[-93.2620465,44.9...|happy wednesday! ...|
|1060187582171172864|[-73.022108,41.56...|       busy busy day|
|1060187582133202944|[-121.2431435,38....|@shoq @andrewgill...|
|1060187579679674368|[-75.117998,40.00...|looking forward t...|
|1060187582884126721|[-84.433106,33.76...|fell asleep while...|
|1060187583035125760|[-100.0768885,31....|

In [18]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=16, seed=1) # 16 clusters
model = kmeans.fit(new_df.select('features'))

In [19]:
transformed = model.transform(new_df)
transformed.select("tweet_id","prediction","lat","long","tweet_text").show()

+-------------------+----------+------------------+------------------+--------------------+
|           tweet_id|prediction|               lat|              long|          tweet_text|
+-------------------+----------+------------------+------------------+--------------------+
|1060187580392628234|         1|      -116.9899205|33.729378499999996|@cosmic_coolness ...|
|1060187580145303552|        12|       -74.7243235|40.073040500000005|@rvawonk there’s ...|
|1060187580724113414|         4|       -96.4765385|         32.092979|@pix_stixx i love...|
|1060187581109886976|         4|       -96.9827885|         32.958255|today my friday 💪🏾|
|1060187579885281282|         5|       -87.5334065|        33.1966005|@biglehsee that d...|
|1060187581558767617|         5|       -87.5334065|        33.1966005|anyone else hope ...|
|1060187581294526464|         5|        -87.263022|        33.8583035|@_queenalexiaa tr...|
|1060187581487419394|        11|       -93.2620465|        44.9706105|happy wednes

In [20]:
# Shows the cluster centers. ("Centroids")
centers = model.clusterCenters()
print("Cluster Centers: ")
for idx,center in enumerate(centers):
    print(idx,center)

Cluster Centers: 
0 [-70.87457875  43.1822945 ]
1 [-118.95252985   35.5040286 ]
2 [-85.99685764  43.55569475]
3 [-121.7210247    46.68187553]
4 [-98.26986969  32.65184136]
5 [-84.79721901  33.96286556]
6 [-78.76496362  43.47945378]
7 [-110.65512961   33.33003999]
8 [-91.00321843  33.24441088]
9 [-107.64779006   40.08427934]
10 [-77.72716458  39.13556871]
11 [-89.05332797  42.00280415]
12 [-74.24360651  40.83931585]
13 [-80.32862883  35.12568573]
14 [-83.79148509  40.59121396]
15 [-95.23324171  39.34176664]


In [21]:
transformed.printSchema()

root
 |-- tweet_date: string (nullable = true)
 |-- tweet_location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tweet_text: string (nullable = true)
 |-- tweet_user_name: string (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- hastags: long (nullable = true)
 |-- keyword: long (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [22]:
import pyspark.sql.functions as F
documents = transformed.groupBy('prediction')\
  .agg(F.collect_list('tweet_text').alias("cluster_text")).orderBy("prediction")

In [23]:
documents.show()

+----------+--------------------+
|prediction|        cluster_text|
+----------+--------------------+
|         0|[excited to be on...|
|         1|[@cosmic_coolness...|
|         2|[off to kingfishe...|
|         3|[@rosievideo_ @nu...|
|         4|[@pix_stixx i lov...|
|         5|[@biglehsee that ...|
|         6|[@rasnick_kevin y...|
|         7|[@jackypadilla_ w...|
|         8|[💯bro💪🏾, a sto...|
|         9|[nice https://tco...|
|        10|[@kaossblade gooo...|
|        11|[happy wednesday!...|
|        12|[@rvawonk there’s...|
|        13|[@colegallagherrr...|
|        14|[@john_siracusa s...|
|        15|[i love the atlan...|
+----------+--------------------+



In [24]:
documents = documents.withColumn('cluster_text', F.concat_ws(',', 'cluster_text'))
documents.select("prediction","cluster_text").head(1)

[Row(prediction=0, cluster_text='excited to be on @boston25 news this morning with continuing coverage of the #spatchallenge @uofnh @unhinnovation @cityofdovernh @newhampshiredot #v2i #dsrc #sebagotechnics #connectedvehicles #smartcities https://tco/hmunch2alp,not enough espresso today,@realdonaldtrump https://tco/j5ewhxtlow,@maxiglb this means so so so much to me thank you for your love and support 😭💛,fière de faire partie de cette équipe audacieuse &amp; dynamique @projetmontreal &amp; surtout de représenter mes concitoyennes de #tétreaultville &amp; #mhm #polmtl #1an 🙌 https://tco/gjnefx04zt,https://tco/wsmsgwjjyn #signs #decalsticker #decals #banners #yahboiniiice @ yahboiniiice custom clothing store https://tco/ynd4dpun0s,the wisdom that comes from my young yoga students never ceases to impress me ❤️ #letotherpeopleseeyouforwhoyouare #notetoself #positiveselftalk #compassion #selflove #yogainschools… https://tco/ioquhev4he,6 foot 7 foot  john  how to love  nightmares of the bottom

In [25]:
#Tokenize
from pyspark.ml.feature import HashingTF, CountVectorizer , IDF, Tokenizer
tokenizer = Tokenizer(inputCol="cluster_text", outputCol="words")
tokenizer.getOutputCol()
wordsData = tokenizer.transform(documents)

In [26]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

In [27]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [28]:
rescaledData.select("prediction", "features").first()

Row(prediction=0, features=SparseVector(20, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0}))

In [29]:
rescaledData.printSchema()

root
 |-- prediction: integer (nullable = false)
 |-- cluster_text: string (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
rescaledData.select("words").head(1)

[Row(words=['excited', 'to', 'be', 'on', '@boston25', 'news', 'this', 'morning', 'with', 'continuing', 'coverage', 'of', 'the', '#spatchallenge', '@uofnh', '@unhinnovation', '@cityofdovernh', '@newhampshiredot', '#v2i', '#dsrc', '#sebagotechnics', '#connectedvehicles', '#smartcities', 'https://tco/hmunch2alp,not', 'enough', 'espresso', 'today,@realdonaldtrump', 'https://tco/j5ewhxtlow,@maxiglb', 'this', 'means', 'so', 'so', 'so', 'much', 'to', 'me', 'thank', 'you', 'for', 'your', 'love', 'and', 'support', '😭💛,fière', 'de', 'faire', 'partie', 'de', 'cette', 'équipe', 'audacieuse', '&amp;', 'dynamique', '@projetmontreal', '&amp;', 'surtout', 'de', 'représenter', 'mes', 'concitoyennes', 'de', '#tétreaultville', '&amp;', '#mhm', '#polmtl', '#1an', '🙌', 'https://tco/gjnefx04zt,https://tco/wsmsgwjjyn', '#signs', '#decalsticker', '#decals', '#banners', '#yahboiniiice', '@', 'yahboiniiice', 'custom', 'clothing', 'store', 'https://tco/ynd4dpun0s,the', 'wisdom', 'that', 'comes', 'from', 'my', 'y

In [31]:
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = transformed.select("*").toPandas()

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [32]:
result_pdf.head(5)

Unnamed: 0,tweet_date,tweet_location,tweet_text,tweet_user_name,tweet_id,hastags,keyword,lat,long,features,prediction
0,2018-11-07 15:09:51,"[-116.9899205, 33.729378499999996]",@cosmic_coolness @hardhouz13 yup!,Marilyn Edmonds,1060187580392628234,0,0,-116.98992,33.729378,"[-116.9899205, 33.729378499999996]",1
1,2018-11-07 15:09:51,"[-74.7243235, 40.073040500000005]",@rvawonk there’s some really really great news...,Sue,1060187580145303552,0,0,-74.724323,40.073041,"[-74.7243235, 40.073040500000005]",12
2,2018-11-07 15:09:51,"[-96.4765385, 32.092979]",@pix_stixx i love you 💙,James Thomas,1060187580724113414,0,0,-96.476539,32.092979,"[-96.4765385, 32.092979]",4
3,2018-11-07 15:09:51,"[-96.9827885, 32.958255]",today my friday 💪🏾,Robert Jackson 🙏🏾,1060187581109886976,0,0,-96.982788,32.958255,"[-96.9827885, 32.958255]",4
4,2018-11-07 15:09:51,"[-87.5334065, 33.1966005]",@biglehsee that doesn’t change the facts that ...,tanner,1060187579885281282,0,0,-87.533406,33.196601,"[-87.5334065, 33.1966005]",5


In [33]:
# Write pandas dataframe to jsone file
with open('data/testDataCluster.json', 'w') as f:
    f.write(result_pdf.to_json(orient='records'))

In [35]:
#sc.stop()