In [None]:
# https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark
import findspark
findspark.init()
import pyspark
import os
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import json

In [None]:
# Create (or reuse) the SparkSession before configuring it. Nothing else in this
# notebook defines `spark`, so on a fresh kernel the conf call below would raise
# NameError. getOrCreate() returns the already-running session when one exists.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Enable Arrow-based columnar data transfers for pandas <-> Spark conversions.
# NOTE(review): on Spark 3.x this key is a deprecated alias of
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm the target Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [None]:
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext = SQLContext(sc)

In [None]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [None]:
# load json object
#with open("data/json_data.json", encoding="utf-8") as f:
#    d = json.load(f)

In [None]:
# Parse the test data set (UTF-8 encoded JSON) into the Python object `d`.
with open("data/testData.json", encoding="utf-8") as json_file:
    d = json.load(json_file)

In [None]:
# Flatten the nested JSON into a pandas DataFrame: the records under each entry's
# 'tweet' key become rows, and the parent entry's 'tweet_id' is carried along as
# an extra column.
tweet_entries = d['tweets']
pdf = json_normalize(tweet_entries, record_path='tweet', meta=['tweet_id'])
pdf.head()

In [None]:
# Display the full flattened tweet frame.
pdf 

In [None]:
#_____________Pandas Text Cleaning__________________
def _lowercase_tokens(text):
    """Lower-case every whitespace-separated token and re-join them with single spaces."""
    return " ".join(token.lower() for token in text.split())


# Normalise case (and, as a side effect of split/join, runs of whitespace).
pdf['tweet_text'] = pdf['tweet_text'].apply(_lowercase_tokens)
pdf['tweet_text'].head()

In [None]:
# Strip periods and commas from the tweet text.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, so the character class '[.,]'
# would be searched for verbatim and no punctuation would be removed.
rgx = '[.,]'  # character class matching a single '.' or ','
pdf['tweet_text'] = pdf['tweet_text'].str.replace(rgx, '', regex=True)
pdf['tweet_text'].head()

In [None]:
# Count the hashtags in each tweet: the number of whitespace-separated tokens
# that start with '#'. The column name 'hastags' is kept as-is.
pdf['hastags'] = pdf['tweet_text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('#'))
)
# Only the final expression of a cell is rendered, so of the three summaries
# below just the total hashtag count is displayed.
pdf[['tweet_text', 'hastags']].head()
pdf['hastags'].max()
pdf["hastags"].sum()

In [None]:
# Count, per tweet, the tokens that begin with `keyword` (a prefix match, so
# "helping" also counts for "help"). This could be extended to several keywords
# such as roadblocking, landslide, poweroutage, ... whichever prove most suitable.
keyword = "help"
pdf['keyword'] = pdf['tweet_text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith(keyword))
)
# Largest number of keyword hits found in a single tweet.
pdf[['keyword']].max()

In [None]:
# Split the location value into two scalar columns.
# NOTE(review): assumes each tweet_location is an indexable (lat, long) pair in
# that order -- confirm against the data source.
locations = pdf["tweet_location"]
pdf["lat"] = locations.apply(lambda pair: pair[0])
pdf["long"] = locations.apply(lambda pair: pair[1])
pdf

In [None]:
# Keep only the tweets containing at least one keyword hit.
has_keyword = pdf['keyword'] >= 1
keywordDf = pdf[has_keyword]
keywordDf.head()

In [None]:
# Coordinates of the keyword-matching tweets, coerced to float.
coordinate_columns = ['lat', 'long']
lkwdf = keywordDf[coordinate_columns].astype(float)
lkwdf

In [None]:
#from pyspark.sql.types import *
#mySchema = StructType([StructField("lat", FloatType(), True),
                      #StructField("long", FloatType(), True)])

In [None]:
# Create a Spark DataFrame from the full pandas DataFrame (Arrow transfer was
# enabled via spark.conf earlier in the notebook).
# Alternative kept for reference: build it from the keyword-filtered frame with
# an explicit schema instead of the full frame.
#df = spark.createDataFrame(keywordDf, schema = mySchema)
df = spark.createDataFrame(pdf)

In [None]:
# Sanity check of the Spark DataFrame contents.
# NOTE(review): collect() materialises every row on the driver and renders them
# all; acceptable for the small test file, consider show()/take(n) on real data.
df.collect()

In [None]:
# Preview just the coordinate columns.
location_df = df.select("lat", "long")
location_df.show()

In [None]:
#Spark k-mean over locations to localize an event area.
from numpy import array
from math import sqrt
from pyspark.mllib.clustering import KMeans, KMeansModel

In [None]:
#locData = df.select("lat","long")
#locData.show()

In [None]:
# https://stackoverflow.com/questions/47585723/kmeans-clustering-in-pyspark
from pyspark.ml.feature import VectorAssembler
# Pack the two coordinate columns into a single vector column named "features",
# the default input column of the Spark ML estimator fitted below.
location_columns = ["lat", "long"]
vecAssembler = VectorAssembler(inputCols=location_columns, outputCol="features")
new_df = vecAssembler.transform(df)
new_df.show()

In [None]:
from pyspark.ml.clustering import KMeans

# Cluster the tweet locations. The cluster count and seed are named so they are
# easy to find and tune; the fixed seed keeps the run reproducible.
N_CLUSTERS = 16
KMEANS_SEED = 1
kmeans = KMeans(k=N_CLUSTERS, seed=KMEANS_SEED)  # 16 clusters here
feature_vectors = new_df.select('features')
model = kmeans.fit(feature_vectors)

In [None]:
# Label every tweet with its cluster index (added as the "prediction" column)
# and preview the columns of interest.
transformed = model.transform(new_df)
preview_columns = ["tweet_id", "prediction", "lat", "long", "tweet_text"]
transformed.select(*preview_columns).show()

In [None]:
# Print each centroid found by the model, prefixed with its cluster index.
centers = model.clusterCenters()
print("Cluster Centers: ")
for idx in range(len(centers)):
    print(idx, centers[idx])

In [None]:
# Show which cluster each tweet's text was assigned to.
cluster_text = transformed.select("prediction", "tweet_text")
cluster_text.show()

In [None]:
# Inspect the column names and types of the clustered DataFrame.
transformed.printSchema()

In [None]:
# TF tf-idf
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import pyspark.sql.functions as F

In [None]:
# Gather the tweet texts of each cluster into one list per cluster, ordered by
# cluster index. The aggregate is left un-aliased, so the resulting column is
# named "collect_list(tweet_text)".
tweets_per_cluster = F.collect_list('tweet_text')
documents = (
    transformed
    .groupBy('prediction')
    .agg(tweets_per_cluster)
    .orderBy("prediction")
)

In [None]:
# Preview the per-cluster tweet lists.
documents.show()

In [None]:
# Look at the tweet list of the first cluster only.
tweet_lists = documents.select("collect_list(tweet_text)")
tweet_lists.head(1)

In [None]:
#Tokenize
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
# Inspect the schema of the grouped frame before flattening it.
# Reference: https://stackoverflow.com/questions/46178325/flatten-nested-struct-in-pyspark-array
documents.printSchema()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
# Flatten each cluster's list of tweets into one space-separated document string.
# The previous comprehension iterated over the *characters* of the column name
# ("c.*", "o.*", ...), which is not a valid selection; in addition the aggregated
# column is produced by collect_list (an array, not a struct), so ".*" star
# expansion does not apply to it. concat_ws joins the array elements directly and
# yields a plain string column.
# NOTE(review): assumes the goal is one text document per cluster for the
# tokenizer / TF-IDF step -- confirm.
flat_df = documents.select(
    "prediction",
    F.concat_ws(" ", "collect_list(tweet_text)").alias("document"),
)

In [None]:
# Bring the clustered Spark DataFrame (all columns) back into pandas.
# NOTE(review): Arrow is enabled via spark.conf, but whether it is actually used
# here depends on every column type being Arrow-compatible -- verify.
result_pdf = transformed.toPandas()

In [None]:
# Preview the first five rows of the pandas result.
result_pdf.head()

In [None]:
#sc.stop()

In [None]:
# Write the pandas DataFrame to a JSON file, one JSON object per row.
with open('data/testDataCluster.json', 'w') as out_file:
    records_json = result_pdf.to_json(orient='records')
    out_file.write(records_json)