In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import nltk
import nltk.data
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA

# Create (or reuse) the Spark session that every later cell works with.
spark = (
    SparkSession.builder
    .appName("Overall Summary Statistic")
    .getOrCreate()
)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1558524031387_0001,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Tab-separated review dump with a header row.
data = "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"
# Read the file and keep only the columns that later cells need.
allData = spark.read.csv(data, sep='\t', header=True).select(
    'customer_id', 'product_id', 'star_rating', 'review_id', 'review_body'
)

# Number of reviews per product, most-reviewed product first.
reviewsProd = allData.groupBy('product_id').agg({'review_id': 'count'})
orderReviewsProdData = reviewsProd.orderBy(-reviewsProd['count(review_id)'])
# Rows of the 10 products with the most reviews.
top10Prod = orderReviewsProdData.select('product_id').head(10)

# Work on the 10th entry of that ranking.
selected_prod = top10Prod[9]['product_id']
productdata = allData.filter(allData.product_id == selected_prod)

# Positive class: rating of 4 or more. Negative class: rating of 2 or less.
positive_data = productdata.filter(productdata.star_rating >= 4)
negative_data = productdata.filter(productdata.star_rating <= 2)

VBox()

In [3]:
# to split the review_body to sentences

# Punkt sentence tokenizer for English, used to split review bodies into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Only review_id and review_body are needed from here on; switch to RDDs.
positive_rdd, negative_rdd = [
    class_df.select('review_id', 'review_body').rdd
    for class_df in (positive_data, negative_data)
]

# to split to sentences and return a pairRdd which key is review_id, value is sentences
def map_sentence(data, sentence_tokenizer=None):
    """Split one review into its sentences.

    Args:
        data: A ``(review_id, review_body)`` pair, indexable by position.
            ``review_body`` may be ``None`` or an empty string.
        sentence_tokenizer: Optional object exposing ``tokenize(text)`` that
            returns a list of sentences. Defaults to the module-level
            ``tokenizer``.

    Returns:
        A ``(review_id, sentences)`` tuple where ``sentences`` is a list of
        strings. The list is empty when the review has no body, so a
        following ``flatMapValues`` emits nothing for that review.
    """
    if sentence_tokenizer is None:
        sentence_tokenizer = tokenizer
    review_id, body = data[0], data[1]
    # A missing body must not be turned into the literal sentence "None".
    if body is None:
        return (review_id, [])
    sentences = sentence_tokenizer.tokenize(str(body))
    # Drop a trailing "sentence" that is only a punctuation mark. The
    # emptiness check matters: an empty body tokenizes to [], and indexing
    # [-1] on it would raise IndexError.
    if sentences and sentences[-1] in ('.', '!', '?'):
        sentences.pop()
    return (review_id, sentences)

# (review_id, review_body) -> (review_id, [sentence, ...]) for both classes.
positive_filt, negative_filt = [
    review_rdd.map(map_sentence) for review_rdd in (positive_rdd, negative_rdd)
]

def split_sentence(data):
    """Return ``data`` unchanged.

    Passed to ``flatMapValues`` below, where returning the sentence list
    as-is lets Spark emit one ``(review_id, sentence)`` pair per list element.
    """
    return data

# Flatten (review_id, [sentences]) into one (review_id, sentence) pair per sentence.
positive_split, negative_split = [
    sentences_rdd.flatMapValues(split_sentence)
    for sentences_rdd in (positive_filt, negative_filt)
]

VBox()

In [4]:
# Convert to DataFrames with two columns: review id and sentence.
# No schema is supplied, so the columns are named _1 and _2.
positive_df = spark.createDataFrame(positive_split)
negative_df = spark.createDataFrame(negative_split)


def _sentence_rdd(sentence_df):
    """Return a cached RDD holding column ``_2`` of ``sentence_df`` as ``str``.

    Nothing is filtered out here. Later cells look up rows of
    ``positive_df`` / ``negative_df`` by the position of their embedding, so
    this RDD has to keep exactly one element per DataFrame row. A previous
    ``filter(lambda data: data is not None)`` was applied after ``str()`` and
    therefore could never remove an element; it has been dropped.
    """
    return sentence_df.select('_2').rdd.map(lambda row: str(row[0])).cache()


# Only the sentence text is needed for the embedding step.
p_result_rdd = _sentence_rdd(positive_df)
n_result_rdd = _sentence_rdd(negative_df)

VBox()

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 44834)
Traceback (most recent call last):
  File "/usr/lib64/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 266, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 254, in authenticate_and_accum_updates
    received_to

In [5]:
def review_embed(rev_text_partition):
    """Embed all sentences of one RDD partition with the Universal Sentence Encoder.

    Meant to be passed to ``RDD.mapPartitions``.

    Args:
        rev_text_partition: Iterable (typically a generator) over the
            sentence strings of one partition.

    Returns:
        The result of running the encoder on the whole partition at once;
        iterating over it yields one embedding per input sentence. An empty
        list is returned for an empty partition.
    """
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
    # mapPartitions hands over a generator, but the encoder wants the whole
    # batch at once, so materialise it first.
    rev_text_list = list(rev_text_partition)
    # Nothing to embed: skip loading the module and starting a session.
    if not rev_text_list:
        return []
    # Build the module in its own graph. A Python worker may process several
    # partitions, and adding a new module to the shared default graph on
    # every call would make that graph grow without bound.
    with tf.Graph().as_default():
        embed = hub.Module(module_url)
        embeddings_op = embed(rev_text_list)
        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            message_embeddings = session.run(embeddings_op)
    return message_embeddings

# Embed every sentence with Google's pre-trained Universal Sentence Encoder.
# Each sentence becomes a 512-dimensional vector.
p_review_embedding, n_review_embedding = [
    sentence_rdd.mapPartitions(review_embed).cache()
    for sentence_rdd in (p_result_rdd, n_result_rdd)
]
# Peek at the first few embeddings of each class.
p_review_embedding.take(5)
n_review_embedding.take(5)

VBox()

[array([ 3.35570201e-02,  1.81063488e-02,  4.37106043e-02,  3.41501972e-03,
       -3.41408588e-02,  3.91174592e-02, -3.38410847e-02,  2.54936870e-02,
        3.34180370e-02,  4.54882793e-02,  3.78673337e-02,  1.42549397e-02,
       -3.55267548e-03,  2.23622248e-02,  3.64716649e-02,  1.26756160e-02,
        7.56692365e-02,  1.78778917e-02,  7.97951594e-03, -4.46611159e-02,
       -1.76746007e-02, -1.86120123e-02,  4.77378555e-02,  2.65137162e-02,
        5.20654805e-02, -2.93165445e-02, -3.43355834e-02, -1.11251976e-02,
        7.89907798e-02, -1.96770802e-02,  8.00571069e-02,  7.67244771e-03,
        8.08431283e-02, -7.04592466e-02,  6.61226362e-02,  7.44588347e-03,
       -6.34521246e-02, -6.18547248e-03, -3.90799828e-02, -1.30949365e-02,
        6.04560189e-02, -3.97875831e-02, -4.21756022e-02,  1.96231045e-02,
       -3.54963131e-02,  1.21802054e-02,  4.88597117e-02, -6.47978485e-02,
        5.88479601e-02, -3.23842391e-02, -4.62767184e-02, -2.91011296e-02,
       -3.87932025e-02, 

In [6]:
# Positive class: one DataFrame column per embedding component, then pack
# all columns into a single "features" vector column.
p_embedding_rows = p_review_embedding.map(lambda embedding: embedding.tolist())
p_review_embedding_df = spark.createDataFrame(p_embedding_rows)
assembler = VectorAssembler(outputCol="features", inputCols=p_review_embedding_df.columns)
p_review_embedding_vectors = assembler.transform(p_review_embedding_df).select("features")

# Negative class: same steps.
n_embedding_rows = n_review_embedding.map(lambda embedding: embedding.tolist())
n_review_embedding_df = spark.createDataFrame(n_embedding_rows)
assembler = VectorAssembler(outputCol="features", inputCols=n_review_embedding_df.columns)
n_review_embedding_vectors = assembler.transform(n_review_embedding_df).select("features")

VBox()

In [7]:
# Reduce the embedding dimension with PCA; a separate model is fitted per class.
PCA_COMPONENTS = 78  # number of principal components kept

p_pca = PCA(k=PCA_COMPONENTS, inputCol="features", outputCol="pca")
p_model = p_pca.fit(p_review_embedding_vectors)
p_pca_result = p_model.transform(p_review_embedding_vectors).select('pca')

n_pca = PCA(k=PCA_COMPONENTS, inputCol="features", outputCol="pca")
n_model = n_pca.fit(n_review_embedding_vectors)
n_pca_result = n_model.transform(n_review_embedding_vectors).select('pca')

VBox()

In [8]:
# Turn each PCA vector into a NumPy array ...
p_pca_result_rdd = p_pca_result.rdd.map(lambda row: row['pca'].toArray())
n_pca_result_rdd = n_pca_result.rdd.map(lambda row: row['pca'].toArray())

# ... and bring them to the driver as one array per class.
positive_vectors = p_pca_result_rdd.collect()
negative_vectors = n_pca_result_rdd.collect()
positive_array = np.array(positive_vectors)
negative_array = np.array(negative_vectors)

# calculate the cos and the distance is 1-cos
from sklearn.metrics.pairwise import cosine_similarity
def distance(array):
    """Return the pairwise cosine distance (1 - cosine similarity) between the rows of ``array``."""
    # With a single argument, cosine_similarity compares the rows with each other.
    similarity = cosine_similarity(array)
    return 1 - similarity

# Pairwise distance matrix of each class.
positive_distance = distance(positive_array)
negative_distance = distance(negative_array)

# Mean over every entry of each matrix.
p_average_distance = positive_distance.mean()
n_average_distance = negative_distance.mean()

print('the average distance between points in the positive class is', p_average_distance)
print('the average distance between points in the negative class is', n_average_distance)

VBox()

the average distance between points in the positive class is 0.6780276522973286
the average distance between points in the negative class is 0.7132072182935756

In [9]:
# Average distance from each point to all points of its class (one value per row).
p_average_distance_list = positive_distance.mean(axis=1)
n_average_distance_list = negative_distance.mean(axis=1)
#print(average_distance_list)

VBox()

In [10]:
# The class center is the point with the minimum average distance; store its index.
positive_index = np.argmin(p_average_distance_list)
negative_index = np.argmin(n_average_distance_list)

VBox()

In [11]:
# Row of the distance matrix belonging to each class center:
# its distance to every point of the class.
class_center_positive_distance = positive_distance[positive_index]
class_center_negative_distance = negative_distance[negative_index]
#print(class_center_positive_distance)

VBox()

In [12]:
# Number of neighbours reported around each class center.
N_NEIGHBOURS = 10


def _closest_neighbours(center_distances, center_idx, k=N_NEIGHBOURS):
    """Return the indices of the ``k`` points nearest to the class center.

    The center is removed by index instead of by skipping the first sorted
    position. Its self-distance is only approximately zero, and an identical
    duplicate sentence ties with it, so the center is not guaranteed to sort
    first. Skipping position 0 could then drop a real neighbour and report
    the center as its own neighbour.

    Args:
        center_distances: 1-D array of distances from the center to every point.
        center_idx: Position of the center itself in ``center_distances``.
        k: Maximum number of neighbours to return.

    Returns:
        Array of at most ``k`` indices, nearest first, never containing
        ``center_idx``.
    """
    order = np.argsort(center_distances)
    return order[order != center_idx][:k]


# Indices of the 10 closest neighbours for the positive and negative class.
closest_positive_idx = _closest_neighbours(class_center_positive_distance, positive_index)
closest_negative_idx = _closest_neighbours(class_center_negative_distance, negative_index)

VBox()

In [13]:
# Collect the review ids (_1) and the sentences (_2) of each class as lists
# of single-field rows. Later cells index these lists with positions taken
# from the distance matrices, so they rely on the same row order.
p_reviewid_result, p_sentence_result = [
    positive_df.select(column).collect() for column in ('_1', '_2')
]

n_reviewid_result, n_sentence_result = [
    negative_df.select(column).collect() for column in ('_1', '_2')
]

VBox()

In [14]:
# Review id and sentence of the positive class center.
p_center_reviewid, p_center_sentence = (
    p_reviewid_result[positive_index],
    p_sentence_result[positive_index],
)

# Review id and sentence of the negative class center.
n_center_reviewid, n_center_sentence = (
    n_reviewid_result[negative_index],
    n_sentence_result[negative_index],
)

VBox()

In [15]:
# Report the positive class center followed by its closest neighbours.
separator = "--------------------------------------------------------"
print("Positive Class")
print(separator)
print("center_review_id : " + p_center_reviewid[0])
print("center_sentence: " + p_center_sentence[0])
print(separator)
for rank, neighbour_idx in enumerate(closest_positive_idx, start=1):
    print(str(rank) + " closest neighbor: ")
    print("review_id: " + p_reviewid_result[neighbour_idx][0])
    print("sentence: " + p_sentence_result[neighbour_idx][0])

VBox()

Positive Class
--------------------------------------------------------
center_review_id : R32NXZWC3RKH76
center_sentence: Every song is awesome.
--------------------------------------------------------
1 closest neighbor: 
review_id: R27M6LF1SF31D3
sentence: Every song is amazing.
2 closest neighbor: 
review_id: R1VR2BOC4IT29H
sentence: Every song is great!
3 closest neighbor: 
review_id: R144NZND4C5S5A
sentence: Every single song is GREAT!
4 closest neighbor: 
review_id: R2XN70XMURMD4B
sentence: !Every single song on it is awesome!
5 closest neighbor: 
review_id: R2L30H6HKZXUK7
sentence: every song is really good.
6 closest neighbor: 
review_id: R3CR46JJVYB5W9
sentence: Every song on it is absolutely awesome.
7 closest neighbor: 
review_id: R2N0NUDL7GEOCN
sentence: i LOVE every single song.
8 closest neighbor: 
review_id: RQUTYHT559L64
sentence: The lyrics are awesome and every song is great.
9 closest neighbor: 
review_id: R1S1N6XH9BVU8F
sentence: I just have to say that &quot;emoti

In [15]:
# Report the negative class center followed by its closest neighbours.
separator = "--------------------------------------------------------"
print("Negative Class")
print(separator)
print("center_review_id : " + n_center_reviewid[0])
print("center_sentence: " + n_center_sentence[0])
print(separator)
for rank, neighbour_idx in enumerate(closest_negative_idx, start=1):
    print(str(rank) + " closest neighbor: ")
    print("review_id: " + n_reviewid_result[neighbour_idx][0])
    print("sentence: " + n_sentence_result[neighbour_idx][0])

VBox()

Negative Class
--------------------------------------------------------
center_review_id : R3MGK1ZXH61TIS
center_sentence: its not even pop-punk.
--------------------------------------------------------
1 closest neighbor: 
review_id: R2OK5QFRPFADKK
sentence: Good Charlotte is not even real punk.
2 closest neighbor: 
review_id: R26GZ52ZCK961P
sentence: Good Charlotte is one of the worst bands out now.
3 closest neighbor: 
review_id: R3QG1GKJO8IL4K
sentence: Due to popular beleif, good charlotte are in no way punk.
4 closest neighbor: 
review_id: R2GWQHQMLPU4T1
sentence: And people think Good Charlotte is punk?!
5 closest neighbor: 
review_id: R1V5HHKRJNWUQM
sentence: There not even pop-punk, there not even pop, there krap.
6 closest neighbor: 
review_id: R1ND1K68UK17ZJ
sentence: GOOd CHarlotte is not freakin punk!!
7 closest neighbor: 
review_id: R38HLY2JJ8UX9Q
sentence: They are one of the worst bands i have ever heard along with simple plan and new found glory.
8 closest neighbor: 
r