In [1]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import nltk.data
from nltk.tokenize import WordPunctTokenizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
import re

spark = SparkSession \
    .builder \
    .appName("Stage 4") \
    .getOrCreate()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1558529492852_0001,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
data = "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"
allData = spark.read.csv(data, header=True, sep='\t')
# only keep the column which would be needed after
allData = allData.select(allData.customer_id, allData.product_id, allData.star_rating, allData.review_id, allData.review_body)

reviewsProd = allData.groupBy('product_id')
reviewsProd = reviewsProd.agg({'review_id':'count'})
orderReviewsProdData = reviewsProd.orderBy(-reviewsProd['count(review_id)'])
# the top 10 products ranked by the number of reviews they have
top10Prod = orderReviewsProdData.select('product_id').head(10)

# choose the last one of the top 10 product
selected_prod = top10Prod[9][0]
productdata = allData.where(allData['product_id']==selected_prod)
#productdata.show()

# identify the positive class and negative class
positive_data = productdata.where(productdata['star_rating'] >= 4)
negative_data = productdata.where(productdata['star_rating'] <= 2)

VBox()

In [3]:
# to split the review_body to sentences

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# only data in the review_id and review_body column will be used and transform to rdd
positive_rdd = positive_data.select(positive_data.review_id,positive_data.review_body).rdd
negative_rdd = negative_data.select(negative_data.review_id,negative_data.review_body).rdd

# to split to sentences and return a pairRdd which key is review_id, value is sentences
def map_sentence(data):
    string = str(data[1])
    sentences = tokenizer.tokenize(string)
    if sentences[-1] == '.' or sentences[-1] == '!' or sentences[-1] == '?':
        sentences.pop()
    return ((data[0]), (sentences))

positive_filt = positive_rdd.map(map_sentence)
negative_filt = negative_rdd.map(map_sentence)

def split_sentence(data):
    return (data)

positive_split = positive_filt.flatMapValues(split_sentence)
negative_split = negative_filt.flatMapValues(split_sentence)

VBox()

In [4]:
# to split sentence to words
def split_word(data):
    words_filt = re.findall("[a-zA-Z0-9]+", data[1])
    return data[0], data[1], words_filt
    
positive_word = positive_split.map(split_word)
negative_word = negative_split.map(split_word)
#positive_word.take(5)

VBox()

In [5]:
#transform to dataframe, two column: review_id, sentences and words but the column name is _1, _2, _3
positive_df = spark.createDataFrame(positive_word)
negative_df = spark.createDataFrame(negative_word)
#positive_df.show(5)

VBox()

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 51388)
Traceback (most recent call last):
  File "/usr/lib64/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 266, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 254, in authenticate_and_accum_updates
    received_to

In [6]:
#transform to dataframe, two column: review_id, sentences and words but the column name is _1, _2, _3
positive_df = spark.createDataFrame(positive_word)
negative_df = spark.createDataFrame(negative_word)
#positive_df.show(5)

#generate sentence enbedding for each sentence
#to compare with google embedding
def wordToVec(df):
    word2vec = Word2Vec(vectorSize=512, minCount=0, inputCol='_3', outputCol='result')
    model = word2vec.fit(df)
    result = model.transform(df)
    return result

result_positive = wordToVec(positive_df)
result_negative = wordToVec(negative_df)
# the result of stage 4.1 which transfer 
result_positive.show(5)
result_negative.show(5)

VBox()

+--------------+--------------------+--------------------+--------------------+
|            _1|                  _2|                  _3|              result|
+--------------+--------------------+--------------------+--------------------+
|R3R7MRNK5HPULY|Good Charlotte's ...|[Good, Charlotte,...|[-0.0054475572242...|
| RQ9PYEGZ1N6LS|My daughter loves...|[My, daughter, lo...|[0.03392654993513...|
|R1P3A5U0M98JWW|              thanks|            [thanks]|[0.00425248872488...|
|R2TYDS7G24XRZC|vey good, I was c...|[vey, good, I, wa...|[-0.0162318474478...|
|R14IHG9LSIZLZK|I love this cd, I...|[I, love, this, c...|[-0.0107675544798...|
+--------------+--------------------+--------------------+--------------------+
only showing top 5 rows

+--------------+--------------------+--------------------+--------------------+
|            _1|                  _2|                  _3|              result|
+--------------+--------------------+--------------------+--------------------+
|R2F6WAB05QY47M

In [7]:
# use pca to reduce dimension
p_pca = PCA(k=78, inputCol="result", outputCol="pca")
p_model = p_pca.fit(result_positive)
p_pca_result = p_model.transform(result_positive).select('pca')

n_pca = PCA(k=78, inputCol="result", outputCol="pca")
n_model = n_pca.fit(result_negative)
n_pca_result = n_model.transform(result_negative).select('pca')

VBox()

In [8]:
# to transfer to numpy array
p_pca_result_rdd = p_pca_result.rdd.map(lambda r: r['pca'].toArray())
n_pca_result_rdd = n_pca_result.rdd.map(lambda r: r['pca'].toArray())

positive_array = np.array(p_pca_result_rdd.collect())
negative_array = np.array(n_pca_result_rdd.collect())

# calculate the cos and the distance is 1-cos
from sklearn.metrics.pairwise import cosine_similarity
def distance(array):
    cos = cosine_similarity(array, array)
    return 1-cos

# an array which represent the distance between vectors
positive_distance = distance(positive_array)
negative_distance = distance(negative_array)

p_average_distance = np.mean(positive_distance)
n_average_distance = np.mean(negative_distance)

print('the average distance between points in the positive class is', p_average_distance)
print('the average distance between points in the negative class is', n_average_distance)

VBox()

the average distance between points in the positive class is 0.6682456408886088
the average distance between points in the negative class is 0.46005071614426934

In [9]:
# calculate the average distance for each row
p_average_distance_list = np.mean(positive_distance, 1)
n_average_distance_list = np.mean(negative_distance, 1)
#print(average_distance_list)

VBox()

In [10]:
# get the index of the mininum distance, whichi is the class center's index
positive_index = p_average_distance_list.argmin()
negative_index = n_average_distance_list.argmin()
#print(negative_index)

VBox()

In [11]:
class_center_positive_distance = positive_distance[positive_index, :]
class_center_negative_distance = negative_distance[negative_index, :]
#print(class_center_positive_distance)

VBox()

In [12]:
# get the 10 closest neighbours' index for positive and negative class respectively
closest_positive_idx = np.argsort(class_center_positive_distance)[1:11]
#print(closest_positive_idx)
closest_negative_idx = np.argsort(class_center_negative_distance)[1:11]

VBox()

In [13]:
# transform the review_id column to list
# transform the sentence column to list
p_reviewid_result = positive_df.select('_1').collect()
p_sentence_result = positive_df.select('_2').collect()

n_reviewid_result = negative_df.select('_1').collect()
n_sentence_result = negative_df.select('_2').collect()

VBox()

In [14]:
p_center_reviewid = p_reviewid_result[positive_index]
p_center_sentence = p_sentence_result[positive_index]

n_center_reviewid = n_reviewid_result[negative_index]
n_center_sentence = n_sentence_result[negative_index]


VBox()

In [15]:
print("Positive Class")
print("--------------------------------------------------------")
print("center_review_id : "+ p_center_reviewid[0])
print("center_sentence: " + p_center_sentence[0])
print("--------------------------------------------------------")
for i in range(len(closest_positive_idx)):
    print(str(i+1)+" closest neighbor: ")
    print("review_id: "+ p_reviewid_result[closest_positive_idx[i]][0])
    print("sentence: " + p_sentence_result[closest_positive_idx[i]][0])

VBox()

Positive Class
--------------------------------------------------------
center_review_id : R1173JPWRMH85K
center_sentence: \\"The Day that I Die\\" has Cashdogg in it, so obviously becomes a favorite of mine, for I am a dog lover, but with it's country opening it's hard not to love it either way.<BR>    The title track, \\"The Young and the Hopeless\\" is another favorite of mine, it talks of not worrying so much what other think and what they want you to do, but being true to yourself.<BR>    In conclusion, I think the album does a very good job of showing how much the boys have grown up since the last album.
--------------------------------------------------------
1 closest neighbor: 
review_id: R2RC02F8OTX8JA
sentence: \\"The Day that I Die\\" has Cashdogg in it, so obviously becomes a favorite of mine, for I am a dog lover, but with it's country opening it's hard not to love it either way.<BR> The title track, \\"The Young and the Hopeless\\" is another favorite of mine, it talks o

In [16]:
print("Negative Class")
print("--------------------------------------------------------")
print("center_review_id : "+ n_center_reviewid[0])
print("center_sentence: " + n_center_sentence[0])
print("--------------------------------------------------------")
for i in range(len(closest_negative_idx)):
    print(str(i+1)+" closest neighbor: ")
    print("review_id: "+ n_reviewid_result[closest_negative_idx[i]][0])
    print("sentence: " + n_sentence_result[closest_negative_idx[i]][0])

VBox()

Negative Class
--------------------------------------------------------
center_review_id : R3OLF9O3M9S7Y4
center_sentence: It was then that I learned that the best music you will not hear on the radio or on MTV, but is something that you have to search for.<BR>     Good Charlotte is absolutley talentless and you will get sick of this CD within days because every song sounds the same.
--------------------------------------------------------
1 closest neighbor: 
review_id: RLDAYOMBHOJY1
sentence: Im not a fan or anything but you people who complain about Lifestyles of the Rich and Famous need to realize that song was written before they got rich and famous.Its not hypocritical.Plus its not about all rich and famous people,just the ones who have it all yet complain (which GC tends to do).That is all...
2 closest neighbor: 
review_id: R33K1364MTX8RG
sentence: <br />Do you really think a bunch of pop-star \\"punk\\" wannabes \\"bleed the same way as he does\\" and \\"have the same things to