In [88]:
import numpy as np
import re
from pyspark.sql import SparkSession
from nltk.tokenize import sent_tokenize
from pyspark.sql.functions import udf
from pyspark.sql.functions import lit
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split, explode
from pyspark.sql.types import ArrayType, StringType

VBox()

In [89]:
spark = SparkSession \
    .builder \
    .appName("Stage 4") \
    .getOrCreate()

VBox()

In [90]:
rev_data = "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"
revs = spark.read.csv(rev_data,header=True,sep='\t')

VBox()

In [91]:
selectedproduct = revs.filter(revs.product_id=='B004EBT5CU')

VBox()

In [92]:
productRecBefore = selectedproduct.select("review_id", "review_body", "star_rating")

VBox()

In [93]:
productRec = productRecBefore.filter(productRecBefore.review_body != '')

VBox()

In [95]:
def filterout_htmltag(x):
    tag_pattern = re.compile('<.*?>')
    cleantag = re.sub(tag_pattern, ' ', x)
    return cleantag

clean_htmltag = udf(lambda x: filterout_htmltag(x), StringType())

productRec = productRec.withColumn("review_body", lit(clean_htmltag(productRec['review_body'])))

VBox()

In [96]:
pattern = re.compile(r'[\.\?\!]+ ')
def segmentReview(x):
    contentList = re.split(pattern, x)
    return contentList

contentOfSen = udf(lambda x: segmentReview(x), ArrayType(StringType()))

VBox()

In [9]:
def segmentReviewnlp(x):
    contentlist = sent_tokenize(x)
    return contentlist

contentofsennlp = udf(lambda x: segmentReviewnlp(x), ArrayType(StringType()))

VBox()

In [97]:
def putvectorsintolist(result):
    thelist = []
    for row in result:
        thelist.append(np.array(row.result))
    return thelist

VBox()

In [99]:
posClass = productRec.filter(productRec.star_rating >= 4).withColumn("sentences", contentOfSen(productRec.review_body)).cache()

VBox()

In [100]:
negClass = productRec.filter(productRec.star_rating <= 2).withColumn("sentences", contentOfSen(productRec.review_body)).cache()

VBox()

In [101]:
posClassSen = posClass.withColumn('sentence', explode(posClass.sentences))

VBox()

In [102]:
negClassSen = negClass.withColumn('sentence', explode(negClass.sentences))

VBox()

In [103]:
def calCosine(vector1, vector2):
    result = np.dot(vector1,vector2)
    norm = np.linalg.norm(vector1)*np.linalg.norm(vector2)
    cos = result/norm
    return (1-cos)

VBox()

In [104]:
def calAverage(list):
    list2 = []
    for x in list:
        list1 = []
        for y in list:
            if ((x == y).all()) == False:
                list1.append(calCosine(x, y))
        average = sum(list1)/len(list1)
        list2.append(average)
    return list2

VBox()

In [105]:
def stringtolist(sentence):
    pattern = re.compile(r'\s+|[\.\?\!]+ ')
    sentencelist = re.split(pattern, sentence)
    return sentencelist
newList = udf(lambda x: stringtolist(x), ArrayType(StringType()))
posSentence = posClassSen.select('sentence','review_id').withColumn('sentencesplit',newList(posClassSen.sentence))
negSentence = negClassSen.select('sentence','review_id').withColumn('sentencesplit',newList(negClassSen.sentence))

VBox()

# PCA dimention reduction

# Positive reviews

In [112]:
word2Vec = Word2Vec(vectorSize=512, minCount=0, inputCol="sentencesplit", outputCol="result")
model = word2Vec.fit(posSentence)
posResultTrans = model.transform(posSentence)
posResult = posResultTrans.collect()

VBox()

In [115]:
#pca dimention reduction
from pyspark.ml.feature import PCA

pca = PCA(k=10, inputCol="result", outputCol="pca")
model = pca.fit(posResultTrans)
pca_resultPos = model.transform(posResultTrans).select('pca')

VBox()

In [116]:
pos_pca_result_rdd = pca_resultPos.rdd.map(lambda r: r['pca'].toArray())

VBox()

In [117]:
posrddlen= pos_pca_result_rdd.count()
posVectorList = pos_pca_result_rdd.take(posrddlen)
posResultAve = calAverage(posVectorList)

VBox()

In [118]:
def getrows(df, rownums=None):
    return df.rdd.zipWithIndex().filter(lambda x: x[1] in rownums).map(lambda x: x[0])

def gettext(result,rownum):
    therow = getrows(result, rownums=[rownum]).collect()
    review_id = therow[0].review_id
    sentence_text = therow[0].sentence

    return [str(review_id), str(sentence_text)]

VBox()

In [119]:
posMinAve = posResultAve.index(min(posResultAve))
outputListPos = gettext(posResultTrans,posMinAve)

VBox()

In [120]:
def calculate_distance(vector_list,point_index):
    vector = vector_list[point_index]
    dic = {}
    for i in range(len(vector_list)):
        othervector = vector_list[i]
        if (vector == othervector).all() == False:
            dic[i]= calCosine(vector, othervector)

    return dic

VBox()

In [121]:
posDistance_list = calculate_distance(posVectorList,posMinAve)

VBox()

In [122]:
x = sorted(posDistance_list.items(),key=lambda item:item[1])
posToptenList = x[:10]

VBox()

In [123]:
posKeyList = []
for i in posToptenList:
    posKeyList.append(i[0])

VBox()

In [124]:
def ten_close_neighbor(result,indexlist):
    tempList = []
    for i in indexlist:
        tempList.extend(gettext(result,i))
        
    return tempList
    
nListPos = ten_close_neighbor(posResultTrans,posKeyList)
outputListPos.extend(nListPos)

VBox()

In [136]:
count = 0
for i in range(len(outputListPos)):
    if count == 0:
        print("Center Sentence:")
        count += 1
    if count == 2:
        print("Top 10 Neighbor:")
        count += 1
    if i%2 == 0:
        print("review_id: " + outputListPos[i])
    else:
        print("sentence: " + outputListPos[i])
        print
        count += 1

VBox()

Center Sentence:
review_id: RC9Y4B9681QGJ
sentence: This is a good present for a girl who needs something to listen to in the car

Top 10 Neighbor:
review_id: R115VG4O2MJRRZ
sentence: This album is the kind that catches you right away and then continues to seep into your soul for a very long time

review_id: R3GKI20L66NJJG
sentence:  This is an album that you buy, insert into your cd/mp3 player and let it play because you're not going to need to skip over any of the songs since each track is ALL THAT

review_id: R3CA1V7SM0FCS5
sentence:  I chose this as a Christmas gift for a group of friends

review_id: R3OQNQ54FWX7Y8
sentence: great ' if not all; have been good and helpfull the diaper bag was not as I imagined .(the material of the bag )but all we,re good enough, Adele  was a  great album.

review_id: R2MNHPPVHWESB8
sentence: However, in the last year, I've gotten it - here is a young woman who is a force to be reckoned with, and this album proves it

review_id: R16UTC9UWJ61ON
senten

# Negtive reviews

In [126]:
word2Vec = Word2Vec(vectorSize=512, minCount=0, inputCol="sentencesplit", outputCol="result")
model = word2Vec.fit(negSentence)
negResultTrans = model.transform(negSentence)
negResult = negResultTrans.collect()

VBox()

In [127]:
pca = PCA(k=10, inputCol="result", outputCol="pca")
model = pca.fit(negResultTrans)
pca_resultNeg = model.transform(negResultTrans).select('pca')

VBox()

In [128]:
neg_pca_result_rdd = pca_resultNeg.rdd.map(lambda r: r['pca'].toArray())

VBox()

In [129]:
negrddlen= neg_pca_result_rdd.count()
negVectorList = neg_pca_result_rdd.take(negrddlen)
negResultAve = calAverage(negVectorList)

VBox()

In [130]:
negMinAve = negResultAve.index(min(negResultAve))
outputListNeg = gettext(negResultTrans,negMinAve)

VBox()

In [131]:
negDistance_list = calculate_distance(negVectorList,negMinAve)

VBox()

In [132]:
x = sorted(negDistance_list.items(),key=lambda item:item[1])
negToptenList = x[:10]

VBox()

In [133]:
negKeyList = []
for i in negToptenList:
    negKeyList.append(i[0])

VBox()

In [134]:
nListNeg = ten_close_neighbor(negResultTrans,negKeyList)
outputListNeg.extend(nListNeg)

VBox()

In [137]:
count = 0
for i in range(len(outputListNeg)):
    if count == 0:
        print("Center Sentence:")
        count += 1
    if count == 2:
        print("Top 10 Neighbor:")
        count += 1
    if i%2 == 0:
        print("review_id: " + outputListNeg[i])
    else:
        print("sentence: " + outputListNeg[i])
        print
        count += 1

VBox()

Center Sentence:
review_id: R2XV9CUVITLZKZ
sentence: Adele is a very talented artist and I couldn't wait to hear the CD on a high resolution audio system

Top 10 Neighbor:
review_id: R3BYX4UBOFLKQO
sentence: Rumour Has It is God-awful song that deserves to be shot, and those are the only two songs I can tell apart

review_id: R24HI7QAC22ANO
sentence: With all of the other packages I received, there was a receipt with instructions inside on how to return it

review_id: R1N25DVLK6N2SO
sentence: My CD player is a BOSE and I never had skipping before with a CD.

review_id: R270W4W3JJM1FN
sentence: I also try to avoid seeing the ridiculously creepy album cover which shows a majority of her face

review_id: R3BYX4UBOFLKQO
sentence: Just like since Adele is trying to sing something that sounds like it was written by a depressed Avril Lavinge, accept maybe a grade ahead (A grade ahead of Avril Lavinge is still an elementary grade, mind you), when she has this unique jazzy voice she could put t