In [None]:
import findspark
findspark.init()

In [None]:
from __future__ import division

import collections
import itertools
import json
import math
import operator
import os
import urllib
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import nltk, re, pprint
import numpy as np
import pandas as pd

# The from-imports below keep their original relative order: the star import
# may re-export names, so the explicit imports after it must still win.
from nltk import bigrams
from nltk.collocations import *
from nltk.util import ngrams
from nltk import word_tokenize
from math import log
from collections import defaultdict
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlcontext = SQLContext(sc)

# 1. Working on Reviews - Overall Rating

### 1.1. Loading File

In [None]:
# Read the review dump (JSON) at this relative path into a Spark DataFrame.
df = sqlcontext.read.json('./data/reviews_Cell_Phones_and_Accessories.json')

In [None]:
# Preview the first five rows.
df.show(5)

In [None]:
# Dropping the rows with missing values
df2 = df.na.drop()

In [None]:
# Number of complete rows
df2.count()

In [None]:
# Make df2 queryable from Spark SQL under the table name "df2".
df2.registerTempTable("df2")

### 1.2. Only retain products with more than 4000 reviews

In [None]:
# Keep only the reviews of products that have more than 4000 reviews.
# The per-product counts are computed inline from df2: no `rating` table is
# registered anywhere in this notebook, so a query against it cannot run on a
# fresh kernel.
# NOTE(review): counts are taken over the complete rows in df2 - confirm that
# this matches how the original `rating` table was built.
ratings = sqlcontext.sql(
    "SELECT * FROM df2 WHERE asin IN "
    "(SELECT asin FROM df2 GROUP BY asin HAVING COUNT(*) > 4000)"
)
ratings.registerTempTable("df3")

### 1.3. Cleaning of Review Text

In [None]:
def clean_text(review):
    """Normalise one raw review into lower-case text.

    Every character other than an ASCII letter, an apostrophe or a period is
    replaced by a space; the text is lower-cased; the stand-alone token
    ``quot`` (what is left of the HTML entity ``&quot;`` once punctuation is
    stripped) is dropped; runs of whitespace are collapsed to single spaces.

    Args:
        review: The raw review; it is converted with ``str()`` first.

    Returns:
        The cleaned review as a single-spaced, lower-case string.
    """
    import re

    # Removes everything except letters, apostrophes and periods
    # (punctuation and numbers become spaces).
    cleaned_text = re.sub(r"[^a-zA-Z'.]", " ", str(review))

    # Convert everything to lower case
    cleaned_text = cleaned_text.lower()

    # Remove the word 'quot' only when it stands alone, so that words which
    # merely contain it ("quote", "quota") are left intact.
    cleaned_text = re.sub(r"\bquot\b", "", cleaned_text)

    # Remove any extra white space
    return ' '.join(cleaned_text.split())

In [None]:
def _with_clean_review(row):
    """Copy one review row, replacing `reviewText` with its cleaned form."""
    return (
        row['asin'], row['helpful'], row['overall'],
        clean_text(row['reviewText']),
        row['reviewTime'], row['reviewerID'], row['reviewerName'],
        row['summary'], row['unixReviewTime'],
    )

# Same columns as `ratings`, with the text column renamed to reviewText_clean.
df3 = ratings.rdd.map(_with_clean_review).toDF(
    ['asin','helpful','overall','reviewText_clean','reviewTime','reviewerID','reviewerName', 'summary','unixReviewTime']
)

### 1.4. Subjective and Objective Sentence Classifier

In [None]:
def subjectivity(data, threshold=0.2):
    """Keep only the subjective, period-delimited fragments of `data`.

    The text is split on '.'; a fragment is kept when TextBlob assigns a
    subjectivity above `threshold` to at least one sentence in it. Each kept
    fragment appears exactly once, in its original order.

    Args:
        data: Review text whose sentences are separated by periods.
        threshold: Minimum TextBlob subjectivity (exclusive) for a fragment
            to be kept. Defaults to 0.2.

    Returns:
        The kept fragments joined back together with '.'.
    """
    kept = []
    for fragment in data.split('.'):
        # Blank fragments (e.g. between consecutive periods) contain no
        # sentence to score, so they can never be kept.
        if not fragment.strip():
            continue
        sentences = TextBlob(fragment).sentences
        if any(s.sentiment.subjectivity > threshold for s in sentences):
            kept.append(fragment)
    return '.'.join(kept)

In [None]:
def _with_subjective_text(row):
    """Copy one review row, keeping only the subjective part of its text."""
    return (
        row['asin'], row['helpful'], row['overall'],
        subjectivity(row['reviewText_clean']),
        row['reviewTime'], row['reviewerID'], row['reviewerName'],
        row['summary'], row['unixReviewTime'],
    )

# Same columns as df3, with the text column renamed to reviewText_sub.
df4 = df3.rdd.map(_with_subjective_text).toDF(
    ['asin','helpful','overall','reviewText_sub','reviewTime','reviewerID','reviewerName', 'summary','unixReviewTime']
)

### 1.5. Part-of-Speech Tagging

In [None]:
def pos(text):
    """Tokenise `text` with NLTK and return the POS-tagged tokens."""
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

In [None]:
def _with_pos_tags(row):
    """Copy one review row, replacing its text with the POS-tagged tokens."""
    return (
        row['asin'], row['helpful'], row['overall'],
        pos(row['reviewText_sub']),
        row['reviewTime'], row['reviewerID'], row['reviewerName'],
        row['summary'], row['unixReviewTime'],
    )

# Same columns as df4, with the text column renamed to reviewText_pos.
df5 = df4.rdd.map(_with_pos_tags).toDF(
    ['asin','helpful','overall','reviewText_pos','reviewTime','reviewerID','reviewerName', 'summary','unixReviewTime']
)

### 1.6. Filtering Based on Tags

In [None]:
def tagging(text):
    """Extract the tagged words that match the opinion-phrase tag patterns.

    `text` is a sequence of (word, tag) pairs. Each adjacent pair is examined
    in order and the matching items are appended to the result:

    * JJ followed by NN/NNS                          -> both words
    * RB/RBR/RBS, JJ or NN/NNS followed by JJ, when
      the third word is not NN/NNS                   -> all three words
    * RB/RBR/RBS followed by VB/VBN/VBD/VBG          -> both words

    The final pair has no third word, so only the two-word patterns are
    tried for it.

    Args:
        text: Sequence of (word, tag) pairs; each item is indexed with [1]
            to read its tag.

    Returns:
        A list of the matching items of `text`, in order of occurrence.
    """
    adverbs = ("RB", "RBR", "RBS")
    nouns = ("NN", "NNS")
    verbs = ("VB", "VBN", "VBD", "VBG")

    li = []
    last_pair = len(text) - 2
    for i in range(len(text) - 1):
        # Compare tags, not whole (word, tag) items: comparing an item with
        # "NNS" is always False and silently drops every JJ + NNS phrase.
        first, second = text[i][1], text[i+1][1]
        adjective_noun = first == "JJ" and second in nouns
        adverb_verb = first in adverbs and second in verbs

        if i == last_pair:
            if adjective_noun or adverb_verb:
                li.extend([text[i], text[i+1]])
            continue

        third = text[i+2][1]
        if adjective_noun:
            li.extend([text[i], text[i+1]])
        elif (second == "JJ" and third not in nouns
              and (first in adverbs or first == "JJ" or first in nouns)):
            li.extend([text[i], text[i+1], text[i+2]])
        elif adverb_verb:
            li.extend([text[i], text[i+1]])

    return li

In [None]:
def _with_filtered_tags(row):
    """Copy one review row, keeping only the tagged words picked by tagging()."""
    return (
        row['asin'], row['helpful'], row['overall'],
        tagging(row['reviewText_pos']),
        row['reviewTime'], row['reviewerID'], row['reviewerName'],
        row['summary'], row['unixReviewTime'],
    )

# Same columns as df5, with the tag column renamed to reviewText_tag.
df6 = df5.rdd.map(_with_filtered_tags).toDF(
    ['asin','helpful','overall','reviewText_tag','reviewTime','reviewerID','reviewerName', 'summary','unixReviewTime']
)

In [None]:
# Make df6 queryable from Spark SQL under the table name "df6".
df6.registerTempTable("df6")

### 1.7. Pointwise Mutual Information - Information Retrieval Algorithm

In [None]:
# No module-level scratch lists are kept here: text_pos() works on locals, so
# the builtin `list` stays usable by the cells further down the notebook.


def hits(word1,word2=""):
    """Return the estimated number of search results for one or two terms.

    Queries the Google Custom Search JSON API. The API key and search-engine
    id are read from the environment variables GOOGLE_API_KEY and
    GOOGLE_CSE_ID, so no credential has to be stored in the notebook.

    Args:
        word1: First search term or phrase.
        word2: Optional second term; when non-empty it is appended to the
            query after a space.

    Returns:
        The estimated result count as an int.

    Raises:
        RuntimeError: If either environment variable is missing.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    engine_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key or not engine_id:
        raise RuntimeError(
            "Set GOOGLE_API_KEY and GOOGLE_CSE_ID before calling hits()"
        )

    terms = word1 if word2 == "" else word1 + " " + word2
    # urlencode escapes spaces and other reserved characters in the terms.
    query = "https://www.googleapis.com/customsearch/v1?" + urllib.parse.urlencode(
        {"key": api_key, "cx": engine_id, "q": terms}
    )
    with urllib.request.urlopen(query) as url:
        results = url.read()
    json_res = json.loads(results.decode("utf-8"))
    # The Custom Search JSON API reports the estimate as a string under
    # searchInformation.totalResults.
    google_hits=int(json_res['searchInformation']['totalResults'])
    return google_hits


def so(phrase, smoothing=0.01):
    """Return the semantic orientation of `phrase` from search hit counts.

    Computed as log(hits(phrase, "excellent") / hits(phrase, "poor")), with
    `smoothing` added to both counts so that a zero hit count causes neither
    a division by zero nor log(0).

    Args:
        phrase: The phrase to score.
        smoothing: Constant added to both hit counts. Defaults to 0.01.

    Returns:
        The natural logarithm of the smoothed hit ratio.
    """
    num = hits(phrase,"excellent")
    den = hits(phrase,"poor")
    return log((num + smoothing) / (den + smoothing))


# Tag groups for the adverb + verb pattern.
list_first = ["RB","RBR","RBS"]
list_second = ["VB","VBD","VBN","VBG"]
# Materialised as a tuple: a bare itertools.product iterator would be used up
# by the first scan and match nothing afterwards.
list_combn = tuple(itertools.product(list_first,list_second))


def check(newl,spl1):
    """Return the first two-word phrase whose tags match an opinion pattern.

    Patterns, tried for every adjacent tag pair from left to right:

    * JJ followed by NN/NNS
    * any (adverb, verb) pair listed in `list_combn`
    * JJ, NN/NNS or RB/RBR/RBS followed by JJ, when a third tag exists and
      is not NN/NNS (the three-tag patterns are skipped for the final pair,
      as in `tagging`)

    Args:
        newl: List of POS tags, one per word.
        spl1: List of the words, parallel to `newl`.

    Returns:
        "<word k> <word k+1>" for the first matching pair, or None when no
        pair matches.
    """
    noun_tags = ("NN", "NNS")
    for k in range(len(newl) - 1):
        first, second = newl[k], newl[k+1]
        # Guard the look-ahead so the final pair cannot index past the end.
        has_third = k + 2 < len(newl)
        matched = (
            (first == "JJ" and second in noun_tags)
            or ((first, second) in list_combn)
            or (
                has_third
                and second == "JJ"
                and first in ("JJ", "NN", "NNS", "RB", "RBR", "RBS")
                and newl[k+2] not in noun_tags
            )
        )
        if matched:
            return "".join(spl1[k])+" "+"".join(spl1[k+1])
    return None


def text_pos(raw):
    """POS-tag `raw` and return its first opinion phrase, printing each step.

    The sentence is split on whitespace, tagged with nltk.pos_tag, and the
    resulting tag list is passed to check(). All state is local, so repeated
    calls do not influence each other.

    Args:
        raw: The sentence to analyse.

    Returns:
        The phrase found by check(), or None when no pattern matches.
    """
    print("raw input:",raw)
    spl=raw.split()
    print("\n")
    print("split version of input:",spl)
    tagged=nltk.pos_tag(spl)
    print("\n")
    print("POS tagged text:","")
    for pair in tagged:
        print(pair,"")
    # One tag per word, in sentence order.
    tags = [tag for _, tag in tagged]
    print("\n")
    print("Extracting the tags alone:","")
    print(tags)
    print("Checking whether the tags conform to the required pattern...")
    print("\n")
    print(spl)
    print(tags)
    print("The extracted two-word phrases which satisfy the required pattern are:")
    strr1=check(tags,spl)
    return strr1

In [None]:
strr = text_pos("Nokia is a amazing phone")
print(strr)
# text_pos() gives None when no tag pattern matches; only query the search
# API when a phrase was actually extracted.
x = so(strr) if strr is not None else None
print(x)

### 1.8. Semantic Orientation 

In [None]:
# NOTE(review): both imports repeat the ones in the import cell at the top of
# the notebook.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
# Single VADER analyser instance, read as a global by so() and sentiment().
analyser = SentimentIntensityAnalyzer()

In [None]:
def so(sentence):
    """Return the VADER 'compound' polarity score of `sentence`.

    NOTE(review): this reuses the name of the hit-count based `so` defined in
    section 1.7 and replaces it from this cell onwards.
    """
    return analyser.polarity_scores(sentence)['compound']

In [None]:
def tb(sentence):
    """Return the sum of the TextBlob polarities of every sentence in `sentence`.

    The result is 0 when TextBlob finds no sentence.
    """
    return sum(part.sentiment.polarity for part in TextBlob(sentence).sentences)

In [None]:
def _with_sentiment_scores(row):
    """Copy one review row, replacing its text with the VADER and TextBlob scores."""
    text = row['reviewText_sub']
    return (
        row['asin'], row['helpful'], row['overall'],
        so(text), tb(text),
        row['reviewTime'], row['reviewerID'], row['reviewerName'],
        row['summary'], row['unixReviewTime'],
    )

# Columns of df4 with reviewText_sub replaced by reviewText_so and reviewText_tb.
df7 = df4.rdd.map(_with_sentiment_scores).toDF(
    ['asin','helpful','overall','reviewText_so', 'reviewText_tb','reviewTime','reviewerID','reviewerName', 'summary','unixReviewTime']
)

In [None]:
df7.show(5)

### 1.9. Overall Rating
This was computed in two ways using semantic orientation: with the VADER package and with the TextBlob package, as shown in section 1.8.

In [None]:
# Make df7 queryable from Spark SQL under the table name "df7".
df7.registerTempTable("df7")

In [None]:
# Include asin and overall alongside the two sentiment scores so that the
# product id and the star rating stay available with every score.
overall_rat = sqlcontext.sql("SELECT asin, overall, reviewText_tb, reviewText_so FROM df7")

In [None]:
overall_rat.show(5)

In [None]:
# A single collect() keeps the four values of each review together and runs
# one Spark job instead of four. The columns are read from df7 directly,
# which carries all four of them.
_score_rows = df7.select("asin", "overall", "reviewText_so", "reviewText_tb").collect()
overall_asin = [row["asin"] for row in _score_rows]
overall_rating = [row["overall"] for row in _score_rows]
overall_so = [row["reviewText_so"] for row in _score_rows]
overall_tb = [row["reviewText_tb"] for row in _score_rows]

In [None]:
# One single-column frame per collected list. The column label is passed as a
# list at construction time (a set literal is unordered, and renaming after
# the fact fails when the input list is empty).
SO = pd.DataFrame(overall_so, columns=['SO'])
ID = pd.DataFrame(overall_asin, columns=['ID'])
TB = pd.DataFrame(overall_tb, columns=['TB'])
Rating = pd.DataFrame(overall_rating, columns=['Rating'])

### 1.10. Normalizing the Ratings

In [None]:
# Put the four single-column frames side by side.
# NOTE(review): this rebinds `df`, which earlier held the Spark DataFrame of
# raw reviews.
df = pd.concat([ID, SO, TB, Rating], axis=1)

In [None]:
df.shape

In [None]:
# Drop rows with any missing value; the surviving rows keep their original
# index labels.
df = df.dropna()

In [None]:
# Weight each sentiment score by the star rating.
df['overall_so'] = df['SO']*df['Rating']
df['overall_tb'] = df['TB']*df['Rating']

In [None]:
df.head(5)

In [None]:
# Range of the weighted VADER score, used for the rescaling below.
mini = df['overall_so'].min()
maxi = df['overall_so'].max()

In [None]:
# Min-max rescale overall_so onto the range [0, 5].
# NOTE(review): divides by zero when maxi == mini - confirm this cannot happen.
review = np.array(df['overall_so'])
review_final = (review - mini)*5/(maxi - mini)
review_final.mean()

In [None]:
# Assign the NumPy array itself so the values are placed positionally.
# Wrapping it in pd.DataFrame gives it a fresh 0..n-1 index, and pandas then
# aligns on that index, which no longer matches df's labels once dropna()
# has removed rows.
df['overall_final_so'] = review_final

In [None]:
df.head(5)

In [None]:
# Per-product mean of the star rating and the rescaled score, with ID turned
# back into a regular column.
df2 = (
    df[['ID','Rating', 'overall_final_so']]
    .groupby(['ID'])
    .mean()
    .reset_index('ID')
)

# 2. Working on Reviews - Feature Extraction

### 2.1. Feature Extraction


In [None]:
# Two single-column views of the df6 table: the filtered tag lists and the
# product ids.
c = sqlcontext.sql("Select reviewText_tag from df6")
d = sqlcontext.sql("Select asin from df6")

In [None]:
# Fetch both columns in one job so that every product id stays paired with
# its own tag list; two independent collects only line up if both jobs happen
# to return the rows in the same order.
_tag_rows = sqlcontext.sql("SELECT asin, reviewText_tag FROM df6").collect()
reviewtext = [row["reviewText_tag"] for row in _tag_rows]
asin = [row["asin"] for row in _tag_rows]

In [None]:
# Local pandas frame with one row per review: product id and its tag list.
data = pd.DataFrame(columns=['asin','reviewtext'])

In [None]:
# Both lists come from the collection step above and are assigned by position.
data['asin'] = asin
data['reviewtext'] = reviewtext

In [None]:
data.head()

In [None]:
# Product id -> list of the tag lists of all its reviews, in order of first
# appearance of each product.
mapping = {
    product: [data['reviewtext'][row] for row in data[data['asin'] == product].index]
    for product in data['asin'].unique()
}

In [None]:
# Number of reviews per product, in the key order of `mapping`.
count = [len(reviews) for reviews in mapping.values()]

In [None]:
# For every product, count how often each singular noun (tag 'NN') occurs in
# its reviews and store the (noun, count) pairs sorted by descending count.
values = []
for key,value in mapping.items():
    # A dedicated name is used for the tally: the global `count` holds the
    # per-product review counts, which are needed again for the review_count
    # column and the percentage computation, and must not be rebound here.
    noun_counts = collections.Counter(
        tagged._1
        for review in value
        for tagged in review
        if tagged._2 == 'NN'
    )
    values.append(sorted(noun_counts.items(), key=operator.itemgetter(1),reverse=True))

In [None]:
# One row per product: its id and its list of (noun, count) pairs, sorted by
# descending count.
data_noun = pd.DataFrame(columns=['asin','reviewtext_noun'])
data_noun['asin'] = mapping.keys()
data_noun['reviewtext_noun'] = values

In [None]:
data_noun.head()

In [None]:
def removal(li):
    """Keep the entries of `li` whose first element is a known feature word.

    Despite its name, this is a whitelist filter: an entry survives only when
    entry[0] appears in the fixed list of product-feature nouns below. The
    order of the surviving entries is unchanged.
    """
    common_words = ['power', 'port', 'ports','usb', 'charger', 'battery', 'fit', 'weight', 'size', 'charge', 'build', 'price', 'quality', 'display','case', 'cases','panel', 'plastic', 'warranty', 'charging', 'cable', 'cost', 'connection', 'phone', 'color', 'charges', 'cord', 'protect', 'protection', 'protects', 'protector', 'package', 'packaging', 'connectors', 'connector','glass', 'brand', 'shock', 'button', 'service', 'crack', 'cracks', 'scratch', 'install', 'installation', 'bubble', 'bubbles']
    return [entry for entry in li if entry[0] in common_words]

In [None]:
# Keep only the whitelisted feature nouns in every product's noun list.
data_new = data_noun['reviewtext_noun'].map(removal)

In [None]:
# Overwrite the noun column with the filtered lists.
data_noun['reviewtext_noun'] = data_new

In [None]:
# Write the filtered noun counts to noun.csv in the working directory.
data_noun.to_csv('noun.csv',index=False)

In [None]:
# Product id -> one list of plain words per review (the tag is dropped).
# Fresh lists are built instead of overwriting the tagged items inside
# `mapping`, so `mapping` and `data` keep their (word, tag) items and this
# cell can be re-run safely.
mapping_words = {
    key: [[tagged._1 for tagged in review] for review in value]
    for key, value in mapping.items()
}

In [None]:
# Collapse each review's word list into one space-separated string, in place.
for reviews in mapping_words.values():
    reviews[:] = [' '.join(words) for words in reviews]


In [None]:
def ngram(li):
    """Return, for every string in `li`, the list of its word n-grams.

    Strings that split into more than two space-separated parts are turned
    into trigrams; all others into bigrams. Tokenisation uses NLTK's
    word_tokenize.
    """
    grams = []
    for phrase in li:
        order = 3 if len(phrase.split(' ')) > 2 else 2
        tokens = word_tokenize(phrase)
        grams.append(list(ngrams(tokens, order)))
    return grams

In [None]:
# Replace every product's review strings by their n-gram lists.
for product in mapping_words:
    mapping_words[product] = ngram(mapping_words[product])

In [None]:
# Flatten the per-review n-gram lists into one n-gram list per product.
for product, gram_lists in mapping_words.items():
    mapping_words[product] = [gram for grams in gram_lists for gram in grams]

In [None]:
def nouns(asin,text):
    """Group the n-grams of product `asin` by the nouns they contain.

    Args:
        asin: Key into the global `mapping_words`.
        text: Sequence of entries whose first element is a noun.

    Returns:
        Dict mapping each noun to the list of n-grams from
        mapping_words[asin] that contain it; nouns without any matching
        n-gram are omitted.
    """
    noun_mapping = {}
    for entry in text:
        noun = entry[0]
        for gram in mapping_words[asin]:
            if noun in gram:
                noun_mapping.setdefault(noun, []).append(gram)
    return noun_mapping

In [None]:
# One noun -> n-grams dict per row of data_noun (column 0: product id,
# column 1: filtered noun list).
maps = [
    nouns(data_noun.iloc[row, 0], data_noun.iloc[row, 1])
    for row in range(len(data_noun))
]

In [None]:
def sentiment(sentence):
    """Return a signed VADER score for `sentence`.

    When the compound score is negative the negated 'neg' score is returned;
    otherwise the 'pos' score is returned.
    """
    polarity = analyser.polarity_scores(sentence)
    if polarity['compound'] < 0:
        return -polarity['neg']
    return polarity['pos']

In [None]:
# For every product, average the sentiment of the n-grams of each noun and
# store the (noun, average) pairs sorted by ascending average.
final = []
for noun_grams in maps:
    averages = {}
    for noun, grams in noun_grams.items():
        total = sum(sentiment(' '.join(gram)) for gram in grams)
        averages[noun] = total/len(grams)
    final.append(sorted(averages.items(),key=operator.itemgetter(1)))

In [None]:
# Attach the per-noun average sentiment scores (one sorted list per product).
data_noun['final_featur_scores'] = final

In [None]:
data_noun.head(5)

In [None]:
# Turn every (noun, score) tuple into a [noun, score] list, in place.
for i in range(len(final)):
    for j in range(len(final[i])):
        final[i][j] = list(final[i][j]) 
        

In [None]:
# Same conversion for the (noun, count) tuples held in data_new.
for i in range(len(data_new)):
    for j in range(len(data_new[i])):
        data_new[i][j] = list(data_new[i][j]) 
        

In [None]:
# Plain Python lists of the per-product entries, consumed by the merge step
# that builds final_map.
new = list(data_new)
final = list(data_noun['final_featur_scores'])

In [None]:
def final_count_sort(first,second):
    """Join two lists of [key, value] pairs on their key.

    For every pair in `first` (outer order) and every pair in `second` with
    the same key, emit [key, value from second, value from first].
    """
    return [
        [left[0], right[1], left[1]]
        for left in first
        for right in second
        if left[0] == right[0]
    ]

In [None]:
# Per product: [noun, score, count] triples, in the order of the count list.
final_map = [final_count_sort(new[k], final[k]) for k in range(len(new))]

In [None]:
# Attach the merged [noun, score, count] triples.
data_noun['final_features'] = final_map
# NOTE(review): `count` must hold one review count per product, in the row
# order of data_noun - verify it was not rebound by an intermediate cell.
data_noun['review_count'] = count

### 2.2. Extract Top 5 Positive and Negative Features


In [None]:
def pos_neg(text, threshold=0.1, top_n=5):
    """Split feature triples into the first positive and negative ones.

    Args:
        text: Sequence of [name, score, count] entries, scanned in order.
        threshold: Entries with score >= threshold count as positive, those
            with score < threshold as negative. Defaults to 0.1.
        top_n: Maximum number of entries kept per group. Defaults to 5.

    Returns:
        {'pos': [...], 'neg': [...]} where each item is [name, count] and
        each list holds at most `top_n` items in scan order.
    """
    buckets = {'pos': [], 'neg': []}
    for entry in text:
        if entry[1] >= threshold:
            label = 'pos'
        elif entry[1] < threshold:
            label = 'neg'
        else:
            # Scores that compare as neither (e.g. NaN) belong to no group.
            continue
        if len(buckets[label]) < top_n:
            buckets[label].append([entry[0], entry[2]])
    return buckets

In [None]:
# One vectorised pass: .map() already covers every row, so wrapping it in a
# loop over the rows only recomputed the same column again and again.
data_noun['pos_neg'] = data_noun['final_features'].map(pos_neg)

In [None]:
# Per-product lists of [name, count] pairs. They get their own names so that
# the pos() tagging function from section 1.5 is not overwritten by a list.
pos_feats = []
neg_feats = []
for i in range(len(data_noun)):
    pos_feats.append(data_noun['pos_neg'][i]['pos'])
    neg_feats.append(data_noun['pos_neg'][i]['neg'])

# Flat lists of the counts and names of the five features per product.
# NOTE(review): indexing with range(5) requires every product to have at
# least five positive and five negative features - confirm for the data set.
pos_count = []
neg_count = []
pos_name = []
neg_name = []
for i in range(len(data_noun)):
    for j in range(5):
        pos_count.append(pos_feats[i][j][1])
        neg_count.append(neg_feats[i][j][1])
        pos_name.append(pos_feats[i][j][0])
        neg_name.append(neg_feats[i][j][0])

In [None]:
# Arrange the flat lists as one row of five values per product. The row count
# is inferred (-1) instead of being fixed at 20 products.
pos_count = np.array(pos_count).reshape(-1, 5)
neg_count = np.array(neg_count).reshape(-1, 5)

pos_name = np.array(pos_name).reshape(-1, 5)
neg_name = np.array(neg_name).reshape(-1, 5)

In [None]:
# Work on float copies: writing the percentages row by row into arrays built
# from integer counts would truncate every value to a whole number.
# NOTE(review): re-running this cell scales the values a second time.
pos_count = pos_count.astype(float)
neg_count = neg_count.astype(float)
for i in range(len(count)):
    pos_count[i] = pos_count[i]*100/count[i]
    neg_count[i] = neg_count[i]*100/count[i]

In [None]:
# Convert each array row back into a Python list. Iterating over the rows
# covers however many products there are instead of assuming exactly 20.
pos_val = [list(row) for row in pos_name]
neg_val = [list(row) for row in neg_name]

pos_cval = [list(row) for row in pos_count]
neg_cval = [list(row) for row in neg_count]

In [None]:
# Positive features per product: the names from pos_val and the values from
# pos_cval, next to the product id.
# NOTE(review): this rebinds `df` once more; the normalised ratings frame
# built in section 1.10 is no longer reachable under that name.
df = pd.DataFrame(columns = ['asin', 'pos', 'count'])
df['pos'] = pos_val
df['count'] = pos_cval
df['asin'] = data_noun['asin']

In [None]:
# Negative features per product, same layout as the positive frame.
df1 = pd.DataFrame(columns = ['asin', 'neg', 'count'])
df1['neg'] = neg_val
df1['count'] = neg_cval
df1['asin'] = data_noun['asin']