In [13]:
import re
import datetime
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import sqlalchemy
from sqlalchemy import create_engine,Table,Column,Integer,String,MetaData,ForeignKey,Date,update
import warnings
from IPython.display import display
import io
import os

from nltk.parse.stanford import StanfordDependencyParser

#os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
#os.environ["CLASSPATH"] = "/home/chweng/Desktop/SemanticProj/StanfordNLP/jars"
#os.environ["STANFORD_MODELS"] = "/home/chweng/Desktop/SemanticProj/StanfordNLP/models"
os.environ["CLASSPATH"] ='/Users/chweng/Desktop/StanfordNLP/jars'
os.environ["STANFORD_MODELS"] ='/Users/chweng/Desktop/StanfordNLP/models'

In [14]:
def load_csv(date):
    #載入資料庫抓下來的csv檔存成pandas dataframe
    df=pd.DataFrame.from_csv("data/reviews-%s.csv"%(date), encoding="utf-8",index_col=None)
    #.reset_index(drop=True)
    return df

In [15]:
def summary(df):
    #確認資料數
    prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]
    #choose type
    for prodType in prodTypes:
        print(prodType,len(df.loc[df["ptype"]==prodType]["review"]))
    print("total reviews=",len(df),"\n")

In [16]:
def store_prodRevs_in_a_list(prodType,df):
    '''將pandas dataframe內的評論提取，存成一個List。該List
    '''
    df=df.loc[df["ptype"]==prodType]
    pReviewsGroup=df.groupby(['ptype','pid'])

    reviews_list=[]
    for key,pReviews in pReviewsGroup:
        rids=pReviews[["rid","review"]].values[:,0].tolist()
        reviews=pReviews[["rid","review"]].values[:,1].tolist()

        ptypes=[key[0]]*len(rids)
        pids=[key[1]]*len(rids)
        reviews_list+=list(zip(ptypes,pids,rids,reviews))

    print("number of products=",len(pReviewsGroup))
    print("number of reviews in type %s=%i"%(prodType,len(reviews_list)))
    
    return reviews_list

In [17]:
def cleanText(review):
    #print(len(review))
    review=review.lower().replace('\n','').replace(';',',').replace('!',',') \
                 .replace('i.e.','').strip()
    review=re.sub("\.+,", ",",review)     # replace ......., with ,
    review=re.sub("\.+",".", review)      # replace ........ with .
    review=re.sub("\,+",",", review)      # replace ,,,,,,,, with ,
    review=re.sub('[\"\'\*]','', review)  # remove ", ' and *
    
    #remove emoji
    emoji_pattern = re.compile("["  
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    review=re.sub(emoji_pattern,"", review)
    review=re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)',"",review)
    # notice that, the question mark makes the preceding token
    # in the regular expression optional.
    # also, (?: xxx) is a non-capturing group
    # let us probably come back to this re later.
    return review

In [18]:
def splitSentences(review):
    rev=re.split(r' *[\.\?!][\'"\)\]]* *', review)
    return rev

In [19]:
def reviewsProcessing(reviews_list):
    reviews_list=[(review[0],review[1],review[2],splitSentences(cleanText(review[3])))
                              for review in reviews_list if review[3]==review[3]]
    return reviews_list

In [20]:
def extractSentencesOfTheSelectedReviews(prodIDs,reviews_list):
    # extract the sentences of the targeted products (those products are contained in the list: prodIDs)
    buf = io.StringIO(prodIDs)
    prodIDs=buf.readlines()
    prodIDs=list(map(lambda x:re.sub('[\n]','',x) ,prodIDs))
    selectedReviews=[review[3] for review in reviews_list if(review[1] in prodIDs)]
    sentences=[sentence for sentences in selectedReviews for sentence in sentences]
    print("number of sentences from the selected prodIDs %s=%i"%(prodIDs,len(sentences)))
    return sentences

In [21]:
def findMatchedSentences(pattern,sentences):
    sentsMatched=[sentence for sentence in sentences if pattern in sentence]
    print("number of the sentences matched the pattern '%s'=%i"%(pattern,len(sentences)))
    return sentsMatched
   
#    for idx,sentence in enumerate(sentences):
#        if('suction' in sentence):
#            print(idx,repr(sentence),'\n')

---

In [22]:
#product types
prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]

df=load_csv("2017-02-03")                               # load reviews from csv
summary(df)
reviews_list=store_prodRevs_in_a_list(prodTypes[1],df)  # store reviews into a list
reviews_list=reviewsProcessing(reviews_list)            # the sentences of the reviews are cleaned

#product id
prodIDs='''B00002N8CX
'''                        
#prodIDs='''B00002N8CX
#B00AZBIXHG
#B00AZBIV9Q
#'''

# obtain all the sentences of the reviews of the selected products
sentences=extractSentencesOfTheSelectedReviews(prodIDs,reviews_list)

sentsMatched=findMatchedSentences("noise",sentences)

central 335
canister 23294
handheld 79899
robotic 26944
stick 57268
upright 102643
wetdry 16821
total reviews= 307204 

number of products= 350
number of reviews in type canister=23294
number of sentences from the selected prodIDs ['B00002N8CX']=14427
number of the sentences matched the pattern 'noise'=14427


In [23]:
sentsMatched[:5]

['the noise',
 'it produces a powerful vacuum, easily maneuverable, convenient for storing, the noise is not too bad, and a good price',
 'the noise isnt too painful, either (some  low-powered vacuums make that high-pitched whining noise that just makes  you want to live in filth)',
 'it is higher-power, (better suction, but it blew a circuit the first time i used it), and the noise it makes is much louder than the old one',
 'and if you can vacuum your home 2x faster, the noise lasts only half as long']

#### now, let's check if the Stanford parser works

In [26]:
#sent='i found no neg_fault with the suction power, the machine picked up bits of paper and cat litter easily without multiple passes '
sent='suction is not good at all'
#sent='actually you don\'t need a carpet attachment with this because it has so much suction'
#sent='it costs too much money'
print(repr(sent),'\n')
parser = StanfordDependencyParser()
res = list(parser.parse(sent.split()))

#for row in res[0].triples():
#    print(row)

rels=[rel for rel in res[0].triples()]
   
#for row in res[0].tree():
#    if type(row) is not str:
#        #row.draw()
#        display(row)

'suction is not good at all' 



In [27]:
for rel in rels:
    print(rel)

(('good', 'JJ'), 'nsubj', ('suction', 'NN'))
(('good', 'JJ'), 'cop', ('is', 'VBZ'))
(('good', 'JJ'), 'neg', ('not', 'RB'))
(('good', 'JJ'), 'nmod', ('all', 'DT'))
(('all', 'DT'), 'case', ('at', 'IN'))


In [268]:
for sent in sentsMatched[:1]:

    parser = StanfordDependencyParser()
    result = list(parser.parse(sent.split()))
    rels=[rel for rel in result[0].triples()]

    negList=['no','not']
    for triple in rels:
        if triple[1]='neg':
            if triple[0][1] in negList:
                
    
    
    
    depWords=[]
    pattern='noise'

    depPairs=[(rel[0][0],rel[2][0]) for rel in rels]
    for depPair in depPairs:
        if(pattern in depPair[0]):
            depWords.append(depPair[1])
        if(pattern in depPair[1]):
            depWords.append(depPair[0])
    print(sent,depWords,'\n')

SyntaxError: invalid syntax (<ipython-input-268-45e65b8f8227>, line 9)

#### todo: 
* find amod words for visualization 
* 

In [25]:
for sent in sentsMatched[:50]:

    parser = StanfordDependencyParser()
    result = list(parser.parse(sent.split()))
    rels=[rel for rel in result[0].triples()]

    depWords=[]
    pattern='noise'

    depPairs=[(rel[0][0],rel[2][0]) for rel in rels]
    for depPair in depPairs:
        if(pattern in depPair[0]):
            depWords.append(depPair[1])
        if(pattern in depPair[1]):
            depWords.append(depPair[0])
    print(sent,depWords,'\n')

the noise ['the'] 

it produces a powerful vacuum, easily maneuverable, convenient for storing, the noise is not too bad, and a good price ['bad,', 'the'] 

the noise isnt too painful, either (some  low-powered vacuums make that high-pitched whining noise that just makes  you want to live in filth) ['isnt', 'make', 'that', 'high-pitched', 'whining', 'makes'] 

it is higher-power, (better suction, but it blew a circuit the first time i used it), and the noise it makes is much louder than the old one ['louder', 'the', 'makes'] 

and if you can vacuum your home 2x faster, the noise lasts only half as long ['lasts', 'the'] 

unfortunately, the rest of the people in the house were subjected to airplane jet engine level noise ['unfortunately,', 'jet', 'engine', 'level'] 

con: the noise is unbearably loud ['loud', 'the'] 

my wife wouldnt let me keep it anyway because she said it makes a very loud high pitched noise ['makes', 'a', 'loud', 'high', 'pitched'] 

im okay with the noise as some v

---

#### codes grave yard

In [151]:
text="hahaha :) :( :D "
re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)', ".",text)

'hahaha . . . '

In [160]:
text="hahaha"
re.sub('(?:haha)', ".",text)

'.ha'

In [176]:
text="hahaha"
m=re.match('(?:haha)',text)
print(m,"\n",m.groups())

<_sre.SRE_Match object; span=(0, 4), match='haha'> 
 ()


In [173]:
text='1st'
m=re.match('([0-9]+)(?:st|nd|rd|th)?',text)
print(m.groups())

('1',)


In [158]:
text="hahaha :) :( :D "
m=re.match('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
print(m)

None


In [177]:
a=['aa|a','bb|b']

def f(x):
    return x.split("|")

list(map(f,a))

[['aa', 'a'], ['bb', 'b']]

In [138]:
text = "Am I poor and vulnerable ? My heart is turned and tossed..., and it's all because of you..."
text=re.sub("\.+,", ",",text)
text=re.sub("\.+", ".",text)
sentence = re.split(r' *[\.\?!][\'"\)\]]* +', text)
print(sentence)

['Am I poor and vulnerable', "My heart is turned and tossed, and it's all because of you."]
