In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
NER = spacy.load("en_core_web_lg")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('omw-1.4')

df = pd.read_csv('amazon_alexa.tsv', sep='\t')

df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [2]:
df = df.drop(['rating', 'date','feedback' ,'variation'], axis=1)

In [3]:
def clean_text(review):
    stopword = set(stopwords.words('english'))
    stopword.add('')
    review = word_tokenize(review)
    review = [rev.lower() for rev in review]
    review = [re.sub("[^A-Za-z0-9]",' ',rev) for rev in review]
    review = [re.sub(r'[0-9]+',' ',rev) for rev in review]
    review = " ".join(WordNetLemmatizer().lemmatize(i) for i in review if i not in stopword)
    
    return review

df['verified_reviews'] = df['verified_reviews'].apply(lambda rev: clean_text(rev))

df

Unnamed: 0,verified_reviews
0,love echo
1,loved
2,sometimes playing game answer question corre...
3,lot fun thing yr old learns dinosaur con...
4,music
...,...
3145,perfect kid adult everyone
3146,listening music searching location checkin...
3147,love thing running entire home tv light ...
3148,complaint sound quality n t great mostly use...


In [4]:
df['verified_reviews_POS'] = df['verified_reviews'].apply(lambda x: nltk.pos_tag(x.split()))
df


Unnamed: 0,verified_reviews,verified_reviews_POS
0,love echo,"[(love, NN), (echo, NN)]"
1,loved,"[(loved, VBN)]"
2,sometimes playing game answer question corre...,"[(sometimes, RB), (playing, VBG), (game, NN), ..."
3,lot fun thing yr old learns dinosaur con...,"[(lot, NN), (fun, JJ), (thing, NN), (yr, NN), ..."
4,music,"[(music, NN)]"
...,...,...
3145,perfect kid adult everyone,"[(perfect, JJ), (kid, NN), (adult, NN), (every..."
3146,listening music searching location checkin...,"[(listening, VBG), (music, NN), (searching, VB..."
3147,love thing running entire home tv light ...,"[(love, JJ), (thing, NN), (running, VBG), (ent..."
3148,complaint sound quality n t great mostly use...,"[(complaint, NN), (sound, NN), (quality, NN), ..."


In [5]:
dictionary = {}

In [6]:
for rev in df['verified_reviews_POS']:
    for word in rev:
        if word[1] in dictionary:
            #if word[0] not in word[1]:
            dictionary[word[1]].add(word[0])
        else:
            dictionary[word[1]] = set(word[0])


In [7]:
for key in dictionary:
    dictionary[key] = list(dictionary[key])

In [8]:
# print(dictionary)

In [9]:
out = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dictionary.items() ]))
out

Unnamed: 0,NN,VBN,RB,VBG,JJR,JJ,VBP,IN,VBZ,VBD,...,TO,WRB,RP,WP,EX,WP$,WDT,NNP,RBS,UH
0,tip,surprised,tip,wedding,worse,pen,am,within,lit,surprised,...,n,woken,out,chat,there,s,chat,xbox,a,s
1,shop,adapted,s,reconnecting,louder,traditional,home,audioapple,johnny,proved,...,na,whenever,over,u,t,o,a,speaker,best,yes
2,bill,s,certainly,transferring,shower,proved,india,thorough,phillips,s,...,a,when,off,a,,w,v,xfinity,m,y
3,evening,v,shortly,needing,s,tip,artist,worth,confuses,convinced,...,to,o,ask,discoveredthat,,e,w,u,e,oh
4,spell,bought,primarily,gaming,bother,opt,s,alexi,s,expected,...,,w,u,s,,h,e,s,k,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,haywire,,,,,,,,,,...,,,,,,,,,,
1783,mode,,,,,,,,,,...,,,,,,,,,,
1784,rarity,,,,,,,,,,...,,,,,,,,,,
1785,combination,,,,,,,,,,...,,,,,,,,,,


In [10]:
!pip install xlwt
out.to_excel('postagging.xlsx')



In [None]:
df['verified_reviews_NER'] = df['verified_reviews'].apply(lambda x: NER(x))

In [None]:
df['verified_reviews_NER']

In [None]:
dict_ner = {}
for rev in df['verified_reviews_NER']:
    for word in rev.ents:
        if word.label_ in dict_ner:
            dict_ner[word.label_].add(word.text)
        else:
            dict_ner[word.label_] = set(word.text)
            
dict_ner

        

In [None]:
for key in dict_ner:
    dict_ner[key] = list(dict_ner[key])

In [34]:
out_ner = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dict_ner.items() ]))
out_ner

Unnamed: 0,ORG,CARDINAL,ORDINAL,DATE,WORK_OF_ART,LOC,GPE,TIME,MONEY,PERSON,PRODUCT,FAC,NORP,EVENT,LANGUAGE,PERCENT,QUANTITY,LAW
0,Apple Music,229,2nd,May,LOVE MY ECHO SHOW,echo,echo,6 hours,34;lights.&#34,Aunt,Kasa,Echo Tower,an Echo Dot,This Echo Dot,English,1,,e
1,REFURBISHED ECHO DOT &,hundreds,First,10 year old,L,the Echo Show,Alexas,two second,34;turn,Phillips,SHOW,,Southern,The Echo Dot,E,15%,b,the Training Coordinator
2,the PhilipsHue Light Bulb,a million,3xs,the 1st,The Echo Plus,Dots,Netflix,one night,29.99,Contacted Amazon,b,x,Americans,an Echo Dot,i,around 20%,1GB,
3,Sprint,hundred,Third,a year,Mother’s Day,the Echo Dots,i,4 hours,99,Hue,i,E,i,c,h,%,i,o
4,Phillips,2package,6th,August,an Echo Dot,Long Island,f,4am,100,Alexa Echo,Spot,c,I,,Spanish,0,m,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,CSI,,,,,,,,,,,,,,,,,
198,AI,,,,,,,,,,,,,,,,,
199,the Amazon Echo,,,,,,,,,,,,,,,,,
200,Amazon,,,,,,,,,,,,,,,,,


In [35]:
out_ner.to_excel('ner.xlsx')

In [36]:
dict_dep = { 'Token': [], 'Relation':[],'Head':[],'Children':[]}
            

In [37]:
for rev in df['verified_reviews_NER']:
    for token in rev:
        dict_dep['Token'].append(str(token.text))
        dict_dep['Relation'].append(str(token.dep_))
        dict_dep['Head'].append(str(token.head.text))
        dict_dep['Children'].append(str([child for child in token.children]))

            
out_dep = pd.DataFrame(dict_dep)
out_dep.head(-10)

Unnamed: 0,Token,Relation,Head,Children
0,Love,ROOT,Love,"[Echo, !]"
1,my,poss,Echo,[]
2,Echo,dobj,Love,[my]
3,!,punct,Love,[]
4,Loved,ROOT,Loved,"[it, !]"
...,...,...,...,...
92990,echo,pobj,with,[size]
92991,or,cc,go,[]
92992,make,conj,go,[sure]
92993,sure,ccomp,make,[hook]


In [38]:
out_dep.to_excel('dep.xlsx')