In [8]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
!python -m spacy download en_core_web_lg
NER = spacy.load("en_core_web_lg")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

df = pd.read_csv('amazon_alexa.tsv', sep='\t')

df.head()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 9.8 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [9]:
df = df.drop(['rating', 'date','feedback' ,'variation'], axis=1)

In [10]:
def clean_text(review):
    stopword = set(stopwords.words('english'))
    stopword.add('')
    review = word_tokenize(review)
    review = [rev.lower() for rev in review]
    review = [re.sub("[^A-Za-z0-9]",' ',rev) for rev in review]
    review = [re.sub(r'[0-9]+',' ',rev) for rev in review]
    review = " ".join(WordNetLemmatizer().lemmatize(i) for i in review if i not in stopword)
    
    return review

df['verified_reviews'] = df['verified_reviews'].apply(lambda rev: clean_text(rev))

df

Unnamed: 0,verified_reviews
0,love echo
1,loved
2,sometimes playing game answer question corre...
3,lot fun thing yr old learns dinosaur con...
4,music
...,...
3145,perfect kid adult everyone
3146,listening music searching location checkin...
3147,love thing running entire home tv light ...
3148,complaint sound quality n t great mostly use...


In [11]:
df['verified_reviews_POS'] = df['verified_reviews'].apply(lambda x: nltk.pos_tag(x.split()))
df


Unnamed: 0,verified_reviews,verified_reviews_POS
0,love echo,"[(love, NN), (echo, NN)]"
1,loved,"[(loved, VBN)]"
2,sometimes playing game answer question corre...,"[(sometimes, RB), (playing, VBG), (game, NN), ..."
3,lot fun thing yr old learns dinosaur con...,"[(lot, NN), (fun, JJ), (thing, NN), (yr, NN), ..."
4,music,"[(music, NN)]"
...,...,...
3145,perfect kid adult everyone,"[(perfect, JJ), (kid, NN), (adult, NN), (every..."
3146,listening music searching location checkin...,"[(listening, VBG), (music, NN), (searching, VB..."
3147,love thing running entire home tv light ...,"[(love, JJ), (thing, NN), (running, VBG), (ent..."
3148,complaint sound quality n t great mostly use...,"[(complaint, NN), (sound, NN), (quality, NN), ..."


In [12]:
dictionary = {}
for rev in df['verified_reviews_POS']:
    for word in rev:
        if word[1] in dictionary:
            #if word[0] not in word[1]:
            dictionary[word[1]].add(word[0])
        else:
            dictionary[word[1]] = set(word[0])


In [13]:
for key in dictionary:
    dictionary[key] = list(dictionary[key])

In [14]:
postag = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dictionary.items() ]))
postag

Unnamed: 0,NN,VBN,RB,VBG,JJR,JJ,VBP,IN,VBZ,VBD,...,TO,WRB,RP,WP,EX,WP$,WDT,NNP,RBS,UH
0,medium,read,i,understanding,cheaper,re,read,en,corresponds,tickled,...,a,when,bathroom,who,there,s,v,x,k,s
1,stat,ignored,bit,exchanging,sister,fire,buy,nicer,depreciates,re,...,na,o,alexa,r,t,o,r,s,m,y
2,navigation,raised,importantly,interacting,cualquier,intrusive,reccomend,whether,broken,handled,...,n,w,audio,s,,w,whatever,jump,best,yes
3,agent,cancelled,straightforward,touching,bigger,adopter,re,with,task,red,...,to,wow,p,warehouse,,h,whatsoever,fire,e,e
4,pin,dated,presumably,asking,w,tardis,gb,thestand,guy,ran,...,,whenever,awhile,u,,e,w,k,a,oh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,miss,,,,,,,,,,...,,,,,,,,,,
1783,standalone,,,,,,,,,,...,,,,,,,,,,
1784,excellent,,,,,,,,,,...,,,,,,,,,,
1785,department,,,,,,,,,,...,,,,,,,,,,


In [15]:
!pip install xlwt
postag.to_excel('postagging.xlsx')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
df['verified_reviews_NER'] = df['verified_reviews'].apply(lambda x: NER(x))

In [17]:
df['verified_reviews_NER']

0                                         (love, echo,  )
1                                              (loved,  )
2       (sometimes, playing, game,   , answer, questio...
3       (lot, fun, thing,     , yr, old, learns, dinos...
4                                                 (music)
                              ...                        
3145             (perfect, kid,   , adult, everyone,    )
3146    (listening, music,   , searching, location,   ...
3147    (love, thing,   , running, entire, home,   , t...
3148    (complaint, sound, quality, n, t, great,   , m...
3149                                               (good)
Name: verified_reviews_NER, Length: 3150, dtype: object

In [18]:
dict_ner = {}
for rev in df['verified_reviews_NER']:
    for word in rev.ents:
        if word.label_ in dict_ner:
            dict_ner[word.label_].add(word.text)
        else:
            dict_ner[word.label_] = set(word.text)
            
dict_ner

        

{'CARDINAL': {'almost half',
  'e',
  'five',
  'four',
  'four   five',
  'half',
  'hundred',
  'hundred thousand',
  'n',
  'nearly half',
  'o',
  'one',
  'one two',
  'six',
  'three',
  'three three',
  'two',
  'two   one',
  'two   three',
  'two three',
  'zero'},
 'TIME': {' ',
  'afternoon',
  'couple minute',
  'e',
  'evening',
  'every morning',
  'g',
  'hour',
  'hour minute',
  'i',
  'last night',
  'late night',
  'le   minute  ',
  'm',
  'middle night',
  'minute',
  'morning',
  'multiple hour',
  'n',
  'night',
  'night hour',
  'o',
  'one night',
  'r',
  'sometimes night',
  'v',
  'y'},
 'ORDINAL': {'c',
  'd',
  'e',
  'first',
  'fourth',
  'n',
  'o',
  's',
  'second',
  'third'},
 'ORG': {'a',
  'abc',
  'abd',
  'alexa',
  'alexa clean house',
  'amazon',
  'amazon  s',
  'amazon alexa',
  'amazon cam  s job',
  'amazon n',
  'amazon n t',
  'amazon netflix',
  'amazon sonos',
  'amazon web service n t',
  'amazon zigbee',
  'arlo security',
  'ase',


In [19]:
for key in dict_ner:
    dict_ner[key] = list(dict_ner[key])

In [20]:
out_ner = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dict_ner.items() ]))
out_ner

Unnamed: 0,CARDINAL,TIME,ORDINAL,ORG,PERSON,NORP,DATE,GPE,LANGUAGE,MONEY,QUANTITY,LOC,FAC
0,hundred thousand,minute,third,amazon netflix,alexa,scottish,thursday,miami,english,v,,x,x
1,two three,g,first,isue month,alex app,english,one age ago,tunein,l,r,f,r,l
2,four,night,s,wifi hdm tv,june hulu,l,p,alabama,i,,o,s,
3,two three,y,second,iheartradio,gb,i,weekly,north carolina,s,p,ten foot,,u
4,hundred,hour,o,viceo,bob dylan,screenselect,august,suffolk county,n,every penny,n,o,zigbee hub
...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,,,,sony,,,,,,,,,
102,,,,nervana,,,,,,,,,
103,,,,a,,,,,,,,,
104,,,,hbo,,,,,,,,,


In [21]:
out_ner.to_excel('ner.xlsx')

In [22]:
dict_dep = { 'Token': [], 'Relation':[],'Head':[],'Children':[]}
for rev in df['verified_reviews_NER']:
    for token in rev:
        dict_dep['Token'].append(str(token.text))
        dict_dep['Relation'].append(str(token.dep_))
        dict_dep['Head'].append(str(token.head.text))
        dict_dep['Children'].append(str([child for child in token.children]))

            
out_dep = pd.DataFrame(dict_dep)
out_dep.head(-10)

Unnamed: 0,Token,Relation,Head,Children
0,love,nsubj,echo,[]
1,echo,ROOT,echo,"[love, ]"
2,,dep,echo,[]
3,loved,ROOT,loved,[ ]
4,,dep,loved,[]
...,...,...,...,...
51080,kind,amod,thing,[]
51081,thing,dobj,re,"[bothered, kind, ]"
51082,,dep,thing,[]
51083,d,nsubj,go,[]


In [23]:
out_dep.to_excel('dep.xlsx')