In [1]:
import nltk

## Tokenization

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
#Importing the dataset from the corpus
from nltk.corpus import product_reviews_1

In [4]:
data = product_reviews_1.raw("Nokia_6610.txt")

In [11]:
data[0:5000]

'*****************************************************************************\n* Annotated by: Minqing Hu and Bing Liu, 2004.\n*\t\tDepartment of Computer Sicence\n*               University of Illinois at Chicago              \n*\n* Product name: Nokia 6610\n* Review Source: amazon.com\n*\n* See Readme.txt to find the meaning of each symbol. \n*****************************************************************************\n\n[t]excellent phone , excellent service . \n##i am a business user who heavily depend on mobile service . \nphone[+3], work[+2]##there is much which has been said in other reviews about the features of this phone , it is a great phone , mine worked without any problems right out of the box . \n##just double check with customer service to ensure the number provided by amazon is for the city / exchange you wanted . \nat&t customer service[-2]##after several years of torture in the hands of at&t customer service i am delighted to drop them , and look forward to august 

## Word Tokenization

- Grouping of words present in the dataset

In [12]:
words = nltk.word_tokenize(data)

In [14]:
print(words[0:500])

['*****************************************************************************', '*', 'Annotated', 'by', ':', 'Minqing', 'Hu', 'and', 'Bing', 'Liu', ',', '2004', '.', '*', 'Department', 'of', 'Computer', 'Sicence', '*', 'University', 'of', 'Illinois', 'at', 'Chicago', '*', '*', 'Product', 'name', ':', 'Nokia', '6610', '*', 'Review', 'Source', ':', 'amazon.com', '*', '*', 'See', 'Readme.txt', 'to', 'find', 'the', 'meaning', 'of', 'each', 'symbol', '.', '*****************************************************************************', '[', 't', ']', 'excellent', 'phone', ',', 'excellent', 'service', '.', '#', '#', 'i', 'am', 'a', 'business', 'user', 'who', 'heavily', 'depend', 'on', 'mobile', 'service', '.', 'phone', '[', '+3', ']', ',', 'work', '[', '+2', ']', '#', '#', 'there', 'is', 'much', 'which', 'has', 'been', 'said', 'in', 'other', 'reviews', 'about', 'the', 'features', 'of', 'this', 'phone', ',', 'it', 'is', 'a', 'great', 'phone', ',', 'mine', 'worked', 'without', 'any', 'problem

## Sentence Tokenization

- Grouping of sentences present in the dataset

In [15]:
sentences = nltk.sent_tokenize(data)

In [18]:
print(sentences[0:100])

['*****************************************************************************\n* Annotated by: Minqing Hu and Bing Liu, 2004.', '*\t\tDepartment of Computer Sicence\n*               University of Illinois at Chicago              \n*\n* Product name: Nokia 6610\n* Review Source: amazon.com\n*\n* See Readme.txt to find the meaning of each symbol.', '*****************************************************************************\n\n[t]excellent phone , excellent service .', '##i am a business user who heavily depend on mobile service .', 'phone[+3], work[+2]##there is much which has been said in other reviews about the features of this phone , it is a great phone , mine worked without any problems right out of the box .', '##just double check with customer service to ensure the number provided by amazon is for the city / exchange you wanted .', 'at&t customer service[-2]##after several years of torture in the hands of at&t customer service i am delighted to drop them , and look forward to

## Stemming

In [19]:
from nltk.stem import PorterStemmer

In [20]:
ps = PorterStemmer()

In [25]:
for w in words[0:50]:
    print(w+" : "+str(ps.stem(w)))

***************************************************************************** : *****************************************************************************
* : *
Annotated : annot
by : by
: : :
Minqing : minq
Hu : Hu
and : and
Bing : bing
Liu : liu
, : ,
2004 : 2004
. : .
* : *
Department : depart
of : of
Computer : comput
Sicence : sicenc
* : *
University : univers
of : of
Illinois : illinoi
at : at
Chicago : chicago
* : *
* : *
Product : product
name : name
: : :
Nokia : nokia
6610 : 6610
* : *
Review : review
Source : sourc
: : :
amazon.com : amazon.com
* : *
* : *
See : see
Readme.txt : readme.txt
to : to
find : find
the : the
meaning : mean
of : of
each : each
symbol : symbol
. : .
***************************************************************************** : *****************************************************************************
[ : [


## Lemmatizing

- Similar to that of Stemming except it returns a complete word with meaning

In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
lemmatizer = WordNetLemmatizer()

In [28]:
for w in words[0:60]:
    print(w+" : "+str(lemmatizer.lemmatize(w)))

***************************************************************************** : *****************************************************************************
* : *
Annotated : Annotated
by : by
: : :
Minqing : Minqing
Hu : Hu
and : and
Bing : Bing
Liu : Liu
, : ,
2004 : 2004
. : .
* : *
Department : Department
of : of
Computer : Computer
Sicence : Sicence
* : *
University : University
of : of
Illinois : Illinois
at : at
Chicago : Chicago
* : *
* : *
Product : Product
name : name
: : :
Nokia : Nokia
6610 : 6610
* : *
Review : Review
Source : Source
: : :
amazon.com : amazon.com
* : *
* : *
See : See
Readme.txt : Readme.txt
to : to
find : find
the : the
meaning : meaning
of : of
each : each
symbol : symbol
. : .
***************************************************************************** : *****************************************************************************
[ : [
t : t
] : ]
excellent : excellent
phone : phone
, : ,
excellent : excellent
service : service
. : .
# : #
# : #


## Part of the speech tagging

- Part of the speech tagging is what it sounds like, tagging the individual words with grammar

In [33]:
for i in sentences[0:10]:
    words = nltk.word_tokenize(i)
    
    tagged = nltk.pos_tag(words)
    
    print(tagged)

[('*****************************************************************************', 'JJ'), ('*', 'NNP'), ('Annotated', 'VBN'), ('by', 'IN'), (':', ':'), ('Minqing', 'NNP'), ('Hu', 'NNP'), ('and', 'CC'), ('Bing', 'NNP'), ('Liu', 'NNP'), (',', ','), ('2004', 'CD'), ('.', '.')]
[('*', 'JJ'), ('Department', 'NNP'), ('of', 'IN'), ('Computer', 'NNP'), ('Sicence', 'NNP'), ('*', 'NNP'), ('University', 'NNP'), ('of', 'IN'), ('Illinois', 'NNP'), ('at', 'IN'), ('Chicago', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('Product', 'NNP'), ('name', 'NN'), (':', ':'), ('Nokia', 'JJ'), ('6610', 'CD'), ('*', 'NNP'), ('Review', 'NNP'), ('Source', 'NNP'), (':', ':'), ('amazon.com', 'NN'), ('*', 'VBZ'), ('*', 'NNP'), ('See', 'NNP'), ('Readme.txt', 'NNP'), ('to', 'TO'), ('find', 'VB'), ('the', 'DT'), ('meaning', 'NN'), ('of', 'IN'), ('each', 'DT'), ('symbol', 'NN'), ('.', '.')]
[('*****************************************************************************', 'JJ'), ('[', 'NNP'), ('t', 'NN'), (']', 'NNP'), ('excellen

## Stop words

- These are the type of the words that affect our data analysis process, these are the words are redundant need to be removed or filtered

In [34]:
from nltk.corpus import stopwords

In [35]:
stop_words = set(stopwords.words("english"))

In [37]:
print(stop_words)

filtered_sent = []

{'how', 'after', 'where', 'between', 'd', 'had', 've', 'at', "shan't", 'hers', "it's", 'does', 'there', 'having', 'if', 'has', 'own', 'most', "should've", 'isn', 'i', 'which', 'who', 'for', 'be', 'just', 'my', 'did', "you've", 'myself', 'they', 'yourself', 'into', 'himself', "hasn't", 'been', 'both', 'other', 'now', 'it', 'being', 'her', "mustn't", 'same', 'ma', 'theirs', 'from', 'what', 'some', 'whom', 'again', "needn't", 'only', 'shouldn', 'of', 'not', "wasn't", 'hasn', 'nor', 'by', 'yours', "weren't", 'won', "didn't", 'don', 'yourselves', 'any', 'or', 'are', 'through', "don't", 'in', 'under', 'no', 'we', 'further', "you're", 'each', 'will', 'than', "haven't", "hadn't", 'to', 'itself', 'shan', 'should', 'aren', 'herself', 'its', 'when', 'so', 'ours', 'that', 'all', 'ourselves', 'very', 'doesn', 'why', "mightn't", "aren't", "doesn't", 'was', 'am', 'needn', "she's", 'his', 'mightn', 'them', 's', 'doing', 'over', 'mustn', 'the', "shouldn't", 'him', 'before', 'hadn', 'about', 'off', 'few

In [38]:
for w in words[0:80]:
    if w not in stop_words:
        filtered_sent.append(w)
        

In [39]:
print(filtered_sent)

['speaker', 'phone', '[', '+2', ']', ',', 'radio', '[', '+2', ']', ',', 'infrared', '[', '+2', ']', '#', '#', 'favorite', 'features', ',', 'although', 'many', ',', 'speaker', 'phone', ',', 'radio', 'infrared', '.']


## Chunking

In [42]:
for i in sentences[0:20]:
    
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} """
    chunkParser = nltk.RegexpParser(chunkGram)
    
    chunked = chunkParser.parse(tagged)
    
    chunked.draw()

## Wordnet Synonyms

In [44]:
from nltk.corpus import wordnet

In [50]:
synsets = wordnet.synsets("dog")
print(synsets)

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]


In [52]:
print(synsets[0].lemmas())

[Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')]


In [54]:
print(synsets[0].definition())

a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds


In [55]:
print(synsets[0].examples())

['the dog barked all night']
