# Opinion Mining Using POS Tagging and Grammar Association

## Imports

In [1]:
import nltk
from xml.dom import minidom

## Parse an XML File and extract data

In [2]:
# Load the review corpus and collect the text of every <review> element.
doc = minidom.parse('review.review')

review_data = []

reviews = doc.getElementsByTagName("review")
for review in reviews:
    text_elements = review.getElementsByTagName("review_text")
    if not text_elements:
        # A <review> without a <review_text> child has nothing to mine.
        continue
    # Join every text/CDATA child instead of reading only firstChild: an empty
    # element has no firstChild (None), and text mixed with CDATA sections
    # arrives as several sibling nodes.
    raw_text = "".join(
        node.data
        for node in text_elements[0].childNodes
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
    )
    # Line breaks are dropped, as before, so each review is a single line.
    review_data.append(raw_text.replace("\n", ""))


## Exploratory Analysis

In [3]:
# Hand-written sample texts for trying the pipeline on one short input at a
# time. They are kept as live data rather than commented-out assignments so
# switching samples is a one-character index change.
SAMPLE_REVIEWS = [
    "i am traveling internationally and wanted to make sure that an international plan is active on my device ending is 9106. All of our chat reps are currently helping other customers you'll be connected as soon as possible. I'll check further and see what if we can still do something about it as well. But are you taking off those charges from when i called to cancel this please. I have been charged these other months too",
    "Is that only available now or could we do that for the future. The unlimited. How much is the 8gb. That is correct the reason being it is so close to the cost of the 24gb and 16gb that it will make more sense put you on unlimited. I'll check 8gb. For the 8gb plan your bill will be $126.19. I can help you with a free feature to help you avoid overage. Safety mode is already activated on the account and this will help you stay within 8gb. Wait is there a 16gb.",
    "I checked on your previous bills and I can see that you were getting a $25 off on your smartphone line access. That's the reason why the line access is just at $15. The promo ended and that's the reason why it was not applied to the bill",
    "You can reach a Customer Care Specialist at 1-800-392-0717 to talk to someone about your billing questions.",
]

# NOTE: this deliberately replaces whatever `review_data` held before this
# cell with a single sample, so every later cell works on that one sentence.
review_data = [SAMPLE_REVIEWS[3]]

In [4]:
# Peek at one review. `review_data` may hold far fewer than 46 entries (it is
# a single sample sentence at this point), so clamp the index to the last
# available item instead of raising IndexError and halting "Run All".
review_data[min(45, len(review_data) - 1)]

## Download Dependencies

In [5]:
# Fetch only the resources this notebook relies on, without the interactive
# downloader prompt, so the cell completes under "Restart & Run All".
# NOTE(review): recent NLTK releases look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead; add those ids if a lookup fails.
NLTK_RESOURCES = (
    "punkt",                       # tokenizer models used by nltk.word_tokenize
    "averaged_perceptron_tagger",  # model behind nltk.pos_tag
    "tagsets",                     # backs nltk.help.upenn_tagset()
    "stopwords",                   # word lists read via nltk.corpus.stopwords
)
# A list (not a generator) inside all() so every download is attempted even if
# an earlier one fails; the cell displays True only when all of them succeeded.
all([nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES])

True

## Word Tokenize

In [23]:
# Split a sample sentence into tokens, then label each token with its
# part-of-speech tag; the tagged pairs are the cell's displayed value.
sample_sentence = "And now for something completely different"
nltk.pos_tag(nltk.word_tokenize(sample_sentence))

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

<table border="2" width="100%" cellspacing="2">
	<tbody>
		<tr>
			<th>POS Tag</th>
			<th>Description</th>
			<th>Example</th>
		</tr>
		<tr>
			<td>CC</td>
			<td>coordinating conjunction</td>
			<td>and</td>
		</tr>
		<tr>
			<td>CD</td>
			<td>cardinal number</td>
			<td>1, third</td>
		</tr>
		<tr>
			<td>DT</td>
			<td>determiner</td>
			<td>the</td>
		</tr>
		<tr>
			<td>EX</td>
			<td>existential there</td>
			<td>
				<em>there</em> is</td>
		</tr>
		<tr>
			<td>FW</td>
			<td>foreign word</td>
			<td>d&#8217;oeuvre</td>
		</tr>
        <tr>
			<td>IN</td>
			<td>preposition/subordinating conjunction</td>
			<td>in, of, like</td>
		</tr>
		<tr>
			<td>JJ</td>
			<td>adjective</td>
			<td>big</td>
		</tr>
		<tr>
			<td>JJR</td>
			<td>adjective, comparative</td>
			<td>bigger</td>
		</tr>
		<tr>
			<td>JJS</td>
			<td>adjective, superlative</td>
			<td>biggest</td>
		</tr>
		<tr>
			<td>LS</td>
			<td>list marker</td>
			<td>1)</td>
		</tr>
		<tr>
			<td>MD</td>
			<td>modal</td>
			<td>could, will</td>
		</tr>
        <tr>
			<td>NN</td>
			<td>noun, singular or mass</td>
			<td>door</td>
		</tr>
		<tr>
			<td>NNS</td>
			<td>noun plural</td>
			<td>doors</td>
		</tr>
		<tr>
			<td>NNP</td>
			<td>proper noun, singular</td>
			<td>John</td>
		</tr>
		<tr>
			<td>NNPS</td>
			<td>proper noun, plural</td>
			<td>Vikings</td>
		</tr>
		<tr>
			<td>PDT</td>
			<td>predeterminer</td>
			<td>
				<em>both</em> the boys</td>
		</tr>
		<tr>
			<td>POS</td>
			<td>possessive ending</td>
			<td>friend<em>&#8217;s</em>
			</td>
		</tr>
		<tr>
			<td>PRP</td>
			<td>personal pronoun</td>
			<td>I, he, it</td>
		</tr>
		<tr>
			<td>PRP&#36;</td>
			<td>possessive pronoun</td>
			<td>my, his</td>
		</tr>
		<tr>
			<td>RB</td>
			<td>adverb</td>
			<td>however, usually, naturally, here, good</td>
		</tr>
		<tr>
			<td>RBR</td>
			<td>adverb, comparative</td>
			<td>better</td>
		</tr>
		<tr>
			<td>RBS</td>
			<td>adverb, superlative</td>
			<td>best</td>
		</tr>
        <tr>
			<td>RP</td>
			<td>particle</td>
			<td>give <em> up </em>
			</td>
		</tr>
		<tr>
			<td>TO</td>
			<td>to</td>
			<td>
				<em>to</em> go, <em>to</em> him</td>
		</tr>
		<tr>
			<td>UH</td>
			<td>interjection</td>
			<td>uhhuhhuhh</td>
		</tr>
		<tr>
			<td>VB</td>
			<td>verb, base form</td>
			<td>take</td>
		</tr>
		<tr>
			<td>VBD</td>
			<td>verb, past tense</td>
			<td>took</td>
		</tr>
		<tr>
			<td>VBG</td>
			<td>verb, gerund/present participle</td>
			<td>taking</td>
		</tr>
		<tr>
			<td>VBN</td>
			<td>verb, past participle</td>
			<td>taken</td>
		</tr>
		<tr>
			<td>VBP</td>
			<td>verb, non-3rd person singular present</td>
			<td>take</td>
		</tr>
		<tr>
			<td>VBZ</td>
			<td>verb, 3rd person sing. present</td>
			<td>takes</td>
		</tr>
		<tr>
			<td>WDT</td>
			<td>wh-determiner</td>
			<td>which</td>
		</tr>
		<tr>
			<td>WP</td>
			<td>wh-pronoun</td>
			<td>who, what</td>
		</tr>
		<tr>
			<td>WP&#36;</td>
			<td>possessive wh-pronoun</td>
			<td>whose</td>
		</tr>
		<tr>
			<td>WRB</td>
			<td>wh-adverb</td>
			<td>where, when</td>
		</tr>
	</tbody>
</table>

## POS Tagging of Words

In [6]:
# Tokenise and POS-tag the first ten reviews (fewer if the corpus is smaller);
# each entry of `tagged_reviews` is the list of (token, tag) pairs for one review.
tagged_reviews = [
    nltk.pos_tag(nltk.word_tokenize(review_text))
    for review_text in review_data[:10]
]
tagged_reviews[0]

[('You', 'PRP'),
 ('can', 'MD'),
 ('reach', 'VB'),
 ('a', 'DT'),
 ('Customer', 'NNP'),
 ('Care', 'NNP'),
 ('Specialist', 'NNP'),
 ('at', 'IN'),
 ('1-800-392-0717', 'JJ'),
 ('to', 'TO'),
 ('talk', 'VB'),
 ('to', 'TO'),
 ('someone', 'NN'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('billing', 'NN'),
 ('questions', 'NNS'),
 ('.', '.')]

## Opinion Mining/Association

In [7]:
# Chunk grammar: a single "NP" rule written as a sequence of POS-tag groups.
# nltk.help.upenn_tagset() prints the meaning of every tag used here.
#
# Raw string pieces are used so that `\$` reaches NLTK unchanged; in a normal
# string literal `\$` is an invalid escape sequence and triggers a warning.
# The pieces are joined by implicit concatenation into one single-line rule.
grammar = (
    r"NP: {"
    # NOTE(review): `PP` is kept from the original rule; it is not a tag that
    # appears in this notebook's tagger output - possibly PRP$ was intended.
    r"<DT|PP|CD>?"
    r"<JJ|JJR|JJS>*"                  # any adjectives (was `JJ||JJR`, a stray empty alternative)
    r"<NN|NNS|PRP|NNP|IN|PRP\$>+"     # head: at least one noun/pronoun/preposition tag
    r"<VBD|VBZ|VBN|VBP|IN>*"          # optional linking verbs / prepositions
    r"<JJ|RB>*"                       # optional adjective/adverb describing the head
    r"<PRP|NN|NNS>*"                  # optional trailing pronoun/noun
    r"}"
)
cp = nltk.RegexpParser(grammar)
# Only the first tagged review is chunked here.
results = cp.parse(tagged_reviews[0])

## Plot the parse tree

In [87]:
# Optional visualisation of the chunk tree held in `results`; left disabled.
# NOTE(review): presumably this needs a local GUI display - confirm before enabling.
#results.draw()

In [88]:
# Disabled scratch check. The identifier had been garbled
# ("tagged_reviewsiews_reviews"); presumably `tagged_reviews` was intended - confirm.
#len(tagged_reviews[9])

## Explore results

In [8]:
# Collect the words of every NP chunk that spans more than two tokens.
final = []
for subtree in results:
    # Children of the parse result are either plain (word, tag) pairs or
    # chunk subtrees; only the chunks are of interest. isinstance() is used
    # rather than an exact type comparison so Tree subclasses are accepted too.
    if not isinstance(subtree, nltk.tree.Tree):
        continue
    # Each leaf of a chunk is a (word, tag) pair - keep just the word.
    assoc = [leaf[0] for leaf in subtree]
    if len(assoc) > 2:
        print(assoc)
        final.append(assoc)

['a', 'Customer', 'Care', 'Specialist', 'at', '1-800-392-0717']
['someone', 'about', 'your', 'billing', 'questions']


In [9]:
# Display the raw text of the first review so it can be compared with the
# word groups printed by the previous cell.
review_data[0]
#tagged_reviews[1]

'You can reach a Customer Care Specialist at 1-800-392-0717 to talk to someone about your billing questions.'

In [10]:
from nltk.corpus import stopwords 

In [11]:
# English stop words from the NLTK corpus, stored as a set because the
# filtering cell below only performs membership tests against it.
stop_words = set(stopwords.words('english'))

In [12]:
# Print each extracted word group with its stop words removed.
for chunk_words in final:
    # The NLTK stop-word list is lower-case, so compare on the lower-cased
    # token; a case-sensitive test would let capitalised stop words such as
    # "The" or "You" through. The original casing is kept in the output.
    filtered_sentence = [
        word for word in chunk_words if word.lower() not in stop_words
    ]
    print(filtered_sentence)

['Customer', 'Care', 'Specialist', '1-800-392-0717']
['someone', 'billing', 'questions']
