## Create an NLP Pipeline to 'clean' Review Data
* Load Input File and Read reviews
* Tokenize
* Remove Stopwords
* Perform Stemming
* Write cleaned data to output file


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Multinomial NB is designed for discrete features (e.g. word counts), which suits text reviews with discrete class labels

In [5]:
# Load the labelled reviews from the CSV in the working directory
df = pd.read_csv('Train.csv')

In [6]:
# Preview the first rows to confirm the columns that were loaded
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [7]:
# (rows, columns) of the loaded frame
df.shape

(40000, 2)

In [14]:
# Split the frame into inputs and targets by column position.
# A DataFrame cannot be indexed as df[:, 0] (pandas treats the tuple as a
# column key and raises), so positional access goes through .iloc and the
# columns are converted to 1-D NumPy arrays.
X = df.iloc[:, 0].to_numpy()  # review text, can be used for training
Y = df.iloc[:, 1].to_numpy()  # label for each review

In [16]:
# Sanity check: inputs and targets must have the same length
print(X.shape, Y.shape)

(40000,) (40000,)


In [18]:
# Show the first review and its label, one per line
print(X[0], Y[0], sep="\n")

mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it
pos


In [20]:
from nltk.tokenize import word_tokenize

In [27]:
# Tokenize the first review to see what NLTK's default word tokenizer produces
words = word_tokenize(X[0])
print(words)


['mature', 'intelligent', 'and', 'highly', 'charged', 'melodrama', 'unbelivebly', 'filmed', 'in', 'China', 'in', '1948.', 'wei', 'wei', "'s", 'stunning', 'performance', 'as', 'the', 'catylast', 'in', 'a', 'love', 'triangle', 'is', 'simply', 'stunning', 'if', 'you', 'have', 'the', 'oppurunity', 'to', 'see', 'this', 'magnificent', 'film', 'take', 'it']


In [31]:
from nltk.corpus import stopwords

In [36]:
#Stopwords removal

<WordListCorpusReader in 'C:\\Users\\SHUBHIKA\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>


## ________________________

### Working with a sample text using NLP Pipelining

In [46]:
# Sample review with HTML line breaks, punctuation and mixed case, used to try out the cleaning steps
sample_text = """I loved this movie <br /><br /> since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""


### NLTK

In [48]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [50]:
# Init objects shared by the cleaning function
# Raw string: '\w' inside a normal string literal is an invalid escape sequence
tokenizer = RegexpTokenizer(r'\w+')
# Stored as a set so the per-token membership check is fast
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [54]:
import re

# Matches an HTML line-break tag in any of its spellings: <br>, <br/>, <br />
_BR_TAG = re.compile(r"<br\s*/?>")


def getStemmedReview(review):
    """Return a cleaned, stemmed version of a single review string.

    Steps:
      1. Lowercase the text.
      2. Replace every HTML line-break tag with a space. Matching only the
         doubled "<br /><br />" form would let a single tag through, and the
         word tokenizer would then emit a stray "br" token.
      3. Tokenize with the notebook-level ``tokenizer``.
      4. Drop tokens found in ``en_stopwords``.
      5. Stem the remaining tokens with ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Lowercase first so the tag pattern and stopword lookup see lowercase text
    review = _BR_TAG.sub(" ", review.lower())

    tokens = tokenizer.tokenize(review)
    kept_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in kept_tokens]

    return ' '.join(stemmed_tokens)
    
    

In [55]:
# Run the cleaning function on the sample review and display the result
getStemmedReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

### Now, write the code in a Python file for file handling