# Create an NLP Pipeline to 'Clean' Reviews Data
* Load Input file and read reviews
* Tokenize
* Remove Stopwords
* Perform Stemming
* Write Clean data to output file

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Shared NLP helpers used by the review-cleaning functions.
# Tokens are maximal runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(pattern=r'\w+')
# Kept as a set so stopword membership tests are constant-time.
en_stopwords = set(stopwords.words("english"))
# One stemmer instance reused for every token.
ps = PorterStemmer()

In [3]:
def getCleannedReview(review) :
    """Normalize one raw review into a space-joined string of stemmed tokens.

    Steps: lowercase the text, blank out HTML line-break tags, tokenize with
    the module-level ``tokenizer``, drop tokens found in ``en_stopwords`` and
    stem the remaining tokens with ``ps``.

    Args:
        review: Raw review text (a ``str``).

    Returns:
        The cleaned review as one string with tokens separated by a single
        space; an empty string when no token is left after filtering.
    """
    review = review.lower()
    # Remove every spelling of the line-break tag, not only the doubled
    # "<br /><br />" form; a lone tag would otherwise be tokenized and
    # leave a stray "br" token in the cleaned output.
    for tag in ("<br />", "<br/>", "<br>"):
        review = review.replace(tag, " ")

    # Tokenize, then filter stopwords and stem in a single pass.
    tokens = tokenizer.tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]

    return ' '.join(stemmed_tokens)

In [4]:
def getStemmedDocument(inputFile,outputFile) :
    """Clean every line of ``inputFile`` and write the results to ``outputFile``.

    Each input line is treated as one review and passed through
    ``getCleannedReview``; the cleaned reviews are written one per line, in
    input order. The type of the collected list and the list itself are also
    printed to standard output.

    Args:
        inputFile: Path of the UTF-8 text file to read, one review per line.
        outputFile: Path of the UTF-8 text file to create or overwrite.
    """
    # Read the input before opening the output, so a missing or unreadable
    # input file does not leave a freshly truncated output file behind.
    with open(inputFile, encoding='utf8') as f:
        clean_document = [getCleannedReview(review) for review in f]

    # The context manager closes the output file even if a write fails.
    with open(outputFile, 'w', encoding='utf8') as output:
        for cleaned_review in clean_document:
            print(cleaned_review, file=output)

    print(type(clean_document))
    print(clean_document)

In [5]:
# Paths passed to getStemmedDocument: raw reviews in, cleaned reviews out.
inputFile = 'imdb_temp.txt'
outputFile = 'imdb_temp_output.txt'

In [6]:
# Run the cleaning pipeline: read inputFile, write cleaned reviews to outputFile.
getStemmedDocument(inputFile,outputFile)

<class 'list'>
['realli sure make movi weird artsi kind movi watch compel plot charact like kind movi stop watch horrif fascin thing happen screen although first time wife watch make way disturb run bit long nonetheless worthwhil view interest dark movi', 'enjoy film like pulp fiction reservoir dog lock stock two smoke barrel go love two hand type black humor beat keep entertain whole film like pulp fiction wacki scenario charact get deal along gallipoli picnic hang rock one best australian film seen also star young heath ledger got real big state terribl underr movi believ good pulp fiction great see', 'okay deal american pilot fli along mind busi suddenli outnumb evil cowardli non american fighter plane middl eastern type suffic say like appl pie elvi presley proceed shoot american pilot noth wrong evil non american care know bang foreign jail sentenc death would normal happen us militari would carpet bomb coupl nearbi town pilot releas time evil peac lovin type probabl got involv ma