# NLP Pipeline to clean the reviews

In [21]:
# Example raw review (note the embedded `<br />` tags, digits and punctuation)
# used to try out the cleaning function on a single input.
sampleReview = "I loved this movie since I was 7  and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."

In [27]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm

In [23]:
# Shared, module-level NLP helpers; built once and reused for every review.

# Tokenizer that keeps only runs of word characters (\w+), so punctuation
# and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')
# English stop words stored as a set so membership tests are fast.
# NOTE: needs the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')).
sw = set(stopwords.words('english'))
# Porter stemmer used to reduce each remaining token to its stem.
ps = PorterStemmer()

In [24]:
def stemTheReview(review):
    """Normalise one raw review into a space-separated string of stems.

    Steps, in order:
      1. lowercase the text;
      2. replace each literal ``<br />`` tag with a single space;
      3. split into word tokens with the module-level ``tokenizer``;
      4. drop tokens found in the module-level stop-word set ``sw``;
      5. stem the surviving tokens with the module-level stemmer ``ps``.

    Args:
        review: The raw review text as a string.

    Returns:
        A single string with the stemmed tokens joined by one space
        (empty if no token survives the filtering).
    """
    # Lowercase first so the stop-word lookup below matches regardless of
    # the original casing.
    text = review.lower().replace('<br />', ' ')

    stems = []
    for token in tokenizer.tokenize(text):
        if token in sw:
            # Stop words are discarded before stemming.
            continue
        stems.append(ps.stem(token))

    return ' '.join(stems)

In [25]:
# Sanity check: clean the sample review and inspect the resulting stems.
stemTheReview(sampleReview)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

In [28]:
def getStemmedDocument(inputFile, outputFile, encoding='utf-8'):
    """Clean every review in ``inputFile`` and write the results to ``outputFile``.

    Each line of the input file is treated as one review, passed through
    ``stemTheReview``, and written as exactly one line of the output file,
    so line ``k`` of the output corresponds to line ``k`` of the input.

    Args:
        inputFile: Path of the text file to read, one review per line.
        outputFile: Path of the text file to write. An existing file at
            this path is overwritten.
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding;
            pass another codec name if the data is stored differently.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid in ``encoding``.
    """
    # Read all lines up front so tqdm can take the total from the list
    # length and show a percentage / ETA.
    with open(inputFile, 'r', encoding=encoding) as inFile:
        reviews = inFile.readlines()

    # The context manager guarantees the output file is flushed and closed
    # even if cleaning one of the reviews raises part-way through.
    with open(outputFile, 'w', encoding=encoding) as outFile:
        for review in tqdm(reviews):
            # print() appends the newline that separates the output records.
            print(stemTheReview(review), file=outFile)

In [29]:
# Clean the training-set reviews (per the file name) and write them to a
# separate "cleaned_" file alongside the raw one.
getStemmedDocument('./IMDB/imdb_trainX.txt', './IMDB/cleaned_imdb_trainX.txt')

100%|██████████| 25000/25000 [01:53<00:00, 220.20it/s]


In [30]:
# Apply the same cleaning to the test-set reviews (per the file name).
getStemmedDocument('./IMDB/imdb_testX.txt', './IMDB/cleaned_imdb_testX.txt')

100%|██████████| 25000/25000 [01:49<00:00, 228.52it/s]
