In [1]:
import numpy as np
import pandas as pd

In [2]:
# The three TSV files are read with identical parser settings: first row is the
# header, fields are tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps quote
# characters inside the review text as literal characters.
labeledData, unlabeledData, testData = (
    pd.read_csv(tsv_path, header=0, delimiter="\t", quoting=3)
    for tsv_path in ("labeledTrainData.tsv", "unlabeledTrainData.tsv", "testData.tsv")
)

In [3]:
# (rows, columns) of each loaded frame, in load order
tuple(frame.shape for frame in (labeledData, unlabeledData, testData))

((25000, 3), (50000, 2), (25000, 2))

In [4]:
# transform raw review
from bs4 import BeautifulSoup as bs
import re
from nltk.corpus import stopwords

In [5]:
# Unlike the bag-of-words approach, Word2Vec consumes lists of words, so the
# cleaner below returns a list of tokens instead of a single joined string.
_ENGLISH_STOPWORDS = None  # lazily filled by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def transformSentence(rawReview, remove_stopwords = False):
    """Clean a piece of review text and split it into lower-case word tokens.

    Parameters
    ----------
    rawReview : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, default False
        When True, drop every token found in NLTK's English stop-word list.

    Returns
    -------
    list of str
        The tokens in their original order; empty when no letters or digits
        remain after cleaning.
    """
    # strip HTML markup, keeping only the text content
    noHTML = bs(rawReview, "lxml").get_text()

    # replace everything that is not an ASCII letter or digit (punctuation,
    # quotes, symbols) with a space so it acts as a word separator
    letters_only = re.sub("[^a-zA-Z0-9]", " ", noHTML)

    # convert to lower case and split on whitespace
    words = letters_only.lower().split()

    # optionally remove stop words; the set is cached because this function is
    # called once per sentence and reloading the corpus each time is wasteful
    if remove_stopwords:
        sw = _english_stopwords()
        words = [w for w in words if w not in sw]

    # return list of words in the review
    return words

In [13]:
# Sanity check: tokenize the first unlabeled review as one flat word list.
clean_review = transformSentence(unlabeledData["review"][0], remove_stopwords=False)

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends',
 'maybe',
 'they',
 'were',
 'sitting',
 'around',
 'one',
 'day',
 'in',
 'film',
 'school',
 'and',
 'said',
 'hey',
 'let',
 's',
 'pool',
 'our',
 'money',
 'together',
 'and',
 'make',
 'a',
 'really',
 'bad',
 'movie',
 'or',
 'something',
 'like',
 'that',
 'what',
 'ever',
 'they',
 'said',
 'they',
 'still',
 'ended',
 'up',
 'making',
 'a',
 'really',
 'bad',
 'movie',
 'dull',
 'story',
 'bad',
 'script',
 'lame',
 'acting',
 'poor',
 'cinematography',
 'bottom',
 'of',
 'the',
 'barrel',
 'stock',
 'music',
 'etc',
 'all',
 'corners',
 'were',
 'cut',
 'except',
 'the',
 'one',
 'that',
 'would',
 'have',
 'prevented',
 'this',
 'film',
 's',
 'release',
 'life',
 's',
 'like',
 'that']

In [21]:
# raw text of the first unlabeled review, for comparison with the cleaned tokens
unlabeledData["review"][0]

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

In [8]:
import nltk.data

# Load the English Punkt sentence tokenizer from the local NLTK data directory;
# it is used below to split a review paragraph into sentences.
# NOTE(review): this resource is a pickle, so only load it from a trusted NLTK
# data path. Recent NLTK releases reportedly ship Punkt as `punkt_tab` and may
# no longer load the pickled model — confirm against the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [20]:
# split the first unlabeled review into raw (uncleaned) sentences
raw_sentence = tokenizer.tokenize(unlabeledData["review"][0].strip())
raw_sentence

['"Watching Time Chasers, it obvious that it was made by a bunch of friends.',
 'Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that.',
 'What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc.',
 "All corners were cut, except the one that would have prevented this film's release.",
 'Life\'s like that."']

In [16]:
def transformReview(rawReview, tokenizer, remove_stopwords = False):
    """Split a review into sentences and each sentence into cleaned words.

    Parameters
    ----------
    rawReview : str
        Full text of one review.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)`` that returns an iterable
        of sentence strings.
    remove_stopwords : bool, default False
        Forwarded to ``transformSentence`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per sentence, in order. Sentences that contain no words
        after cleaning (e.g. "..." or "!!") are omitted.
    """
    # convert paragraph into sentences
    raw_sentence = tokenizer.tokenize(rawReview.strip())

    sentences = []

    # for each sentence, convert it into list of words
    for sentence in raw_sentence:
        # skip empty strings up front so the cleaner is never called on them
        if not sentence:
            continue
        words = transformSentence(sentence, remove_stopwords)
        # a sentence made only of punctuation cleans down to nothing; an empty
        # word list carries no information, so leave it out
        if words:
            sentences.append(words)

    # return list of sentences each broken into words i.e. a list of lists
    return sentences

In [38]:
# Tokenize the first unlabeled review, then report how many sentences it has
# and how many words are in its first sentence before showing the full result.
s = transformReview(unlabeledData["review"][0], tokenizer)
print(len(s), len(s[0]), sep="\n")
s

5
14


[['watching',
  'time',
  'chasers',
  'it',
  'obvious',
  'that',
  'it',
  'was',
  'made',
  'by',
  'a',
  'bunch',
  'of',
  'friends'],
 ['maybe',
  'they',
  'were',
  'sitting',
  'around',
  'one',
  'day',
  'in',
  'film',
  'school',
  'and',
  'said',
  'hey',
  'let',
  's',
  'pool',
  'our',
  'money',
  'together',
  'and',
  'make',
  'a',
  'really',
  'bad',
  'movie',
  'or',
  'something',
  'like',
  'that'],
 ['what',
  'ever',
  'they',
  'said',
  'they',
  'still',
  'ended',
  'up',
  'making',
  'a',
  'really',
  'bad',
  'movie',
  'dull',
  'story',
  'bad',
  'script',
  'lame',
  'acting',
  'poor',
  'cinematography',
  'bottom',
  'of',
  'the',
  'barrel',
  'stock',
  'music',
  'etc'],
 ['all',
  'corners',
  'were',
  'cut',
  'except',
  'the',
  'one',
  'that',
  'would',
  'have',
  'prevented',
  'this',
  'film',
  's',
  'release'],
 ['life', 's', 'like', 'that']]

In [30]:
# Build the training corpus: one flat list holding every sentence from every
# review, each sentence being a list of words. Unlabeled reviews come first,
# followed by the labeled ones.
sentences = []

for frame in (unlabeledData, labeledData):
    for review in frame["review"]:
        sentences.extend(transformReview(review, tokenizer))

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [34]:
# number of words in the first sentence of the corpus
len(sentences[0])

14

In [37]:
# the first sentence of the corpus, as a list of words
sentences[0]

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends']