In [None]:
# Install a pip package in the current Jupyter kernel
# This is better than !pip ... see https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install spacy pandas

In [23]:
import pandas as pd
import re
import spacy
from spacy.lang.en import English

In [24]:
# Read data from file 'filename.csv' 
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later) 
data = pd.read_csv("IMDB Dataset.csv") 

# Keep the first 10 elements to reduce the load on cpu
data=data[:10]
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Remove HTML elements

In [28]:
REMOVE_HTML = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def remove_html(review):
    return REMOVE_HTML.sub(" ", review) 

data['cleaned_review'] = data['review'].map(remove_html)
data.head()

Unnamed: 0,review,sentiment,cleaned_review,tokenized_review
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming te...,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, "", Love, in, the, Time, o..."


### Tokenization

In [26]:
#  "nlp" Object is used to create documents with linguistic annotations.
nlp = English()

def tokenize_review(this_review):
    my_doc = nlp(this_review)
    
    # Create list of word tokens
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    return token_list

In [29]:
data['tokenized_review'] = data['review'].map(tokenize_review)
data.head()

Unnamed: 0,review,sentiment,cleaned_review,tokenized_review
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming te...,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, "", Love, in, the, Time, o..."


In [31]:
#print stopword list from spacy
spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

remove_from_stopwordlist=["n't", "most", "much", "never", "no", "not", "nothing", "n‘t", "n’t", "really", "top", "very", "well"]
for word in spacy_stopwords:
    if word in remove_from_stopwordlist:
         spacy_stopwords.remove(word)


In [32]:
my_doc = nlp("dfg dggd")
    
# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
token_list

['dfg', 'dggd']