In [1]:
import pandas as pd
import nltk
import pickle


In [2]:
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to /home/thomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stemmer = PorterStemmer()
stopwords_english = stopwords.words("english")

In [4]:
train_clean = pd.read_csv("train_clean.csv")
test_clean = pd.read_csv("test_clean.csv")

In [5]:
train_clean.head()

Unnamed: 0,target,clean_text
0,1,last night i finished re watching jane eyre ...
1,1,maybe i m a sap but this is the sweetest movie...
2,0,in keeping with disney s well known practice o...
3,1,john waters has given us a genuinely enjoyable...
4,0,i m starting to write this review during a bre...


In [6]:
def review_to_words(text):
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords_english] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [7]:
sample_train = train_clean["clean_text"][0]

In [8]:
sample_train

'last night i finished re watching  jane eyre   1983   the bbc mini series adapted from charlotte bronte s gothic romance novel which is deservingly a classic of english literature with timothy dalton  my favorite james bond  as mr  edward rochester and zelah clarke  as jane eyre  a poor orphaned 18 year old girl  a governess at mr  rochester s estate  thornfield   jane eyre  has been one of my most beloved books since i was an 11 years old girl and the friend of mine gave it to me with the words   this book is amazing  and so it was and i have read it dozens of times and i am still not tired of it  its beautiful language  refined  fragrant  and surprisingly fresh  the dialogs  and above all  two main characters  and the story of their impossible love have attracted many filmmakers   jane eyre  has been adapted to tv and big screen many times  18 according to imdb  the actors as famous and marvelous as joan fontaine and orson welles  william hurt and charlotte gainsbourg  george c  sco

In [9]:
result_train = review_to_words(sample_train)
result_train

['last',
 'night',
 'finish',
 'watch',
 'jane',
 'eyr',
 '1983',
 'bbc',
 'mini',
 'seri',
 'adapt',
 'charlott',
 'bront',
 'gothic',
 'romanc',
 'novel',
 'deservingli',
 'classic',
 'english',
 'literatur',
 'timothi',
 'dalton',
 'favorit',
 'jame',
 'bond',
 'mr',
 'edward',
 'rochest',
 'zelah',
 'clark',
 'jane',
 'eyr',
 'poor',
 'orphan',
 '18',
 'year',
 'old',
 'girl',
 'gover',
 'mr',
 'rochest',
 'estat',
 'thornfield',
 'jane',
 'eyr',
 'one',
 'belov',
 'book',
 'sinc',
 '11',
 'year',
 'old',
 'girl',
 'friend',
 'mine',
 'gave',
 'word',
 'book',
 'amaz',
 'read',
 'dozen',
 'time',
 'still',
 'tire',
 'beauti',
 'languag',
 'refin',
 'fragrant',
 'surprisingli',
 'fresh',
 'dialog',
 'two',
 'main',
 'charact',
 'stori',
 'imposs',
 'love',
 'attract',
 'mani',
 'filmmak',
 'jane',
 'eyr',
 'adapt',
 'tv',
 'big',
 'screen',
 'mani',
 'time',
 '18',
 'accord',
 'imdb',
 'actor',
 'famou',
 'marvel',
 'joan',
 'fontain',
 'orson',
 'well',
 'william',
 'hurt',
 'charl

In [10]:
train_clean['final_text'] = train_clean["clean_text"].apply(review_to_words)
test_clean['final_text'] = test_clean["clean_text"].apply(review_to_words)

In [11]:
train_clean.head()

Unnamed: 0,target,clean_text,final_text
0,1,last night i finished re watching jane eyre ...,"[last, night, finish, watch, jane, eyr, 1983, ..."
1,1,maybe i m a sap but this is the sweetest movie...,"[mayb, sap, sweetest, movi, ever, saw, first, ..."
2,0,in keeping with disney s well known practice o...,"[keep, disney, well, known, practic, steal, me..."
3,1,john waters has given us a genuinely enjoyable...,"[john, water, given, us, genuin, enjoy, film, ..."
4,0,i m starting to write this review during a bre...,"[start, write, review, break, watch, movi, fir..."


In [12]:
del train_clean['clean_text']
del test_clean['clean_text']

In [13]:
train_clean.to_csv("train_words.csv",index=False)
test_clean.to_csv("test_words.csv",index=False)

In [14]:
cache_data = {
    "train_words":train_clean, 
    "test_words":test_clean
}

In [15]:
with open('data_words.pickle', "wb") as f:
                pickle.dump(cache_data, f)