In [None]:
#Assembling the individual text documents into a single csv file

import pyprind
import pandas as pd
import os

basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000) #50000 iterations because that is the
#number of documents that we need to read in
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file),
                     'r', encoding = 'utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]],
                          ignore_index = True)
            pbar.update()

df.columns = ['review', 'sentiment']

In [3]:
#Shuffling the dataframe so the dataset can be split into training and test sets

import numpy as np

np.random.seed(0)

df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')

In [2]:
#Double checking everything went as planned
# read the csv and print an excerpt of the first three samples:

df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head(3)



Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [22]:
# Bag-of-words model - transforming words into feature vectors

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)


In [23]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [24]:
#Feature vector that was created - 'raw term frequencies' - tf(t,d)
print(bag.toarray())

#This is a unigram model because each "token" is one only word
# 2-gram model would be: "the sun", "is shining"

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [25]:
#Getting the term frequency-inverse document frequency (tf-idf)
# this downweights those terms that appear across multiple documents from both classes
# these terms do not have much discriminatory or useful information

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                        norm = 'l2',
                        smooth_idf=True)

np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs))
     .toarray())

#TfidTransformer normalizes the data using the l2 regularization parameter
# this is slightly different than the standard tf-idf formula
# the 0.45 tf-idf for 'is' is a relatively small number

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [27]:
#Moving forward with the movie data - it is necessary to strip the unwanted characters
#As an example - displaying the last 50 characters from the first
# document shows HTML markup as well as punctuation and non-letter characters

df.loc[0, 'review'][-50:]

#We will not strip all punctuation marks except for emoticons

'is seven.<br /><br />Title (Brazil): Not Available'

In [28]:
#Removing all the non-informative characters
#Generally shouldn't use regex to remove html though (like in the first line)
#The last line removes the nose from emoticons
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
           ' '.join(emoticons).replace('-', ''))
    return text

In [29]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [30]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [31]:
#Applying the preprocessor to the reviews in the dataframe
#'cleaning' the dataframe
df['review'] = df['review'].apply(preprocessor)

In [32]:
#Tokenizing the documents by splitting at the whitespace

def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [33]:
#Word stemming - useful technique for tokenizing - transforms a word to its root form
# maps related words to the same stem - Porter stemmer algorithm is first and default in NLTK

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

#The nonsense 'thu' stem does not seem to matter - you can use lemmatization to 
# get the grammatically correct word (lemmas) but is computationally expensive
# and does not seem to improve performance

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [34]:
#Removing stop-words - stop words are those that are extremely common
# and serve little use for classification - examples are "is", "and", "has", "like"
# especially useful for working with non-tf-idfs (already downweighted in tf-idfs)

#Downloading stop words - set of 127 English stop words
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/santiagocassalett/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
#loading and applying the english stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')

[w for w in tokenizer_porter('runners like running and runs a lot')[-10:]
    if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [37]:
#Training a logistic regression model for document classification
#First need to divide the dataframe into 25,000 training docs and 25,000 test docs

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values



In [40]:
#Next will use GridSearchCV for the optimal set of parameters using
# 5-fold stratifed cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor = None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer,
                                  tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer,
                                  tokenizer_porter],
              'vect__use_idf': [False],
              'vect__norm': [None],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]}
             ]

lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', 
                    LogisticRegression(random_state=0, solver = 'liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, 
                          scoring='accuracy',
                          cv = 5, verbose=1,
                          n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 125.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 154.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [41]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

#Best parameter is without the porter stemming for tokenization, no stop-word library,
# l2 penalty, C set at 10

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x129928c20>} 


In [44]:
print('CV accuracy: %.3f'
         % gs_lr_tfidf.best_score_)

CV accuracy: 0.897


In [45]:
clf = gs_lr_tfidf.best_estimator_
print('Test accuracy: %.3f'
     % clf.score(X_test, y_test))

Test accuracy: 0.899


In [3]:
#The above took a very long time - you can speed it up by breaking it into mini-batches
# via stochastic gradient descent

#Below is a tokenizer for cleaning the unprocessed movie list
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
           ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [4]:
#Generator function that reads in and returns one doc at a time
def stream_docs(path):
    with open(path, 'r', encoding = 'utf-8') as csv:
        next(csv) #skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [5]:
next(stream_docs(path = 'movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [6]:
#Now we make get_minibatch that will take the document stream from the
# stream_docs and return a particular number of docs

def get_minibatch(doc_streams, size):
    docs,  y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [7]:
#HashingVectorizer is what we have to use instead of CountVectorizer or TfidfVectorizer
# because the other two requires holding the complete vocabulary in the memory
# which is very expensive

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                        n_features=2 ** 21,
                        preprocessor=None,
                        tokenizer=tokenizer)

clf = SGDClassifier(loss = 'log', random_state=1)
doc_stream = stream_docs(path = 'movie_data.csv')

In [8]:
#Now we can start the out-of-core learning
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:26


In [9]:
#We now use the last 5000 documents to test our performance of the model

X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


In [10]:
#Can now use the last 5000 documents to update our model

clf = clf.partial_fit(X_test, y_test)

#Will need to use the above for chapter 9

In [56]:
#Decomposing text documents with Latent Dirichlet Allocation (LDA)
# this is different than Linear Discriminant Analysis and is an unsupervised
# clustering technique 

import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')



In [57]:
#LDA uses bag-of-words matrix as an input and we will use this to try
# and categorize the movie reviews into 10 different topics

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                       max_df=.1,
                       max_features=5000)

X = count.fit_transform(df['review'].values)

#max_df = .1 sets the maximum document frequency to 10% - excludes words that appear
# too frequently
#max_features = 5000 limits it to the 5000 most frequent words to limit dimensionality

In [60]:
#Fitting LDA to the bag-of-words and infering 10 topics
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components = 10,
                               random_state=123,
                               learning_method='batch')
X_topics = lda.fit_transform(X)

#setting 'batch' allows the estimator to work with all of the availability 
# training data (the bag of words matrix) in one interation
# you can use minibatch by changing method to 'online' - it doesnt work as well 
# but is faster

In [61]:
#The components_ attribute stores the matrix containing the word importance (here, 5000)
# for each topic in increasing order

lda.components_.shape

(10, 5000)

In [62]:
#Printing the 5 most important words for the 10 topics
# must sort in reverse order because it is increasing order

n_top_words = 5
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print('Topic %d:' % (topic_idx+1))
    print(" ".join([feature_names[i]
                   for i in topic.argsort()[:-n_top_words -1 : -1]]))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex girl woman
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes tv
Topic 9:
book version original read novel
Topic 10:
action fight guy guys cool


In [63]:
#To confirm the categories lets plot three movies from the horror category (topic 6, index 5)

horror = X_topics[:, 5].argsort()[::-1]

for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


Horror movie #1:
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

Horror movie #2:
Okay, what the hell kind of TRASH have I been watching now? "The Witches' Mountain" has got to be one of the most incoherent and insane Spanish exploitation flicks ever and yet, at the same time, it's also strangely compelling. There's absolutely nothing that makes sense here and I even doubt there  ...

Horror movie #3:
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a total freakfest from start to finish. A fun freakfest at that, but at times it was a tad too reliant on kitsch rather than the horror. The story is difficult to summarize succinctly: a carefree, normal teenage girl starts coming fac ...
