In [78]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
from glob import glob
import numpy as np
from mlutils.nlp import *
from mlutils.models.classification import *
from sklearn.linear_model import LogisticRegression
from fastai.text.data import *

In [2]:
#!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!gunzip aclImdb_v1.tar.gz
#!tar -xvf aclImdb_v1.tar

## IMDB dataset and the sentiment classification task

The [large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) contains a collection of 50,000 reviews from IMDB. The dataset contains an equal number of positive and negative reviews. The authors considered only highly polarized reviews: a negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. Neutral reviews are not included in the dataset. The dataset is divided evenly into training and test sets of 25,000 labeled reviews each.

The **sentiment classification task** consists of predicting the polarity (positive or negative) of a given text.


## Tokenizing and term document matrix creation

In [3]:
PATH='aclImdb/'
names = ['neg','pos']

In [4]:
%ls {PATH}

imdbEr.txt  imdb.vocab  README  [0m[01;34mtest[0m/  [01;34mtrain[0m/


In [5]:
%ls {PATH}train

labeledBow.feat  [0m[01;34mpos[0m/    unsupBow.feat  urls_pos.txt
[01;34mneg[0m/             [01;34munsup[0m/  urls_neg.txt   urls_unsup.txt


In [6]:
%ls {PATH}train/pos | head

0_9.txt
10000_8.txt
10001_10.txt
10002_7.txt
10003_8.txt
10004_8.txt
10005_7.txt
10006_7.txt
10007_7.txt
10008_7.txt
ls: write error: Broken pipe


In [7]:
def texts_labels_from_folders(path, folders):
    """Read every text file under ``path/<folder>`` and label it by folder index.

    Args:
        path: Directory that contains one sub-directory per class.
        folders: Sequence of sub-directory names. The position of a name in
            this sequence becomes the integer label of the files found in it.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list with the contents of
        each file, ``labels`` is an ``np.int64`` array of the same length.
        Files appear in whatever order ``glob`` yields them within a folder.
    """
    texts, labels = [], []
    for idx, label in enumerate(folders):
        # '*.*' only matches file names that contain a dot
        for fname in glob(os.path.join(path, label, '*.*')):
            # The context manager closes each handle right away; reading
            # thousands of files through bare open() leaves them open until
            # garbage collection. The explicit encoding keeps the result
            # independent of the platform locale.
            # NOTE(review): assumes the files are UTF-8 encoded - confirm.
            with open(fname, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int64)

trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

In [8]:
trn[6]

'Adrian has just gone out of the asylum, being rich and with no parents, his life seems empty. One day, he meets Gonzalo, a poor boy whom mother is prostitute. Desperate for earning some money, Gonzalo helps Adrian to search about his life and who where his parents. This is a movie from a new director, and it is perfectly clear in most of the film: scenes not correctly directed, dialogues a little forced, some incoherences in the script...Anyway, the ending is unexpectedly well done (well, just a little) and that saves a little the film. Actors are known and with great quality, nevertheless, they are not inspired enough to make the movie interesting; all of them have done better papers in other film. The film results boring and probably you will spend most of the time thinking how much time will pass until it ends. Of course there are lots of worse films, but, sure, there are many many better ones.'

In [9]:
names[trn_y[6]]

'neg'

In [11]:
veczr = CountVectorizer(tokenizer=tokenize)

In [12]:
# fit our vectorizer and transform to create a document matrix
trn_term_doc = veczr.fit_transform(trn)
# apply the bag of words to the validation data
val_term_doc = veczr.transform(val)

In [13]:
# 25k documents with 75132 word vocabulary
trn_term_doc

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3749745 stored elements in Compressed Sparse Row format>

In [14]:
# this particular review has 113 distinct vocabulary terms (stored elements)
trn_term_doc[6]

<1x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 113 stored elements in Compressed Sparse Row format>

In [15]:
# which words are in the bag of words model?
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and convert to a plain list
# so that `vocab` has the same type on old and new versions.
vocab = (veczr.get_feature_names_out().tolist()
         if hasattr(veczr, 'get_feature_names_out')
         else veczr.get_feature_names())
vocab[5000:5005]

['aussie', 'aussies', 'austen', 'austeniana', 'austens']

In [16]:
# naive baseline: lowercase the raw review and split on single spaces,
# without any tokenizer, to get its set of distinct "words"
w0 = {word.lower() for word in trn[6].split(' ')}
w0

{'(well,',
 'a',
 'about',
 'actors',
 'adrian',
 'all',
 'and',
 'are',
 'asylum,',
 'being',
 'better',
 'boring',
 'boy',
 'but,',
 'clear',
 'correctly',
 'course',
 'day,',
 'desperate',
 'dialogues',
 'directed,',
 'director,',
 'done',
 'earning',
 'empty.',
 'ending',
 'ends.',
 'enough',
 'film',
 'film.',
 'film:',
 'films,',
 'for',
 'forced,',
 'from',
 'gone',
 'gonzalo',
 'gonzalo,',
 'great',
 'has',
 'have',
 'he',
 'helps',
 'his',
 'how',
 'in',
 'incoherences',
 'inspired',
 'interesting;',
 'is',
 'it',
 'just',
 'known',
 'life',
 'little',
 'little)',
 'lots',
 'make',
 'many',
 'meets',
 'money,',
 'most',
 'mother',
 'movie',
 'much',
 'nevertheless,',
 'new',
 'no',
 'not',
 'of',
 'one',
 'ones.',
 'other',
 'out',
 'papers',
 'parents,',
 'parents.',
 'pass',
 'perfectly',
 'poor',
 'probably',
 'prostitute.',
 'quality,',
 'results',
 'rich',
 'saves',
 'scenes',
 'script...anyway,',
 'search',
 'seems',
 'some',
 'spend',
 'sure,',
 'that',
 'the',
 'them',

In [17]:
# almost the same, except we didn't use a tokenizer here
len(w0)

112

In [18]:
veczr.vocabulary_['boring']

8484

In [19]:
# check whether the word "boring" appears in this review
trn_term_doc[6, 8484]

1

In [20]:
# let's look for irrelevant words
veczr.vocabulary_['the']

66458

In [21]:
# "the" occurs quite a few times in this text
trn_term_doc[6, 66458]

8

So we have just created a bag-of-words model, based on the vocabulary found across all the training reviews.

## Logistic regression with unigrams

In [None]:
# setup training and validation sets
x=trn_term_doc
y=trn_y
x_val = val_term_doc
y_val = val_y

In [56]:
# train classifier; C is the inverse regularization strength, so C=1e8 makes
# the penalty negligible (effectively unregularized)
# solver='liblinear' is stated explicitly because dual=True is only
# implemented for liblinear, which stopped being the default solver in
# scikit-learn 0.22 (the newer default rejects dual=True with a ValueError)
m = LogisticRegression(C=1e8, dual=True, solver='liblinear')
train_classification(m, x, y);
preds = predict_and_evaluate_classification(m, x_val, y_val)

Accuracy (Training): 0.99632
Accuracy score: 0.85752


...and the regularized version

In [58]:
# train classifier with the default regularization strength (C=1.0)
# solver='liblinear' is stated explicitly because dual=True is only
# implemented for liblinear, which stopped being the default solver in
# scikit-learn 0.22 (the newer default rejects dual=True with a ValueError)
m = LogisticRegression(C=1.0, dual=True, solver='liblinear')
train_classification(m, x, y);
preds = predict_and_evaluate_classification(m, x_val, y_val)

Accuracy (Training): 0.99644
Accuracy score: 0.87184


In [59]:
# binarized: .sign() turns every positive count into 1, so the features only
# record whether a term is present in the review
# solver='liblinear' is stated explicitly because dual=True is only
# implemented for liblinear, which stopped being the default solver in
# scikit-learn 0.22 (the newer default rejects dual=True with a ValueError)
m = LogisticRegression(C=1.0, dual=True, solver='liblinear')
train_classification(m, x.sign(), y);
preds = predict_and_evaluate_classification(m, x_val.sign(), y_val)

Accuracy (Training): 0.99784
Accuracy score: 0.87384


## Trigram with NB features

In [60]:
veczr =  CountVectorizer(ngram_range=(1,3), tokenizer=tokenize, max_features=800000)
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [63]:
trn_term_doc.shape

(25000, 800000)

In [64]:
vocab = veczr.get_feature_names()
vocab[200000:200005]

['by vast', 'by vengeance', 'by vengeance .', 'by vera', 'by vera miles']

In [67]:
# setup training and validation sets
x=trn_term_doc.sign()
y=trn_y
x_val = val_term_doc.sign()
y_val = val_y

### Logistic regression with trigrams

In [68]:
# train classifier; C is the inverse regularization strength, so C=1e8 makes
# the penalty negligible (effectively unregularized)
# solver='liblinear' is stated explicitly because dual=True is only
# implemented for liblinear, which stopped being the default solver in
# scikit-learn 0.22 (the newer default rejects dual=True with a ValueError)
m = LogisticRegression(C=1e8, dual=True, solver='liblinear')
train_classification(m, x, y);
preds = predict_and_evaluate_classification(m, x_val, y_val)

Accuracy (Training): 1.0
Accuracy score: 0.90140


In [69]:
# build the Naive-Bayes log-count ratios from the training features
def pr(y_i):
    """Return the add-one-smoothed per-feature rate for class ``y_i``.

    Reads the notebook-level ``x`` (feature matrix) and ``y`` (labels): the
    column sums over the rows of ``x`` whose label equals ``y_i``, plus one,
    divided by the number of such rows plus one.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    n_rows = in_class.sum()
    return (feature_totals + 1) / (n_rows + 1)

# r: per-feature log ratio of the class-1 rate to the class-0 rate
r = np.log(pr(1) / pr(0))
# b: log ratio of the class frequencies; not used by the cells visible here
b = np.log((y == 1).mean() / (y == 0).mean())

### Logistic regression with trigram log-count ratios

In [73]:
# weight every feature of the training and validation matrices by its
# log-count ratio r
x_nb = x.multiply(r)
val_x_nb = x_val.multiply(r)

# solver='liblinear' is stated explicitly because dual=True is only
# implemented for liblinear, which stopped being the default solver in
# scikit-learn 0.22 (the newer default rejects dual=True with a ValueError)
m = LogisticRegression(dual=True, C=0.1, solver='liblinear')
train_classification(m, x_nb, y);
preds = predict_and_evaluate_classification(m, val_x_nb, y_val)

Accuracy (Training): 0.99928
Accuracy score: 0.91768
