### Load Stanford movie review Data

In [17]:
from pathlib import Path
import re
from sklearn.model_selection import train_test_split
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [20]:
import spacy
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer

In [5]:
def  getfilelist(root):
    """Return a fully-qualified list of filenames under root directory"""
    path = Path(root)
    textfiles = path.glob('**/*.txt')
    return [str(line) for line in textfiles]

In [6]:
def gettext(filename):
    """Return a string text from given txt file"""
    with open(filename) as f:
        text = f.read().replace("<br />", " ")
    return text

In [7]:
def gettexttodf(rootpath):
    """Extract text from all the txt files under given directory
       and return a dataframe with string reviews and labels"""
    filename_list = getfilelist(rootpath)
    # identify data from positive or negative dataset
    if "neg" in rootpath:
        label = np.zeros(len(filename_list), dtype=int)
    else:
        label = np.ones(len(filename_list), dtype=int)
    
    review = []
    for filename in filename_list:
        review.append(gettext(filename))
    
    return (review, label) 

In [376]:
neg_path = "aclImdb/small_data/neg/"
pos_path = "aclImdb/small_data/pos/"

In [377]:
neg_content, neg_y = gettexttodf(neg_path)
pos_content, pos_y = gettexttodf(pos_path)

In [378]:
len(neg_content)

400

 ### Tokenize

In [10]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if (len(w) > 3) and (w not in ENGLISH_STOP_WORDS)]  # ignore a, an, to, at, be, ...
    return tokens

In [11]:
def normalizewords(words):
    """
    Given a list of tokens/words, return a new list of normalize words
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    normal = []
    for word, tag in nltk.pos_tag(words):
        wtag = tag[0].lower()
        wtag = wtag if wtag in ['a', 'r', 'n', 'v'] else None
        lemma = lemmatizer.lemmatize(word, wtag) if wtag else word
        normal.append(lemma)
    return ' '.join(normal)

In [12]:
# example output
print('Before: ', neg_content[0],'\n','After: ', normalizewords(tokenize(neg_content[0])))

Before:  Technically abominable (with audible "pops" between scenes)and awesomely amateurish, "Flesh" requires a lot of patience to sit through and will probably turn off most viewers; but the dialogue rings amazingly true and Joe Dallesandro, who exposes his body in almost every scene, also gives an utterly convincing performance. A curio, to be sure, but the more polished "Trash", made two years later, is a definite step forward. I suggest you watch that instead. (*1/2) 
 After:  technically abominable audible pop scene awesomely amateurish flesh require patience probably turn viewer dialogue ring amazingly true dallesandro expose body scene give utterly convincing performance curio sure polish trash year later definite step forward suggest watch instead


In [379]:
# pre-processing
neg_content = [normalizewords(tokenize(line)) for line in neg_content]
pos_content = [normalizewords(tokenize(line)) for line in pos_content]

### Split dataset in train and validation

In [380]:
X = np.append(neg_content, pos_content)
y = np.append(neg_y, pos_y)

In [381]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

### Words to Index mapping

In [382]:
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab      

In [383]:
#Getting the vocabulary from the training set
word_count = get_vocab(X_train)

In [384]:
# word_count

In [385]:
len(word_count.keys())

10024

In [386]:
for word in list(word_count):
    if word_count[word] < 4:
        del word_count[word]

In [387]:
len(word_count.keys())

2609

In [388]:
## Finally we need an index for each word in the vocab
vocab2index = {"<PAD>":0, "UNK":1} # init with padding and unknown
words = ["<PAD>", "UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [389]:
len(vocab2index)

2611

In [390]:
# vocab2index

### Sentance Encoding

In [391]:
# each number is the length of the sentence
x_train_len = np.array([len(x.split()) for x in X_train]) 
x_val_len = np.array([len(x.split()) for x in X_val])

In [393]:
np.percentile(x_train_len, 90) # set max len to 400

200.0

In [394]:
max_len = 200

In [395]:
def encode_sentence(s, N=max_len):
    enc = np.zeros(N, dtype=np.int32)
    # use index to represent words
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    # cut it length > N
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc

In [396]:
x_train_len = np.minimum(x_train_len, max_len)
x_val_len = np.minimum(x_val_len, max_len)

In [397]:
x_train = np.vstack([encode_sentence(x) for x in X_train])
x_train.shape

(640, 200)

In [398]:
x_val = np.vstack([encode_sentence(x) for x in X_val])
x_val.shape

(160, 200)

## Embedding layer

In [399]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size=100):
        super(CBOW, self).__init__()
        # initialize random embedding matrix
        self.word_emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.linear = nn.Linear(emb_size, 1)
        
    def forward(self, x, s):
        x = self.word_emb(x)
        x = x.sum(dim=1)/ s
        x = self.linear(x) 
        return x

In [400]:
CBOW(vocab_size=5, emb_size=4)

CBOW(
  (word_emb): Embedding(5, 4, padding_idx=0)
  (linear): Linear(in_features=4, out_features=1, bias=True)
)

In [401]:
V = len(words)
model = CBOW(vocab_size=V, emb_size=50)
print(V)

2611


In [402]:
def val_metrics(model):
    model.eval()
    x = torch.LongTensor(x_val) #.cuda()
    y = torch.Tensor(y_val).unsqueeze(1) #).cuda()
    s = torch.Tensor(x_val_len).view(x_val_len.shape[0], 1)
    y_hat = model(x, s)
    loss = F.binary_cross_entropy_with_logits(y_hat, y)
    y_pred = y_hat > 0
    correct = (y_pred.float() == y).float().sum()
    accuracy = correct/y_pred.shape[0]
    return loss.item(), accuracy.item()

In [403]:
val_metrics(model) # about 0.5

(0.6979413032531738, 0.5)

In [404]:
def train_epocs(model, epochs=10, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        x = torch.LongTensor(x_train)  #.cuda()
        y = torch.Tensor(y_train).unsqueeze(1)
        s = torch.Tensor(x_train_len).view(x_train_len.shape[0], 1)
        y_hat = model(x, s) # predicted value
        loss = F.binary_cross_entropy_with_logits(y_hat, y) # get loss
        optimizer.zero_grad() # set the gradients to zero 
        loss.backward()
        optimizer.step()
        val_loss, val_accuracy = val_metrics(model)
        print("train_loss %.6f val_loss %.3f val_accuracy %.3f" % (loss.item(), val_loss, val_accuracy))

In [405]:
train_epocs(model, epochs=20, lr=0.1)

train_loss 0.690264 val_loss 0.660 val_accuracy 0.544
train_loss 0.643663 val_loss 0.627 val_accuracy 0.663
train_loss 0.547587 val_loss 0.570 val_accuracy 0.725
train_loss 0.437712 val_loss 0.475 val_accuracy 0.831
train_loss 0.317343 val_loss 0.410 val_accuracy 0.819
train_loss 0.229734 val_loss 0.367 val_accuracy 0.850
train_loss 0.148871 val_loss 0.369 val_accuracy 0.869
train_loss 0.095424 val_loss 0.387 val_accuracy 0.856
train_loss 0.062842 val_loss 0.375 val_accuracy 0.856
train_loss 0.037675 val_loss 0.351 val_accuracy 0.856
train_loss 0.022152 val_loss 0.338 val_accuracy 0.869
train_loss 0.014140 val_loss 0.338 val_accuracy 0.875
train_loss 0.009159 val_loss 0.348 val_accuracy 0.875
train_loss 0.005351 val_loss 0.364 val_accuracy 0.881
train_loss 0.002775 val_loss 0.385 val_accuracy 0.875
train_loss 0.001479 val_loss 0.409 val_accuracy 0.869
train_loss 0.000880 val_loss 0.434 val_accuracy 0.869
train_loss 0.000577 val_loss 0.459 val_accuracy 0.863
train_loss 0.000407 val_loss

In [406]:
val_metrics(model)

(0.5076395273208618, 0.862500011920929)

In [351]:
model = CBOW(vocab_size=V, emb_size=100)

In [352]:
train_epocs(model, epochs=20, lr=0.03)

train_loss 0.690807 val_loss 0.685 val_accuracy 0.575
train_loss 0.673843 val_loss 0.672 val_accuracy 0.625
train_loss 0.655129 val_loss 0.663 val_accuracy 0.663
train_loss 0.630417 val_loss 0.651 val_accuracy 0.675
train_loss 0.600020 val_loss 0.632 val_accuracy 0.706
train_loss 0.563297 val_loss 0.612 val_accuracy 0.731
train_loss 0.523042 val_loss 0.595 val_accuracy 0.738
train_loss 0.479391 val_loss 0.579 val_accuracy 0.738
train_loss 0.435078 val_loss 0.559 val_accuracy 0.744
train_loss 0.390017 val_loss 0.538 val_accuracy 0.788
train_loss 0.346103 val_loss 0.522 val_accuracy 0.775
train_loss 0.303146 val_loss 0.510 val_accuracy 0.750
train_loss 0.262440 val_loss 0.497 val_accuracy 0.762
train_loss 0.224202 val_loss 0.481 val_accuracy 0.775
train_loss 0.189282 val_loss 0.473 val_accuracy 0.781
train_loss 0.157886 val_loss 0.471 val_accuracy 0.806
train_loss 0.130416 val_loss 0.467 val_accuracy 0.806
train_loss 0.106769 val_loss 0.460 val_accuracy 0.825
train_loss 0.086825 val_loss

In [353]:
val_metrics(model)

(0.46491122245788574, 0.8125)

### SGD

In [103]:
from torch.utils.data import Dataset, DataLoader

In [108]:
def encode_sentence2(s, N=200):
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc, l 

In [109]:
class SubjectivityDataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        x, s = encode_sentence2(x)
        return x, self.y[idx], s
    
sub_dataset_train = SubjectivityDataset(X_train, y_train)

In [110]:
train_loader = DataLoader(sub_dataset_train, batch_size=5, shuffle=True)
x, y, s = next(iter(train_loader))

In [259]:
# initiate
model = CBOW(vocab_size=V, emb_size=100)

In [126]:
train_loader = DataLoader(sub_dataset_train, batch_size=500, shuffle=True)

In [127]:
def train_epocs_sgd(model, epochs=10, lr=0.02):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr) # what is optimizer?
    for i in range(epochs):
        total_loss = 0
        total = 0
        model.train()
        for x, y, s in train_loader:
            x = x.type(torch.LongTensor)  #.cuda()
            y = y.type(torch.FloatTensor).unsqueeze(1)
            s = s.type(torch.Tensor).view(s.shape[0], 1)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += x.size(0)*loss.item()
            total += x.size(0)
            train_loss = total_loss/total
        val_loss, val_accuracy = val_metrics(model)
        
        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (train_loss, val_loss, val_accuracy))

In [128]:
train_epocs_sgd(model, epochs=20)

train_loss 0.698 val_loss 0.676 val_accuracy 0.613
train_loss 0.666 val_loss 0.674 val_accuracy 0.531
train_loss 0.639 val_loss 0.656 val_accuracy 0.556
train_loss 0.599 val_loss 0.609 val_accuracy 0.781
train_loss 0.545 val_loss 0.567 val_accuracy 0.800
train_loss 0.491 val_loss 0.527 val_accuracy 0.819
train_loss 0.428 val_loss 0.486 val_accuracy 0.844
train_loss 0.358 val_loss 0.457 val_accuracy 0.819
train_loss 0.294 val_loss 0.436 val_accuracy 0.831
train_loss 0.241 val_loss 0.407 val_accuracy 0.850
train_loss 0.192 val_loss 0.368 val_accuracy 0.856
train_loss 0.151 val_loss 0.339 val_accuracy 0.856
train_loss 0.121 val_loss 0.326 val_accuracy 0.856
train_loss 0.095 val_loss 0.324 val_accuracy 0.850
train_loss 0.074 val_loss 0.331 val_accuracy 0.856
train_loss 0.059 val_loss 0.332 val_accuracy 0.856
train_loss 0.047 val_loss 0.324 val_accuracy 0.856
train_loss 0.037 val_loss 0.314 val_accuracy 0.844
train_loss 0.029 val_loss 0.308 val_accuracy 0.850
train_loss 0.024 val_loss 0.309

## test accuracy

In [407]:
test_neg_path = "aclImdb/small_data/test_neg/"
test_pos_path = "aclImdb/small_data/test_pos/"

In [408]:
test_neg_content, test_neg_y = gettexttodf(neg_path)
test_pos_content, test_pos_y = gettexttodf(pos_path)
content = test_neg_content+test_pos_content

### without pre-processing text

In [368]:
def prediction(model, content):
    model.eval()
    
    code = []
    x_len = []
    for i in [encode_sentence2(x) for x in content]:
        code.append(i[0])
        x_len.append(i[1])
        
    x_len = np.minimum(x_len, 200)
    x = torch.LongTensor(code) #.cuda()
#     y = torch.Tensor(y_val).unsqueeze(1) #).cuda()
    s = torch.Tensor(x_len).view(x_len.shape[0], 1)
    y_hat = model(x, s)
    
    return (y_hat > 0)*1

In [371]:
# prediction(model, test_pos_content)

In [372]:
print(f'the predict accuracy of positive is: {(prediction(model, test_pos_content) == 1).sum()/len(pos_y):.2f}')

the predict accuracy of positive is: 0.94


In [373]:
print(f'the predict accuracy of negative is: {(prediction(model, test_neg_content) == 0).sum()/len(test_neg_y):.2f}')

the predict accuracy of negative is: 0.95


### with pre-processing text

In [411]:
test_pos_content = [normalizewords(tokenize(line)) for line in test_pos_content]
test_neg_content = [normalizewords(tokenize(line)) for line in test_neg_content]

In [412]:
def prediction(model, content):
    model.eval()
    
    code = []
    x_len = []
    for i in [encode_sentence2(x) for x in content]:
        code.append(i[0])
        x_len.append(i[1])
        
    x_len = np.minimum(x_len, 200)
    x = torch.LongTensor(code) #.cuda()
#     y = torch.Tensor(y_val).unsqueeze(1) #).cuda()
    s = torch.Tensor(x_len).view(x_len.shape[0], 1)
    y_hat = model(x, s)
    
    return (y_hat > 0)*1

In [413]:
print(f'the predict accuracy of positive is: {(prediction(model, test_pos_content) == 1).sum()/len(test_pos_y):.2f}')

the predict accuracy of positive is: 0.95


In [414]:
print(f'the predict accuracy of negative is: {(prediction(model, test_neg_content) == 0).sum()/len(test_neg_y):.2f}')

the predict accuracy of negative is: 0.99
