In [1]:
import torch
from torchtext.data import TabularDataset

# torch.data

In [2]:
# Field - specifies torch how to handle data.

In [3]:
from torchtext.data.utils import get_tokenizer
import string
import spacy


nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'tagger'])


all_stopwords = list(nlp.Defaults.stop_words)
stop_punctuations = list(string.punctuation)

all_stopwords.extend(stop_punctuations)
all_stopwords.extend(["/><br", "---", "...", "-pron-"])

In [245]:
# Preprocessing
def tokenizer(document):

    import re
    import string

    doc = nlp(document)
    lemmas = [token.lemma_ for token in doc]

    return lemmas

In [168]:
from torchtext import data
text_field = data.Field(sequential = True,
                        tokenize = tokenizer, 
                        use_vocab=True, #Set to false for numeric
                        lower=True,
                        pad_token=None,
                        unk_token='<unk>',
                        stop_words=all_stopwords, 
                        batch_first = True
                       )
target_field = data.Field(sequential = False,
                         use_vocab=False, # Set to false for numeric
                         batch_first = True,
                         dtype=torch.int64
                         )


#Specify how to handle each column in the data
imdb_datafields = [("index", None),
                   ("review", text_field), 
                   ("sentiment", target_field)]

In [169]:
train_ds, test_ds = data.TabularDataset.splits(path = "data",
                                           train = "imdb_reviews_train.csv",
                                           test = "imdb_reviews_test.csv",
                                           format = "csv",
                                           fields = imdb_datafields,
                                           skip_header=True 
                                           )

In [170]:
print("Number of Documents in the Corpus:")
print("Train : {}".format(len(train_ds)))
print(" Test : {}".format(len(test_ds)))

Number of Documents in the Corpus:
Train : 25000
 Test : 25000


In [171]:
text_field.build_vocab(train_ds, max_size = VOCAB_SIZE, min_freq = 50)

In [172]:
from torch.utils.data import DataLoader

In [211]:
MINI_BATCH_SIZE = 32

In [230]:
# Postprocessing
def make_bow_vector(batch, max_frequency_of_token):
    
    
    reviews = [batch[idx].review for idx in range(MINI_BATCH_SIZE)]
    sentiment = [int(batch[idx].sentiment) for idx in range(MINI_BATCH_SIZE)]
    
    vec = torch.zeros(MINI_BATCH_SIZE, VOCAB_SIZE+1)

    for idx, review in enumerate(reviews):
        for word in review:
            vec[idx, text_field.vocab.stoi[word]] += 1
    
    # Clipping word frequency
    vec = vec[:,1:].clamp(max= max_frequency_of_token)
    # Scaling counts
    vec/=max_frequency_of_token
    return vec, torch.tensor(sentiment).reshape(-1,1)
    
    


In [231]:
train_dl = DataLoader(dataset=train_ds,
                      batch_size = MINI_BATCH_SIZE,
                      collate_fn = lambda batch : make_bow_vector(batch, 
                                                                  max_frequency_of_token=3)) 
                      # Passing Arguments to collate function

In [237]:
test_dl = DataLoader(dataset=test_ds,
                      batch_size = MINI_BATCH_SIZE,
                      collate_fn = lambda batch : make_bow_vector(batch, 
                                                                  max_frequency_of_token=3)) 
                      # Passing Arguments to collate function

In [232]:
from torch import nn

In [233]:
HIDDEN_UNITS = 8
device = "cuda" if torch.cuda.is_available() else "cpu"

In [243]:
epochs =101

In [244]:
# 1. Model Architecture - Using Sequential
model = nn.Sequential(nn.Linear(in_features = VOCAB_SIZE,
                                out_features = HIDDEN_UNITS, bias=True),
                       nn.ReLU(),
                       nn.Linear(in_features = HIDDEN_UNITS,
                                 out_features = 1),
                       nn.Sigmoid()
                       )
model = model.to(device)

# 2. Loss
criterion = torch.nn.BCELoss()

# 3. Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3)

for epoch in range(epochs):
    model.train()
    all_loss = 0
    test_all_loss = 0
    for batch_X, batch_y in train_dl:

         # 1. Forward Propogation
        y_pred = model(batch_X.float())
        # 2. Compute Loss
        loss = criterion(y_pred, batch_y.float())

        # 3. Ensure gradients are all zero
        optimizer.zero_grad()

        # 4. Back Propogate
        loss.backward()

        #5. Update weights
        optimizer.step()
        
        all_loss+=loss.item()
        
        break
    
    epoch_loss = all_loss/(len(train_dl))
    if epoch%10 == 0:
        print("epoch : {}\tloss : {}".format(epoch, epoch_loss), end = "\t")
    
        model.eval()
        for batch_X, batch_y in test_dl:
            y_pred = model(batch_X.float())
            test_loss = criterion(y_pred, batch_y.float())
            test_all_loss+=test_loss.item()
            break
        test_epoch_loss = test_all_loss/len(test_dl)
        print("test loss : {}".format(test_epoch_loss))

epoch : 0	loss : 0.0010184933005086601	test loss : 0.001015266798951132
epoch : 10	loss : 0.001014237818510636	test loss : 0.0010110559061055292
epoch : 20	loss : 0.0010100032972252886	test loss : 0.0010068631538039888
epoch : 30	loss : 0.001005699872360815	test loss : 0.0010026537853738537
epoch : 40	loss : 0.0010014198472737656	test loss : 0.000998465987422582
epoch : 50	loss : 0.0009971632981849144	test loss : 0.0009942875646264352
epoch : 60	loss : 0.0009929204688352697	test loss : 0.0009901156205960246
epoch : 70	loss : 0.0009886831273813077	test loss : 0.000985934377631263
epoch : 80	loss : 0.0009844525695761755	test loss : 0.000981745665030711
epoch : 90	loss : 0.000980239237665825	test loss : 0.0009775623641050685
epoch : 100	loss : 0.0009760233142491802	test loss : 0.0009733753283615307


In [79]:
from torchtext.data import Iterator
train_dl, test_dl = Iterator.splits(datasets = (train_ds, test_ds),
                                    batch_sizes = (2, 2),
                                    shuffle = False,
                                    sort_within_batch = False


tensor([[  1,   0, 116,  45,   0,   0,   0,   0, 497,   1,   0,   0,  56,  44,
           0,   0,  11,   0,   0,   6,   0,   0,   7,   0,   0,   0,   0, 376,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0],
        [  0,  73,   1, 488,   0, 145, 498,   0,   0,  87, 108,   0,   0,  72,
         439,   0,   0,   0,   0,   0, 321, 190,   0,   0,   0, 412,  11,   0,
           0,   3,   0,   0,  96, 244,   0,   0, 458,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,  25,  61,   0,   0, 182,   0,   0,   0,
           0,   0, 201,   0,   0,   0,   0,   0,  45,   0,   0,  13, 117,   0,
         376,   0,   1,   0,   0,   0