In [3]:
import torch
import torchtext
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.nn import functional as F
import warnings

In [4]:
warnings.filterwarnings("ignore")

# Text Preprocecssing

In [5]:
#reading the data with file handlers
reviewfile = open('reviews.txt', 'r')
reviews = list(map(lambda x:x[:-1], reviewfile.readlines()))

labelfile = open('labels.txt', 'r')
label = list(map(lambda x:x[:-1], labelfile.readlines()))

label = [i.replace('negative\\', '0') for i in label]
label = [i.replace('positive\\', '1') for i in label]

In [30]:
#converting all of the labels into integer values
labels = []
for i in label:
    integer = int(i)
    labels.append(integer)

In [31]:
#tokenizing and indicising the data
train_data, test_data = reviews[:8000], reviews[8000:11000]
train_labels, test_labels = labels[:8000], labels[8000:11000]

tokenizer = get_tokenizer("basic_english")

def build_vocabulary(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocabulary([train_data, test_data]), min_freq=2, specials=["<UNK>"])

vocab.set_default_index(vocab["<UNK>"])

In [16]:
#the length of the total vocabulary is
len(vocab)

32322

In [32]:
class dataset:
    def __init__(self, indicies_list, labels_list):
        self.indices = indicies_list
        self.labels = labels_list
        
        
    def __len__(self):
        return len(self.indices)
    

    
    def __getitem__(self, idx):
        return self.labels[idx], self.indices[idx]

In [33]:
train_set = dataset(train_data, train_labels)
test_set = dataset(train_data, train_labels)

In [40]:
target_classes = ['positive', 'negative']
max_words = 25

#zero_padding the indicized sentences and loading them
def vectorize_batch(batch):
    Y, X = list(zip(*batch))
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32), torch.tensor(Y) ## We have deducted 1 from target names to get them in range [0,1,2,3] from [1,2,3,4]


#creating a dataloader with 1024 btch size and also using the collate_fn
train_loader = DataLoader(train_set, batch_size=25, collate_fn=vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_set , batch_size=25, collate_fn=vectorize_batch)


# RNN Classification

In [47]:
embed_len = 100
hidden_dim = 50
n_layers=1

#developing the model architechture
class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim)
        self.linear = nn.Linear(hidden_dim, len(target_classes))

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings, torch.randn(n_layers, len(X_batch), hidden_dim))
        return self.linear(output[:,-1])

In [48]:
rnn_classifier = RNNClassifier()
rnn_classifier

RNNClassifier(
  (embedding_layer): Embedding(32322, 100)
  (rnn): RNN(100, 50)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

In [49]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [50]:
from torch.optim import Adam

epochs = 15
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()
rnn_classifier = RNNClassifier()
optimizer = Adam(rnn_classifier.parameters(), lr=learning_rate)

TrainModel(rnn_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 29.82it/s]


Train Loss : 0.704
Valid Loss : 0.686
Valid Acc  : 0.555


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 31.10it/s]


Train Loss : 0.690
Valid Loss : 0.677
Valid Acc  : 0.576


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 31.40it/s]


Train Loss : 0.681
Valid Loss : 0.669
Valid Acc  : 0.588


100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 32.51it/s]


Train Loss : 0.671
Valid Loss : 0.651
Valid Acc  : 0.613


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 30.24it/s]


Train Loss : 0.658
Valid Loss : 0.632
Valid Acc  : 0.626


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 30.86it/s]


Train Loss : 0.640
Valid Loss : 0.613
Valid Acc  : 0.643


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 30.91it/s]


Train Loss : 0.624
Valid Loss : 0.597
Valid Acc  : 0.647


100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 32.98it/s]


Train Loss : 0.605
Valid Loss : 0.599
Valid Acc  : 0.630


100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 32.70it/s]


Train Loss : 0.591
Valid Loss : 0.567
Valid Acc  : 0.659


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 30.61it/s]


Train Loss : 0.578
Valid Loss : 0.545
Valid Acc  : 0.684


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 29.98it/s]


Train Loss : 0.567
Valid Loss : 0.548
Valid Acc  : 0.668


100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 32.18it/s]


Train Loss : 0.556
Valid Loss : 0.533
Valid Acc  : 0.682


100%|█████████████████████████████████████████| 320/320 [00:10<00:00, 29.39it/s]


Train Loss : 0.550
Valid Loss : 0.527
Valid Acc  : 0.682


100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 33.47it/s]


Train Loss : 0.543
Valid Loss : 0.509
Valid Acc  : 0.701


100%|█████████████████████████████████████████| 320/320 [00:11<00:00, 27.93it/s]


Train Loss : 0.540
Valid Loss : 0.510
Valid Acc  : 0.698


# LSTM Classification

In [53]:
#defining the LSTM architechture
embed_len = 100
hidden_dim = 75
n_layers = 1
class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.lstm = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, len(target_classes))
        
    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        hidden, carry = torch.randn(n_layers, len(X_batch), hidden_dim), torch.randn(n_layers, len(X_batch), hidden_dim)
        output, (hidden, carry) = self.lstm(embeddings, (hidden, carry))
        return self.linear(output[:,-1])

In [55]:
model = LSTMClassifier()
model

LSTMClassifier(
  (embedding_layer): Embedding(32322, 100)
  (lstm): LSTM(100, 75, batch_first=True)
  (linear): Linear(in_features=75, out_features=2, bias=True)
)

In [56]:
from torch.optim import Adam

epochs = 15
learning_rate = 1e-3 

loss_fn = nn.CrossEntropyLoss()
model = LSTMClassifier()
optimizer = Adam(model.parameters(), lr=learning_rate)

TrainModel(model, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 24.82it/s]


Train Loss : 0.690
Valid Loss : 0.637
Valid Acc  : 0.647


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.01it/s]


Train Loss : 0.611
Valid Loss : 0.509
Valid Acc  : 0.775


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.48it/s]


Train Loss : 0.479
Valid Loss : 0.390
Valid Acc  : 0.819


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.14it/s]


Train Loss : 0.359
Valid Loss : 0.254
Valid Acc  : 0.919


100%|█████████████████████████████████████████| 320/320 [00:13<00:00, 23.17it/s]


Train Loss : 0.234
Valid Loss : 0.138
Valid Acc  : 0.961


100%|█████████████████████████████████████████| 320/320 [00:13<00:00, 23.93it/s]


Train Loss : 0.141
Valid Loss : 0.072
Valid Acc  : 0.980


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 24.70it/s]


Train Loss : 0.075
Valid Loss : 0.049
Valid Acc  : 0.990


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.82it/s]


Train Loss : 0.044
Valid Loss : 0.029
Valid Acc  : 0.993


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.35it/s]


Train Loss : 0.028
Valid Loss : 0.010
Valid Acc  : 0.999


100%|█████████████████████████████████████████| 320/320 [00:14<00:00, 22.17it/s]


Train Loss : 0.013
Valid Loss : 0.006
Valid Acc  : 0.999


100%|█████████████████████████████████████████| 320/320 [00:14<00:00, 22.75it/s]


Train Loss : 0.012
Valid Loss : 0.009
Valid Acc  : 0.998


100%|█████████████████████████████████████████| 320/320 [00:13<00:00, 24.61it/s]


Train Loss : 0.009
Valid Loss : 0.008
Valid Acc  : 0.999


100%|█████████████████████████████████████████| 320/320 [00:12<00:00, 25.33it/s]


Train Loss : 0.019
Valid Loss : 0.028
Valid Acc  : 0.990


100%|█████████████████████████████████████████| 320/320 [00:13<00:00, 24.62it/s]


Train Loss : 0.008
Valid Loss : 0.002
Valid Acc  : 1.000


100%|█████████████████████████████████████████| 320/320 [00:14<00:00, 21.37it/s]


Train Loss : 0.011
Valid Loss : 0.009
Valid Acc  : 0.998
