In [1]:
import torch
from torchtext import data

SEED = 1234
import pandas as pd
import numpy as np
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

import nltk

import random
from sklearn.metrics import classification_report

import pyprind
%matplotlib inline

In [3]:
import spacy
spacy_en = spacy.load('en')



    Only loading the 'en' tokenizer.



In [4]:
is_cuda = torch.cuda.is_available()
print("Cuda Status on system is {}".format(is_cuda))
def tokenizer(text):
    return [tok for tok in nltk.word_tokenize(text)]

TEXT = data.Field(sequential=True, tokenize="spacy")
LABEL = data.LabelField(dtype=torch.long, sequential=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data, test_data = data.TabularDataset.splits(
    path="C:/Users/user/Desktop/mlmd/torchtext_data/", train="train.csv", test="test.csv",format="csv", skip_header=True, 
    fields=[('Text', TEXT), ('Label', LABEL)]
)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

TEXT.build_vocab(train_data, vectors=torchtext.vocab.FastText(language='en'), 
                 max_size=20000, min_freq=10)
LABEL.build_vocab(train_data)

Cuda Status on system is True


    Only loading the 'en' tokenizer.



.vector_cache\wiki.en.vec: 0.00B [00:00, ?B/s]

Number of training examples: 8252
Number of testing examples: 2062


.vector_cache\wiki.en.vec: 6.60GB [11:00, 9.99MB/s]                                                                    
  0%|                                                                                      | 0/2519370 [00:00<?, ?it/s]Skipping token b'2519370' with 1-dimensional vector [b'300']; likely a header
100%|█████████████████████████████████████████████████████████████████████▉| 2518627/2519370 [06:23<00:00, 7005.77it/s]

In [5]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
TEXT.vocab.freqs.most_common(10)

Unique tokens in TEXT vocabulary: 1330
Unique tokens in LABEL vocabulary: 2


[('.', 4440),
 ('!', 4408),
 (',', 3406),
 ('I', 3117),
 ('to', 3048),
 (' ', 3010),
 ('the', 2817),
 ('a', 2388),
 ('and', 2084),
 ('you', 1949)]

In [6]:
LABEL.vocab.freqs

Counter({'0': 6400, '1': 1852})

In [7]:
BATCH_SIZE = 20

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# keep in mind the sort_key option 
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), sort_key=lambda x: len(x.Text),
    batch_size=BATCH_SIZE,
    device=device)

In [30]:
class simpleRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        out = self.fc(hidden)
        return out

In [9]:
class lstmRNN(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(lstmRNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size = embedding_dim,hidden_size = hidden_size, num_layers = 1)
        self.fc = nn.Linear(hidden_size, 2)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        out = self.fc(hidden)
        return out

In [36]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    preds, ind= torch.max(F.softmax(preds, dim=-1), 1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc
def train(epochs, model, iterator, optimizer, criterion):
    for epoch in range(1,epochs+1):
    
        training_loss = 0.0 
        epoch_acc = 0

        model.train()
        for batch in iterator:

            optimizer.zero_grad()

            predictions = model(batch.Text).squeeze(0)
    #         print(predictions.shape, batch.Label.shape, model(batch.Text).shape)
            loss = criterion(predictions, batch.Label)
    #         print(loss.shape)
            acc = binary_accuracy(predictions, batch.Label)

            loss.backward()

            optimizer.step()

            training_loss += loss.item()
            epoch_acc += acc.item()
        training_loss /= len(iterator)
        epoch_acc /= len(iterator)
        print(f'Epoch: {epoch+1:02}, Train Loss: {training_loss:.3f}, Train Acc: {epoch_acc*100:.2f}% ')

def train1(epochs, model, iterator, optimizer, criterion):
    for epoch in range(1,epochs+1):
    
        training_loss = 0.0
        epoch_acc = 0.0

        model.train()
        for batch_idx, batch in enumerate(iterator):

            optimizer.zero_grad()

            predictions = model(batch.Text)
        #         print(predictions.shape, batch.Label.shape, model(batch.Text).shape)
            loss = criterion(predictions, batch.Label)
        #         print(loss.shape)
            acc = binary_accuracy(predictions, batch.Label)

            loss.backward()

            optimizer.step()

            training_loss += loss.item()
            epoch_acc += acc.item()
        
        training_loss /= len(iterator)
        epoch_acc /= len(iterator)
        print(f'Epoch: {epoch+1:02}, Train Loss: {training_loss:.3f}, Train Acc: {epoch_acc*100:.2f}% ')


In [11]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:

            predictions = model(batch.Text).squeeze(0)
            
            loss = criterion(predictions, batch.Label)
            
            acc = binary_accuracy(predictions, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



def predict_sentiment(sentence,model):
    tokenized = [tok for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    
    tensor = tensor.unsqueeze(1)
#     print(tensor.shape)
    prediction = model(tensor)
#     print(prediction)
    preds, ind= torch.max(F.softmax(prediction, dim=-1), 1)
#     print(preds)
    return preds, ind


In [37]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 374
OUTPUT_DIM = 2

model1 = simpleRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
pretrained_embeddings = TEXT.vocab.vectors
model1.embedding.weight.data = pretrained_embeddings.cuda()
class_weights = torch.tensor([1.0, 15.0]).cuda()
optimizer1 = optim.Adam(model1.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(weight=class_weights)

model1 = model1.to(device)
criterion = criterion.to(device)

pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([1330, 300])


In [38]:
train(100,model1,train_iterator, optimizer1, criterion)

Epoch: 02, Train Loss: 0.535, Train Acc: 23.69% 
Epoch: 03, Train Loss: 0.524, Train Acc: 25.01% 
Epoch: 04, Train Loss: 0.539, Train Acc: 26.30% 
Epoch: 05, Train Loss: 0.515, Train Acc: 23.71% 
Epoch: 06, Train Loss: 0.527, Train Acc: 24.46% 
Epoch: 07, Train Loss: 0.522, Train Acc: 23.67% 
Epoch: 08, Train Loss: 0.527, Train Acc: 23.51% 
Epoch: 09, Train Loss: 0.511, Train Acc: 22.96% 
Epoch: 10, Train Loss: 0.528, Train Acc: 25.23% 
Epoch: 11, Train Loss: 0.501, Train Acc: 23.28% 
Epoch: 12, Train Loss: 0.504, Train Acc: 23.34% 
Epoch: 13, Train Loss: 0.519, Train Acc: 23.20% 
Epoch: 14, Train Loss: 0.502, Train Acc: 23.61% 
Epoch: 15, Train Loss: 0.508, Train Acc: 24.65% 
Epoch: 16, Train Loss: 0.512, Train Acc: 24.56% 
Epoch: 17, Train Loss: 0.540, Train Acc: 26.12% 
Epoch: 18, Train Loss: 0.512, Train Acc: 23.97% 
Epoch: 19, Train Loss: 0.507, Train Acc: 24.39% 
Epoch: 20, Train Loss: 0.514, Train Acc: 23.56% 
Epoch: 21, Train Loss: 0.517, Train Acc: 24.37% 
Epoch: 22, Train Los

In [39]:
test_loss, test_acc = evaluate(model1, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

0% [██████████████████████████████] 100% | ETA: 00:00:00

| Test Loss: 1.322 | Test Acc: 31.63% |



Total time elapsed: 00:00:00


In [22]:
test = pd.read_csv("C:/Users/user/Desktop/mlmd/torchtext_data/test.csv")

In [41]:
def predict_sentiment2(sentence,model):
    tokenized = [tok for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    
    tensor = tensor.unsqueeze(1)
#     print(tensor.shape)
    prediction = model(tensor)
#     print(prediction)
    preds, ind= torch.max(F.softmax(prediction.squeeze(0), dim=-1), 1)
#     print(preds)
    return preds, ind

In [42]:
pre = [predict_sentiment2(k,model1)[1].item() for k in test.message]
print(classification_report(test.label, pre))

              precision    recall  f1-score   support

           0       0.94      0.08      0.15      1600
           1       0.24      0.98      0.38       462

    accuracy                           0.28      2062
   macro avg       0.59      0.53      0.26      2062
weighted avg       0.78      0.28      0.20      2062



In [19]:
model2 = lstmRNN(HIDDEN_DIM,EMBEDDING_DIM,INPUT_DIM)
model2 = model2.to(device)
optimizer2 = optim.Adam(model2.parameters(), lr=1e-3)


In [21]:
train1(100,model2,train_iterator, optimizer2, criterion)

Epoch: 02, Train Loss: 0.225, Train Acc: 72.03% 
Epoch: 03, Train Loss: 0.042, Train Acc: 99.18% 
Epoch: 04, Train Loss: 0.025, Train Acc: 99.24% 
Epoch: 05, Train Loss: 0.025, Train Acc: 99.41% 
Epoch: 06, Train Loss: 0.008, Train Acc: 99.67% 
Epoch: 07, Train Loss: 0.007, Train Acc: 99.67% 
Epoch: 08, Train Loss: 0.005, Train Acc: 99.77% 
Epoch: 09, Train Loss: 0.003, Train Acc: 99.90% 
Epoch: 10, Train Loss: 0.002, Train Acc: 99.89% 
Epoch: 11, Train Loss: 0.002, Train Acc: 99.84% 
Epoch: 12, Train Loss: 0.001, Train Acc: 99.90% 
Epoch: 13, Train Loss: 0.001, Train Acc: 99.95% 
Epoch: 14, Train Loss: 0.005, Train Acc: 99.82% 
Epoch: 15, Train Loss: 0.004, Train Acc: 99.85% 
Epoch: 16, Train Loss: 0.007, Train Acc: 99.65% 
Epoch: 17, Train Loss: 0.003, Train Acc: 99.82% 
Epoch: 18, Train Loss: 0.002, Train Acc: 99.94% 
Epoch: 19, Train Loss: 0.001, Train Acc: 99.96% 
Epoch: 20, Train Loss: 0.001, Train Acc: 99.98% 
Epoch: 21, Train Loss: 0.001, Train Acc: 99.98% 
Epoch: 22, Train Los

In [23]:
test_loss, test_acc = evaluate(model2, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

0% [██████████████████████████████] 100% | ETA: 00:00:00

| Test Loss: 0.062 | Test Acc: 99.71% |



Total time elapsed: 00:00:00


In [24]:
pre = [predict_sentiment(k,model2)[1].item() for k in test.message]

print(classification_report(test.label, pre))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1600
           1       1.00      0.71      0.83       462

    accuracy                           0.93      2062
   macro avg       0.96      0.85      0.89      2062
weighted avg       0.94      0.93      0.93      2062

