In [1]:
import pandas as pd

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets

import spacy
import numpy as np

import time
import random

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
train_path = '../tweet-sentiment-extraction/train.csv'
test_path = '../tweet-sentiment-extraction/test.csv'

In [5]:
cols

NameError: name 'cols' is not defined

In [6]:
text_field = data.Field(sequential=True)
text_id_field = data.Field(sequential=False)
tse = data.TabularDataset(
    path=train_path, format='csv',
    fields={'textID': ('textID', text_id_field),
            'text': ('text', text_field),
            'selected_text': ('selected_text', data.Field(sequential=True)),
            'sentiment': ('labels', data.Field(sequential=False))})

In [7]:
text_field.build_vocab(tse, 
                 min_freq = 20,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)


In [8]:
len(text_field.vocab)

1591

In [9]:
data.Field?

In [10]:
# for batch in train_iterator:
# #     print(batch.text.shape)
# #     print(batch.udtags)
#     break

In [11]:
TEXT = data.Field(lower = True)
UD_TAGS = data.Field(unk_token = None)
PTB_TAGS = data.Field(unk_token = None)

In [12]:
# z = datasets.UDPOS(path='')
# z

In [13]:
fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))

In [14]:
train_data, val_data, test_data = datasets.UDPOS.splits(fields)

In [15]:
# train_data[0].udtags

In [16]:
type(train_data)

torchtext.datasets.sequence_tagging.UDPOS

In [17]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 12543
Number of validation examples: 2002
Number of testing examples: 2077


In [18]:
# print(vars(val_data[0]))

In [19]:
MIN_FREQ = 2

TEXT.build_vocab(train_data, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)


UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

In [20]:
# print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
# print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
# print(f"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}")

In [21]:
# print(TEXT.vocab.freqs.most_common(20))

In [22]:
BATCH_SIZE = 32

device = torch.device('cuda:0')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [23]:
class POSTagger(nn.Module):
    def __init__(self,
                input_dim,
                output_dim,
                embedding_dim,
                hidden_dim,
                pad_idx,
                dropout_rate,
                num_layers,
                bidirectional
                ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        
        if num_layers == 1:
            dropout_rate=0
            
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                            num_layers=num_layers, bidirectional=bidirectional, dropout=dropout_rate)
        
        self.dropout = nn.Dropout(dropout_rate)
        
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embeddings = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embeddings)
        return self.fc(self.dropout(output))

In [24]:
# print(TEXT.vocab.freqs.most_common(10))
# (UD_TAGS.vocab.freqs)

In [25]:
# len(TEXT.vocab)

In [26]:
# len(TEXT.vocab.freqs)

In [27]:
input_dim = len(TEXT.vocab)
output_dim = len(UD_TAGS.vocab)
hidden_dim = 64
num_layers = 1
bidirectional = False
dropout_rate = 0.25
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
embedding_dim = 100

model = POSTagger(input_dim=input_dim,
                 output_dim=output_dim,
                 hidden_dim=hidden_dim,
                 num_layers=num_layers,
                 bidirectional=bidirectional,
                 dropout_rate=dropout_rate,
                 embedding_dim=embedding_dim,
                 pad_idx=pad_idx)

tag_pad_idx = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_idx)

In [28]:
# init_weights(0)

In [29]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)
        
model.apply(init_weights)
pass

In [30]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# print(f'The model has {count_parameters(model):,} trainable parameters')

In [31]:
# for z in model.parameters():
#     print(z.numel())
#     break

In [32]:
pretrained_embeddings = TEXT.vocab.vectors

# print(pretrained_embeddings.shape)

In [33]:
# x = torch.randn(1,2).to(torch.device('cuda:0'))

In [34]:
model.embedding.weight.data.copy_(pretrained_embeddings)
pass

In [35]:
model.embedding.weight.data[1] = torch.zeros(embedding_dim)

In [36]:
# print(model.embedding.weight.data)

In [37]:
optimizer = optim.Adam(model.parameters())

In [38]:
criterion.to(device)
model.to(device)
pass

In [39]:
import os; os.getpid()

46984

In [40]:
def custom_categorical_accuracy(y_pred, y_true, tag_pad_idx):
    y_pred = y_pred.argmax(dim=1, keepdim=True)
#     print(y_true)
#     print(tag_pad_idx)
#     print(y_true!=tag_pad_idx)
    non_pad_element_idxs = (y_true!=tag_pad_idx).nonzero()
    correct = y_pred[non_pad_element_idxs].squeeze(1).eq(y_true[non_pad_element_idxs])
    return correct.sum() / torch.FloatTensor([y_true[non_pad_element_idxs].shape[0]])

In [41]:
from pprint import pprint

In [42]:
# for batch in train_iterator:
#     text = batch.text
#     tags = batch.udtags
# #     print(text.shape)
#     # print(type(text[0][0]))
#     # print(text[0][0].cpu().data.numpy())
#     # print(text)
#     # for text_ in text.T:
#     #     pprint(text_)
#     #     print()
# #     print(tags.shape)
#     predictions = model(text)
#     print(predictions.shape)
#     predictions = predictions.view(-1, predictions.shape[-1])
#     print(predictions.shape)
#     print(tags.shape)
#     tags = tags.view(-1)
#     print(tags.shape)
#     # print()
# #     print(predictions.shape)
#     # print(tags)
#     break

In [43]:
# # torch.seed()
# z = torch.randn(2,1)
# z2 = torch.randn(2,1)

In [44]:
# criterion(z,z2)

In [45]:
# criterion(torch.FloatTensor(1), torch.FloatTensor(2))

In [53]:
for batch in train_iterator:
    text = batch.text
    tags = batch.udtags

    y_pred = model(text)
    print(y_pred.shape)
    y_pred = y_pred.view(-1, y_pred.shape[-1])
    print(tags.shape)
    tags = tags.view(-1)
    
    loss = criterion(y_pred, tags)
    print(loss)
    break

torch.Size([39, 32, 18])
torch.Size([39, 32])
tensor(2.9586, device='cuda:0', grad_fn=<NllLossBackward>)


In [114]:
# for batch in train_iterator:
#     text = batch.text
#     tags = batch.udtags

#     y_pred = model(text)
#     print(text.shape)
#     print(y_pred.shape)
#     print(y_pred[0][0])
#     y_pred = y_pred.view(-1, y_pred.shape[-1])
#     tags = tags.view(-1)
#     print(y_pred.shape)
    
#     break

In [None]:
# nn.Sof?

In [39]:
def train_one_epoch(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    optimizer.zero_grad()
    
    for batch in train_iterator:
        text = batch.text
        tags = batch.udtags
        
        y_pred = model(text)
        
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        
        tags = tags.view(-1)
#         print(tags)
        loss = criterion(y_pred, tags)
        acc = custom_categorical_accuracy(y_pred, tags, tag_pad_idx)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator), epoch_acc / len(iterator)

In [53]:
def val_one_epoch(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
        
        for batch in iterator:
            text = batch.text
            tags = batch.udtags

            y_pred = model(text)
            y_pred = y_pred.view(-1, y_pred.shape[-1])

            tags = tags.view(-1)
            

            loss = criterion(y_pred, tags)
            acc = custom_categorical_accuracy(y_pred, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss/len(iterator), epoch_acc / len(iterator)

In [54]:
# device_cpu = torch.device('cpu')

In [55]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [56]:
# for batch in train_iterator:
#     pass

In [57]:
# vars(train_iterator.data()[0])

In [58]:
# for e in train_iterator.data():
# #     print(e.text)
#     try:
#         for udtag in e.udtags:
#             z = (UD_TAGS.vocab.stoi[udtag])
#     except Exception as e:
#         print(e)
#         print(vars(e))
# #     break

In [59]:
# model.to(torch.device('cpu'))

In [60]:
# model.to(device_cpu)
# criterion.to(device_cpu)
n_epochs = 10

best_val_loss = float('inf')

for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss, train_acc = train_one_epoch(model, train_iterator, optimizer, criterion, tag_pad_idx)
    val_loss, val_acc = val_one_epoch(model, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()
    
    epoch_time_min, epoch_time_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_time_min}m {epoch_time_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
    

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.246 | Train Acc: 91.86%
	 Val. Loss: 0.560 |  Val. Acc: 84.22%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.233 | Train Acc: 92.15%
	 Val. Loss: 0.572 |  Val. Acc: 83.70%
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.237 | Train Acc: 92.24%
	 Val. Loss: 0.572 |  Val. Acc: 82.02%
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.233 | Train Acc: 92.28%
	 Val. Loss: 0.573 |  Val. Acc: 82.19%
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.224 | Train Acc: 92.43%
	 Val. Loss: 0.570 |  Val. Acc: 84.84%
Epoch: 06 | Epoch Time: 0m 2s
	Train Loss: 0.229 | Train Acc: 92.25%
	 Val. Loss: 0.619 |  Val. Acc: 82.63%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.222 | Train Acc: 92.41%
	 Val. Loss: 0.568 |  Val. Acc: 83.88%
Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 0.221 | Train Acc: 92.48%
	 Val. Loss: 0.590 |  Val. Acc: 82.47%
Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 0.216 | Train Acc: 92.69%
	 Val. Loss: 0.581 |  Val. Acc: 82.92%
Epoch: 10 | Epoch Time: 0m 1

In [47]:
# x = torch.randn(100,1000)
# y = torch.randn(1000,1000)

In [55]:
# device = torch.device('cpu')

In [50]:
# x.to(device); y.to(device)

In [51]:
# z = torch.matmul(x,y)

In [155]:
# x = torch.randn((32,18))

In [154]:
# x.nonzero().shape

In [52]:
# loss = nn.CrossEntropyLoss()
# input_ = torch.randn(3, 5, requires_grad=True)
# target = torch.empty(3, dtype=torch.long).random_(5)
# print(input_); print(target)
# output = loss(input_, target)