In [0]:
import numpy as np
import pandas as pd
from collections import Counter
import string
import  random
from spacy.lang.ro import Romanian
from spacy.lang.ro.stop_words import STOP_WORDS

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchtext import data

In [0]:
# Instantiate the Romanian spaCy language class imported above.
# NOTE(review): `spacy_ro` is not referenced again in the visible cells.
spacy_ro = Romanian()

In [5]:
# Change the working directory to a hard-coded Colab path.
# NOTE(review): the recorded output below shows this directory was not found;
# presumably Google Drive has to be mounted first - confirm before running.
cd /content/drive/My Drive/ranete_cristian

[Errno 2] No such file or directory: '/content/drive/My Drive/ranete_cristian'
/content


In [0]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
# Seed PyTorch's global RNG. Only torch is seeded here; `random` and numpy are not.
torch.manual_seed(1024)
device = torch.device("cuda" if use_cuda else "cpu")
# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True  

In [7]:
# Load the tab-separated train/validation files from the current working directory.
# `header=0` together with `names=` discards the first row of each file and
# labels the columns Id / Text / Label instead.
train_data = pd.read_csv('train_data.csv', sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])
validation_data = pd.read_csv('valid_data.csv', sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])

FileNotFoundError: ignored

In [0]:
def full_texts(texts):
  """Flatten an iterable of strings into one long string.

  Every entry has a single space inserted between its characters; the
  spaced-out entries are then glued together without any separator, and
  the result is prefixed with one space.

  Example: ``["ab", "cd"]`` becomes ``" a bc d"``.
  """
  spaced_entries = (" ".join(entry) for entry in texts)
  return " " + "".join(spaced_entries)

In [0]:
# Merge every training text into one string; in the visible cells it is only
# used to collect the character set for the vocabulary.
text = full_texts(train_data['Text'].tolist())

In [0]:
class Vocabulary:
    """
    Helper class that maps characters to unique indices and the other way around.

    The character "0" is always part of the vocabulary; ``my_collate`` uses its
    index as the padding value, so a literal digit zero in the text shares that
    index with padding.

    Indices are assigned in sorted character order. Enumerating a raw ``set``
    of strings gives an order that can change between interpreter sessions
    (string hash randomisation), which would silently invalidate weights saved
    in one session and loaded in another.
    """
    def __init__(self, text: str):
        """
        Build both lookup tables from the characters of ``text``.

        Args:
            text: any iterable of characters (typically one long string).
        """
        # "0" is the padding symbol and must exist even if the text lacks it
        characters_set = set(["0"])
        characters_set.update(text)

        # sorted() makes the char <-> index mapping reproducible across runs
        ordered_characters = sorted(characters_set)

        self.char_to_idx = {char: idx for (idx, char)
                            in enumerate(ordered_characters)}
        self.idx_to_char = {idx: char for (idx, char)
                            in enumerate(ordered_characters)}

    def size(self):
        """Return the number of distinct characters (padding symbol included)."""
        return len(self.char_to_idx)

    def __str__(self):
        """Return the string form of the character -> index dictionary."""
        return str(self.char_to_idx)

In [0]:
# Build the character vocabulary from the merged training text and show it.
vocab = Vocabulary(text)
print("Vocabulary size: ", vocab.size())
# Prints the full character -> index dictionary.
print("Vocabulary: \n", vocab)

In [0]:
def text_to_tensor(text: str, vocab: "Vocabulary") -> torch.LongTensor:
    """
    Convert a string to a 1-D tensor of character indices.
    e.g. "We have" -> [48, 13,  2, 66, 56, 31, 13]

    Args:
        text: the string to encode.
        vocab: object exposing a ``char_to_idx`` dictionary.

    Returns:
        An int64 tensor with one index per character of ``text``. The dtype is
        forced to ``torch.long`` so that an empty string does not produce a
        float tensor.

    Raises:
        KeyError: if a character is unknown and the vocabulary has no "0"
            entry to fall back on.

    Characters missing from the vocabulary (it is built from the training
    text only) are mapped to the index of "0", the symbol also used for
    padding, instead of aborting the whole batch.
    """
    char_to_idx = vocab.char_to_idx
    fallback_idx = char_to_idx.get("0")

    text_indices = []
    for c in text:
        idx = char_to_idx.get(c, fallback_idx)
        if idx is None:
            # no fallback available: keep the original failure mode
            raise KeyError(c)
        text_indices.append(idx)

    return torch.tensor(text_indices, dtype=torch.long)

In [0]:
# Collate function that turns a list of samples into one padded batch.
def my_collate(batch):
  """Pad variable-length index tensors to a common length and stack them.

  Each element of ``batch`` is read as ``item[0]`` (index tensor, assumed
  1-D) and ``item[1]`` (label). Shorter tensors are right-padded with the
  index of "0" taken from the module-level ``vocab``.

  Returns a list ``[data, sizes, target]``: the stacked padded tensors, the
  original lengths as a long tensor, and the stacked labels.
  """
  sequences = [sample[0] for sample in batch]
  sizes = torch.stack([torch.tensor(len(seq)) for seq in sequences], dim = 0).long()
  longest = torch.max(sizes, dim = 0)[0]

  padded = [
      F.pad(input=seq, pad=(0, longest - seq.shape[0]), mode='constant', value=vocab.char_to_idx['0'])
      for seq in sequences
  ]
  data = torch.stack(padded, dim = 0)

  labels = [torch.tensor(sample[1]) for sample in batch]
  target = torch.stack(labels, dim = 0)
  return [data, sizes, target]

In [0]:
class TextsDataset(Dataset):
    """Dataset over raw texts that yields character-index tensors.

    ``texts`` and ``labels`` are indexed by position. ``vocab`` is handed to
    ``text_to_tensor`` for encoding. ``max_length`` is stored as ``max_len``
    but is not used by any method of this class.
    """
    def __init__(self, texts, labels=None, vocab = None, max_length = 1004):
        self.X, self.y = texts, labels
        self.vocab, self.max_len = vocab, max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the encoded text, paired with its label when labels exist."""
        encoded = text_to_tensor(self.X[i], self.vocab)
        if self.y is None:
            return encoded
        return (encoded, self.y[i])

In [0]:
# Wrap the train/validation frames as datasets. Both share the vocabulary that
# was built from the training text only.
training_dataset = TextsDataset(train_data['Text'].tolist(), train_data["Label"].tolist(),vocab, 1004)
validing_dataset = TextsDataset(validation_data["Text"].tolist(), validation_data["Label"].tolist(),vocab, 1004)

In [0]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 128

In [0]:
# Batch both datasets with the padding collate function.
# NOTE(review): the validation loader also uses shuffle=True and drop_last=True,
# so the last incomplete batch is left out of the validation metrics.
trainloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=my_collate)
validloader = DataLoader(validing_dataset, batch_size=batch_size, shuffle=True, drop_last = True, collate_fn=my_collate)

In [0]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier with mean and max pooling over time.

    Pipeline: embedding -> (stacked) bidirectional LSTM -> mean-pool and
    max-pool over the time axis -> dense(64) + ReLU -> dropout -> dense ->
    sigmoid. The output is therefore a probability in [0, 1].

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of output units.
        embed_size: embedding dimension.
        hidden_nodes: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied after the dense layer.
    """

    def __init__(self, vocab_size, output_size, embed_size, hidden_nodes, n_layers, drop_prob=0.5):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_nodes, bidirectional=True, batch_first=True, num_layers = n_layers)
        # 2 directions x 2 pooled summaries (mean, max) = hidden_nodes * 4 features
        self.linear = nn.Linear(hidden_nodes*4 , 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)
        self.out = nn.Linear(64, output_size)


    def forward(self, x, text_sizes):
        """Compute class probabilities for a padded batch.

        Args:
            x: (batch, time) tensor of character indices, right-padded.
            text_sizes: true length of every sequence in ``x``.

        Returns:
            Tensor of shape (batch, output_size) with sigmoid activations.
        """
        lengths = text_sizes.flatten()
        h_embedding = self.embedding(x)
        # pack_padded_sequence needs the lengths on the CPU
        packed_embedded = nn.utils.rnn.pack_padded_sequence(h_embedding, lengths.cpu(), batch_first=True, enforce_sorted=False)
        h_lstm, _ = self.lstm(packed_embedded)
        # unpacked output is zero-padded up to the longest sequence of the batch
        h_lstm = nn.utils.rnn.pad_packed_sequence(h_lstm, batch_first=True)[0]

        # Pool over real time steps only. Pooling over the padded positions
        # would make a sample's prediction depend on how long the other
        # sequences in its batch happen to be.
        lengths = lengths.to(h_lstm.device)
        steps = torch.arange(h_lstm.size(1), device=h_lstm.device)
        is_padding = (steps.unsqueeze(0) >= lengths.unsqueeze(1)).unsqueeze(-1)

        # padded positions are zeros, so the plain sum already ignores them
        avg_pool = h_lstm.sum(dim=1) / lengths.unsqueeze(1).to(h_lstm.dtype)
        max_pool, _ = h_lstm.masked_fill(is_padding, float("-inf")).max(dim=1)

        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = torch.sigmoid(self.out(conc))
        return out

In [0]:
# Hyperparameters of the classifier.
size_of_vocab = vocab.size()
embedding_dim = 100
num_hidden_nodes = 64
# A single output unit: the model emits one sigmoid probability per text.
num_output_nodes = 1
num_layers = 2
# Passed as `drop_prob`; the model applies it after its dense layer.
dropout = 0.2

# Instantiate the model.
model_r = BiLSTM(size_of_vocab, num_output_nodes, embedding_dim, num_hidden_nodes, num_layers, dropout)

In [0]:
# Show the model architecture (layer-by-layer module summary).
print(model_r)

# Number of trainable parameters
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted. The count is taken
    from the ``model`` argument rather than from any module-level model.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
# Report the parameter count with thousands separators.
print(f'The model has {count_parameters(model_r):,} trainable parameters')

In [0]:
# Define the optimizer and the loss.
optimizer = optim.Adam(model_r.parameters(), lr = 0.005)
# BCELoss expects probabilities; the model already applies a sigmoid to its output.
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that match the labels after rounding.

    Args:
        preds: tensor of probabilities, any shape (including 0-dim).
        y: tensor of 0/1 labels with the same number of elements.

    Returns:
        A 0-dim float tensor with the accuracy.

    Both tensors are flattened first. This keeps the metric defined for a
    single squeezed prediction (0-dim tensor) and prevents a (B,) vs (B, 1)
    pair from silently broadcasting to a B x B comparison.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds.reshape(-1))
    # compare in the predictions' dtype so integer labels work
    targets = y.reshape(-1).to(rounded_preds.dtype)

    correct = (rounded_preds == targets).float()
    acc = correct.sum() / correct.numel()
    return acc
    
# Move the model and the loss module to the selected device (CUDA if available).
model_r = model_r.to(device)
criterion = criterion.to(device)

In [0]:
def train(model, train_iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: module called as ``model(inputs, text_lengths)``.
        train_iterator: iterable of ``(inputs, text_lengths, labels)`` batches
            that also supports ``len()``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking ``(predictions, float_labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the number of batches.

    Inputs and labels are moved to the module-level ``device``; the lengths
    are passed through unchanged.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    # maximum gradient norm used for clipping
    clip = 3

    #set the model in training phase
    model.train()
    for inputs, text_lengths, labels in train_iterator:
        inputs, labels = inputs.to(device), labels.to(device)

        #reset the gradients left over from the previous batch
        optimizer.zero_grad()

        # Drop only the trailing output dimension: a bare squeeze() would
        # also remove the batch axis when a batch holds a single sample,
        # leaving predictions and labels with different shapes.
        output = model(inputs, text_lengths).squeeze(-1)

        #compute the loss
        loss = criterion(output, labels.float())

        #compute the binary accuracy
        acc = binary_accuracy(output, labels)

        #backpropagate the loss, then clip the gradient norm
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        #update the weights
        optimizer.step()

        #accumulate loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(train_iterator), epoch_acc / len(train_iterator)

In [0]:
def evaluate(model, eval_iterator, criterion):
    """Evaluate the model without gradient tracking.

    Args:
        model: module called as ``model(inputs, text_lengths)``.
        eval_iterator: iterable of ``(inputs, text_lengths, labels)`` batches
            that also supports ``len()``.
        criterion: loss taking ``(predictions, float_labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the number of batches.

    Inputs and labels are moved to the module-level ``device``.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #switch to evaluation mode (disables dropout)
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for inputs, text_lengths, labels in eval_iterator:

            inputs, labels = inputs.to(device), labels.to(device)

            # Drop only the trailing output dimension so that a batch with a
            # single sample keeps its batch axis and still matches the labels.
            output = model(inputs, text_lengths).squeeze(-1)

            #compute loss and accuracy
            test_loss = criterion(output, labels.float())
            acc = binary_accuracy(output, labels)

            #keep track of loss and accuracy
            epoch_loss += test_loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(eval_iterator), epoch_acc / len(eval_iterator)

In [0]:
# Number of passes over the training loader.
N_EPOCHS = 10
# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # Train for one epoch.
    train_loss, train_acc = train(model_r, trainloader, optimizer, criterion)
    
    print("eval")
    # Evaluate on the validation loader.
    valid_loss, valid_acc = evaluate(model_r, validloader, criterion)
     
    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_r.state_dict(), 'saved_weights_1.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [0]:
# Restore the best checkpoint. `map_location` remaps the stored tensors onto the
# current device, so weights saved on a GPU runtime also load on a CPU-only one.
model_r.load_state_dict(torch.load("saved_weights_1.pt", map_location=device))

In [0]:
# Evaluate the restored model on the target set.
# NOTE(review): `target_iterator` is not defined in any visible cell; this line
# raises NameError unless it is created elsewhere - confirm.
target_loss, target_acc = evaluate(model_r, target_iterator, criterion)