In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import string
import  random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchtext import data
from sklearn.metrics import mean_absolute_error as MAE

In [None]:
# Mount Google Drive under /content/drive so the notebook can read the data
# files and checkpoints stored there (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [None]:
# Seed every random number generator the notebook imports so that data
# shuffling and weight initialisation are repeatable across runs.
SEED = 1024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Load the training and validation splits from CSV (paths are relative to the
# current working directory; the file names suggest emojis were already removed).
train_data = pd.read_csv('./csv_files/training_no_emoji.csv')
validation_data = pd.read_csv('./csv_files/validation_no_emoji.csv') 

In [None]:
# Alphabet kept by filter_texts: space, the letters a-z and the German letters ö, ä, ü, ß.
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
def filter_texts(texts):
  """
  Lower-case every text and drop all characters outside `allowed_charachters`.

  Args:
    texts: iterable of strings. Entries that are not strings (e.g. the NaN
      pandas uses for an empty CSV cell) are mapped to the empty string
      instead of raising.

  Returns:
    A list with one filtered string per input entry, in the same order.
  """
  # Set membership is O(1); it is rebuilt per call so that later changes to the
  # module-level list are still honoured.
  allowed = frozenset(allowed_charachters)
  filtered = []
  for text in texts:
    if not isinstance(text, str):
      filtered.append('')
      continue
    filtered.append(''.join(ch for ch in text.lower() if ch in allowed))
  return filtered

In [None]:
# Split each frame into the model input (the 'Text' column) and the regression
# target (the 'Long' column — presumably a longitude; confirm against the data).
data_train, values_train = train_data['Text'], train_data['Long']
data_valid, values_valid = validation_data['Text'], validation_data['Long']

In [None]:
# Normalise both splits to the allowed alphabet. Note that the names are
# rebound from pandas Series to plain Python lists of strings here.
data_train = filter_texts(data_train)
data_valid = filter_texts(data_valid)

In [None]:
# train_data = pd.read_csv('train_data.csv', sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])
# validation_data = pd.read_csv('valid_data.csv', sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])

In [None]:
def full_texts(texts):
  """
  Concatenate all texts into a single string, separated by single spaces.

  The result always starts with a space, so the space character is present
  even when `texts` is empty.

  Args:
    texts: iterable of strings.

  Returns:
    One string containing every input text.
  """
  # str.join over the whole iterable runs in linear time and joins *texts*;
  # joining each individual text would interleave spaces between its characters.
  return " " + " ".join(texts)

In [None]:
# Single string holding the whole training corpus; used to build the vocabulary.
text = full_texts(data_train)

In [None]:
class Vocabulary:
    """
    Helper class that maps characters to unique indices and the other way around.

    Indices are assigned in sorted character order, so the same text produces
    the same mapping in every Python session. This matters when weights
    (in particular the embedding table) are saved and loaded again later.

    Attributes:
        char_to_idx: dict mapping each character to its integer index.
        idx_to_char: dict mapping each integer index back to its character.
    """
    def __init__(self, text: str):
        # "0" is the padding character used to fill up shorter sequences
        # in a mini-batch; it is always part of the vocabulary.
        # create a set out of all characters
        characters_set = set(["0"])
        characters_set.update(text)

        # Sort before enumerating: the iteration order of a set of strings is
        # not stable across interpreter runs, which would silently change the
        # index assigned to each character.
        characters = sorted(characters_set)

        #create a dictionary for characters
        self.char_to_idx = {char: idx for (idx, char) in enumerate(characters)}
        self.idx_to_char = {idx: char for (idx, char) in enumerate(characters)}

    def size(self):
        """Return the number of distinct characters, padding character included."""
        return len(self.char_to_idx)

    def __str__(self):
        """Return the character-to-index mapping rendered as a string."""
        return str(self.char_to_idx)

In [None]:
# Build the character vocabulary from the training corpus and show its size
# and the character -> index mapping.
vocab = Vocabulary(text)
print("Vocabulary size: ", vocab.size())
print("Vocabulary: \n", vocab)

Vocabulary size:  32
Vocabulary: 
 {'d': 0, 't': 1, 'v': 2, 'n': 3, 'ü': 4, 'ä': 5, 'o': 6, 'e': 7, '0': 8, 'w': 9, 'j': 10, 'z': 11, 'b': 12, 'c': 13, 'u': 14, 'i': 15, 'q': 16, 'p': 17, 'ö': 18, 'l': 19, 'h': 20, 'f': 21, 'ß': 22, 'x': 23, 's': 24, 'g': 25, 'a': 26, 'k': 27, 'r': 28, 'y': 29, 'm': 30, ' ': 31}


In [None]:
def text_to_tensor(text: str, vocab: "Vocabulary") -> torch.LongTensor:
    """
    Convert a string to a 1-D tensor with the corresponding character indices,
    e.g. "We have" -> [48, 13, 2, 66, 56, 31, 13] (actual values depend on `vocab`).

    Args:
        text: string to encode.
        vocab: object exposing a `char_to_idx` mapping from character to index.

    Returns:
        An int64 tensor of shape (len(text),).

    Raises:
        KeyError: if `text` contains a character missing from `vocab.char_to_idx`.
    """
    text_indices = [vocab.char_to_idx[c] for c in text]

    # Explicit dtype: for an empty text torch.tensor([]) would default to
    # float32, which cannot be used as embedding indices.
    return torch.tensor(text_indices, dtype=torch.long)

In [None]:
# Collate function that prepares padded mini-batches for the DataLoader.
def my_collate(batch, max_len=500):
  """
  Turn a list of (encoded_text, target) samples into batch tensors.

  Args:
    batch: list of samples; item[0] is a 1-D index tensor, item[1] the target value.
    max_len: fixed length every sequence is padded to (default 500).
      Longer sequences are cut off at the end, because a negative pad
      amount passed to F.pad crops instead of padding.

  Returns:
    [data, sizes, target] where
      data   is the stacked, padded index tensor of shape (len(batch), max_len),
      sizes  is an int64 tensor with the number of real (non-padding) positions
             per sample, capped at max_len so it never exceeds the padded length,
      target is the stacked tensor of targets.
  """
  # The padding index is read from the module-level `vocab`.
  pad_idx = vocab.char_to_idx['0']
  sizes = torch.tensor([min(len(item[0]), max_len) for item in batch], dtype=torch.long)
  new_data = [
      F.pad(input=item[0], pad=(0, max_len - item[0].shape[0]), mode='constant', value=pad_idx)
      for item in batch
  ]
  data = torch.stack(new_data, dim = 0)
  target = torch.stack([torch.tensor(item[1]) for item in batch], dim = 0)
  return [data, sizes, target]

In [None]:
class TextsDataset(Dataset):
    """
    Dataset over raw texts that encodes each text to an index tensor on access.

    Args:
        texts: indexable collection of strings.
        labels: optional indexable collection of targets aligned with `texts`.
        vocab: vocabulary handed to `text_to_tensor` for encoding.
        max_length: stored as `max_len`; not used by the dataset itself.
    """

    def __init__(self, texts, labels=None, vocab = None, max_length = 500):
        self.X, self.y = texts, labels
        self.vocab, self.max_len = vocab, max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the encoded text, paired with its label when labels exist."""
        encoded = text_to_tensor(self.X[i], self.vocab)
        if self.y is None:
            return encoded
        return (encoded, self.y[i])

In [None]:
# Wrap the filtered texts and their targets in Dataset objects; both splits
# share the vocabulary that was built from the training corpus.
training_dataset = TextsDataset(data_train, values_train, vocab, 500)
validing_dataset = TextsDataset(data_valid, values_valid, vocab, 500)

In [None]:
# Number of samples per mini-batch, used by both data loaders.
batch_size = 128

In [None]:
# Training batches are reshuffled every epoch and the incomplete last batch is dropped.
trainloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last = True, collate_fn=my_collate)
# Validation must score the same, complete set of samples on every pass.
# Shuffling combined with drop_last would discard a different random remainder
# each epoch and make validation losses incomparable between epochs.
validloader = DataLoader(validing_dataset, batch_size=batch_size, shuffle=False, drop_last = False, collate_fn=my_collate)

In [None]:
class BiLSTM(nn.Module):
    """
    Character-level bidirectional LSTM followed by a small feed-forward head.

    The LSTM outputs are reduced over the sequence dimension with both mean and
    max pooling; the two pooled vectors are concatenated and passed through
    Linear -> ReLU -> Dropout -> Linear.

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of output units of the final linear layer.
        embed_size: embedding dimension.
        hidden_nodes: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size, output_size, embed_size, hidden_nodes, n_layers, drop_prob=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_nodes,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions per pooled vector, two pooled vectors (mean and max).
        pooled_features = 2 * (2 * hidden_nodes)
        self.linear = nn.Linear(pooled_features, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)
        self.out = nn.Linear(64, output_size)

    def forward(self, x, text_sizes):
        """
        Run the network on a batch of index sequences.

        `text_sizes` is accepted for interface compatibility but is not used:
        sequences are not packed, so every position of `x` (padding included)
        goes through the LSTM and takes part in the pooling.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        mean_over_time = lstm_out.mean(dim=1)
        max_over_time = lstm_out.max(dim=1).values
        pooled = torch.cat((mean_over_time, max_over_time), dim=1)
        hidden = self.dropout(self.relu(self.linear(pooled)))
        return self.out(hidden)

In [None]:
# Model hyperparameters.
size_of_vocab = vocab.size()   # one embedding row per vocabulary character
embedding_dim = 128
num_hidden_nodes = 64          # LSTM hidden size per direction
num_output_nodes = 1           # single output value per text
num_layers = 2
dropout = 0.2

#instantiate the model
model_r = BiLSTM(size_of_vocab, num_output_nodes, embedding_dim, num_hidden_nodes, num_layers, dropout)

In [None]:
# Print the layer-by-layer architecture of the model
print(model_r)

# Number of trainable parameters
def count_parameters(model):
    """
    Count the trainable parameters of `model`.

    Args:
        model: a torch.nn.Module.

    Returns:
        Total number of elements over all parameters with requires_grad=True.
    """
    # Count on the argument itself, not on a module-level model instance.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model_r):,} trainable parameters')

BiLSTM(
  (embedding): Embedding(32, 128)
  (lstm): LSTM(128, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (out): Linear(in_features=64, out_features=1, bias=True)
)
The model has 219,265 trainable parameters


In [None]:
#push to cuda if available
# The model is moved first so that the optimizer is constructed from parameters
# that already live on the target device (the order recommended by torch.optim).
model_r = model_r.to(device)

#define optimizer and loss
optimizer = optim.Adam(model_r.parameters(), lr = 0.005)
# L1 loss, i.e. mean absolute error between prediction and target.
criterion = nn.L1Loss()
criterion = criterion.to(device)

In [None]:
def train(model, train_iterator, optimizer, criterion, clip=3):
    """
    Train `model` for one pass over `train_iterator`.

    Args:
        model: module called as model(inputs, text_lengths).
        train_iterator: sized iterable yielding (inputs, text_lengths, labels).
        optimizer: optimizer that updates the parameters of `model`.
        criterion: loss called as criterion(prediction, target).
        clip: maximum gradient norm applied before every optimizer step.

    Returns:
        The mean of the per-batch losses as a Python float.
    """
    #initialize every epoch
    epoch_loss = 0

    # Batches are moved to the device the model's parameters live on.
    device = next(model.parameters()).device

    #set the model in training phase
    model.train()
    for inputs, text_lengths, labels in train_iterator:
        inputs, labels = inputs.to(device), labels.to(device)

        #resets the gradients after every batch
        optimizer.zero_grad()

        output = model(inputs, text_lengths)

        #compute the loss
        # squeeze(-1) removes only the trailing output dimension, so a batch
        # holding a single sample keeps its batch dimension.
        loss = criterion(output.squeeze(-1), labels.float())

        #backpropagate the loss and compute the gradients
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        #update the weights
        optimizer.step()

        #accumulate the batch loss
        epoch_loss += loss.item()

    return epoch_loss / len(train_iterator)

In [None]:
def evaluate(model, eval_iterator, criterion):
    """
    Compute the loss of `model` over `eval_iterator` without updating weights.

    Args:
        model: module called as model(inputs, text_lengths).
        eval_iterator: iterable yielding (inputs, text_lengths, labels).
        criterion: loss called as criterion(prediction, target).

    Returns:
        The sample-weighted mean of the per-batch losses as a Python float.
        With equally sized batches this equals the plain mean of the batch
        losses; with a smaller final batch it avoids over-weighting that batch.

    Raises:
        ZeroDivisionError: if `eval_iterator` yields no samples.
    """
    #initialize every epoch
    weighted_loss = 0.0
    n_samples = 0

    # Batches are moved to the device the model's parameters live on.
    device = next(model.parameters()).device

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for inputs, text_lengths, labels in eval_iterator:
            inputs, labels = inputs.to(device), labels.to(device)

            output = model(inputs, text_lengths)

            #compute the loss
            # squeeze(-1) removes only the trailing output dimension, so a
            # batch holding a single sample keeps its batch dimension.
            test_loss = criterion(output.squeeze(-1), labels.float())

            #keep track of the loss, weighted by the batch size
            batch_n = labels.size(0)
            weighted_loss += test_loss.item() * batch_n
            n_samples += batch_n

    return weighted_loss / n_samples

In [None]:
# Resume from a previously saved checkpoint. map_location remaps the stored
# tensors to the active device, so weights saved on a GPU also load on a
# CPU-only runtime.
model_r.load_state_dict(torch.load("saved_weights_1.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Main training loop: one training pass and one validation pass per epoch.
N_EPOCHS = 50
# NOTE(review): hard-coded threshold, presumably the best validation loss of an
# earlier run — confirm. A "best" checkpoint is only written when the
# validation loss drops below this value.
best_valid_loss = 0.654

for epoch in range(N_EPOCHS):

    print("Epoch: ", epoch)
    #train the model
    train_loss = train(model_r, trainloader, optimizer, criterion)

    #evaluate the model
    valid_loss = evaluate(model_r, validloader, criterion)

    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_r.state_dict(), 'saved_weights_best_valid_long.pt')
    # The latest weights are written after every epoch, regardless of the loss.
    torch.save(model_r.state_dict(), 'saved_weights_long.pt') 
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch:  0
	Train Loss: 0.966
	 Val. Loss: 0.779
Epoch:  1
	Train Loss: 0.804
	 Val. Loss: 0.739
Epoch:  2
	Train Loss: 0.769
	 Val. Loss: 0.717
Epoch:  3
	Train Loss: 0.734
	 Val. Loss: 0.699
Epoch:  4
	Train Loss: 0.710
	 Val. Loss: 0.686
Epoch:  5
	Train Loss: 0.688
	 Val. Loss: 0.680
Epoch:  6
	Train Loss: 0.669
	 Val. Loss: 0.683
Epoch:  7
	Train Loss: 0.656
	 Val. Loss: 0.663
Epoch:  8
	Train Loss: 0.645
	 Val. Loss: 0.678
Epoch:  9
	Train Loss: 0.632
	 Val. Loss: 0.677
Epoch:  10
	Train Loss: 0.618
	 Val. Loss: 0.667
Epoch:  11
	Train Loss: 0.608
	 Val. Loss: 0.690
Epoch:  12
	Train Loss: 0.596
	 Val. Loss: 0.669
Epoch:  13
	Train Loss: 0.597
	 Val. Loss: 0.657
Epoch:  14
	Train Loss: 0.590
	 Val. Loss: 0.680
Epoch:  15
	Train Loss: 0.584
	 Val. Loss: 0.691
Epoch:  16
	Train Loss: 0.572
	 Val. Loss: 0.679
Epoch:  17
	Train Loss: 0.567
	 Val. Loss: 0.654
Epoch:  18
	Train Loss: 0.560
	 Val. Loss: 0.708
Epoch:  19
	Train Loss: 0.548
	 Val. Loss: 0.668
Epoch:  20
	Train Loss: 0.536
