In [32]:
import numpy as np
import nltk
import pandas as pd
import copy
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import string
import  random
from spacy.lang.ro import Romanian
from spacy.lang.ro.stop_words import STOP_WORDS

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchtext import data
from sklearn.metrics import mean_absolute_error as MAE

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [4]:
use_cuda = torch.cuda.is_available()
torch.manual_seed(1024)
device = torch.device("cuda" if use_cuda else "cpu")
torch.backends.cudnn.deterministic = True  

In [5]:
train_data = pd.read_csv('./csv_files/training_no_emoji.csv') #, sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])
validation_data = pd.read_csv('./csv_files/validation_no_emoji.csv') # pd.read_csv('valid_data.csv', sep='\t', encoding='utf-8', lineterminator='\n', header = 0, names=['Id', 'Text', 'Label'])

In [6]:
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
def filter_texts(texts):
  return [''.join([ch for ch in text.lower() if ch in allowed_charachters]) for text in texts]

In [8]:
data_train, values_train = train_data['Text'], train_data['Long']
data_valid, values_valid = validation_data['Text'], validation_data['Long']

In [9]:
data_train = filter_texts(data_train)
data_valid = filter_texts(data_valid)

In [10]:
def full_texts(texts):
  text = " "
  for it in texts:
    text += " ".join(it)
  return text

In [13]:
text = full_texts(data_train)

In [14]:
class Vocabulary:
    """
    Helper class that maps characters to unique indices and the other way around
    """
    def __init__(self, text: str):
        # PAD is a special character for padding shorter sequences 
        # in a mini-batch
        # create a set out of all characters
        characters_set = set(["0"]) 
        characters_set.update(text)
        
        #create a dictionary for characters
        self.char_to_idx = {char:idx for (idx, char) 
                            in enumerate(characters_set)}
        self.idx_to_char = {idx:char for (idx, char) 
                            in enumerate(characters_set)}
   
    def size(self):
        return len(self.char_to_idx)
      
    def __str__(self):
        return str(self.char_to_idx)

In [15]:
vocab = Vocabulary(text)
print("Vocabulary size: ", vocab.size())
print("Vocabulary: \n", vocab)

Vocabulary size:  32
Vocabulary: 
 {'a': 0, 'p': 1, 'q': 2, 'ß': 3, 'ö': 4, ' ': 5, 'x': 6, 't': 7, 'y': 8, 'n': 9, 'j': 10, 'i': 11, 'ü': 12, 'u': 13, 'w': 14, 'o': 15, 'c': 16, 'm': 17, 's': 18, 'l': 19, 'v': 20, 'ä': 21, 'k': 22, 'b': 23, 'r': 24, 'h': 25, '0': 26, 'd': 27, 'g': 28, 'e': 29, 'f': 30, 'z': 31}


In [16]:
def text_to_tensor(text: str, vocab: Vocabulary) -> torch.LongTensor:
    """
    Convert a string to a Tensor with corresponding character indices
    e.g. "We have" -> [48, 13,  2, 66, 56, 31, 13 
    """
    text_indices = [vocab.char_to_idx[c] for c in text]
  
    return torch.tensor(text_indices)

In [39]:
def my_collate(batch):
  new_data = []
  for item in batch:
    new_data.append(F.pad(input=item[0], pad=(0, 500 - item[0].shape[0]), mode='constant', value=vocab.char_to_idx['0']))
  data = torch.stack(new_data, dim = 0)
  target = torch.stack([torch.tensor(item[1]) for item in batch], dim = 0)
  return [data, target]

In [40]:
class TextsDataset(Dataset):
    def __init__(self, texts, labels=None, vocab = None, max_length = 1004):
        self.X = texts
        self.y = labels
        self.vocab = vocab
        self.max_len = max_length
         
    def __len__(self):
        return (len(self.X))
    
    def __getitem__(self, i):
        data = self.X[i]
        data = text_to_tensor(data, self.vocab)
        if self.y is not None:
            y = self.y[i]
            return (data, y)
        else:
            return data

In [41]:
training_dataset = TextsDataset(data_train, values_train, vocab, 1004)
validing_dataset = TextsDataset(data_valid, values_valid, vocab, 1004)

In [42]:
batch_size = 128

In [43]:
trainloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last = True, collate_fn=my_collate)
validloader = DataLoader(validing_dataset, batch_size=batch_size, shuffle=True, drop_last = True, collate_fn=my_collate) 

In [51]:
class CNN_Text(nn.Module):
    
    def __init__(self, vocab_size, embed_size):
        super(CNN_Text, self).__init__()
        filter_sizes = [1,2,3,5]
        num_filters = 36
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters, 1)


    def forward(self, x):
        x = self.embedding(x)  
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = torch.cat(x, 1)
        x = self.dropout(x)  
        x = self.fc1(x)  
        return x

In [52]:
size_of_vocab = vocab.size()
embedding_dim = 128

#instantiate the model
model_r =CNN_Text(size_of_vocab, embedding_dim)

In [53]:
#architecture
print(model_r)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model_r.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model_r):,} trainable parameters')

CNN_Text(
  (embedding): Embedding(32, 128)
  (convs1): ModuleList(
    (0): Conv2d(1, 36, kernel_size=(1, 128), stride=(1, 1))
    (1): Conv2d(1, 36, kernel_size=(2, 128), stride=(1, 1))
    (2): Conv2d(1, 36, kernel_size=(3, 128), stride=(1, 1))
    (3): Conv2d(1, 36, kernel_size=(5, 128), stride=(1, 1))
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=144, out_features=1, bias=True)
)
The model has 55,073 trainable parameters


In [54]:
#define optimizer and loss
optimizer = optim.Adam(model_r.parameters(), lr = 0.005)
criterion = nn.L1Loss()
    
#push to cuda if available
model_r = model_r.to(device)
criterion = criterion.to(device)

In [59]:
def train(model, train_iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    clip = 3
    
    #set the model in training phase
    model.train()  
    for inputs, labels in train_iterator:
        #print(it)
        inputs, labels = inputs.to(device), labels.to(device)
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        #print("retrive")
        output = model(inputs) 
        
        #compute the loss
        #print("loss")
        loss = criterion(output.squeeze(), labels.float())  
        
       
        
        #print("back")
        #backpropage the loss and compute the gradients
        loss.backward() 
        nn.utils.clip_grad_norm_(model.parameters(), clip)       
        
        #print("optim")
        #update the weights
        optimizer.step()      

        #loss and accuracy
        epoch_loss += loss.item()  
     
    return epoch_loss / len(train_iterator)

In [60]:
def evaluate(model, eval_iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    clip = 2

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for inputs, labels in eval_iterator:

            #retrieve text and no. of words
            inputs, labels = inputs.to(device), labels.to(device)
            
            #convert to 1d tensor
            output= model(inputs)
            
            #compute loss and accuracy
            test_loss = criterion(output.squeeze(), labels.float())
            
            #keep track of loss and accuracy
            epoch_loss += test_loss.item()
        
    return epoch_loss / len(eval_iterator)

In [61]:
N_EPOCHS = 50
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    print("Epoch: ", epoch)
    #train the model
    train_loss = train(model_r, trainloader, optimizer, criterion)
    
    #evaluate the model
    valid_loss = evaluate(model_r, validloader, criterion)
     
   #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_r.state_dict(), 'saved_weights_3.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch:  0
	Train Loss: 1.273
	 Val. Loss: 0.887
Epoch:  1
	Train Loss: 1.118
	 Val. Loss: 0.833
Epoch:  2
	Train Loss: 1.088
	 Val. Loss: 0.832
Epoch:  3
	Train Loss: 1.074
	 Val. Loss: 0.852
Epoch:  4
	Train Loss: 1.046
	 Val. Loss: 0.819
Epoch:  5
	Train Loss: 1.042
	 Val. Loss: 1.243
Epoch:  6
	Train Loss: 1.058
	 Val. Loss: 0.809
Epoch:  7
	Train Loss: 1.039
	 Val. Loss: 0.825
Epoch:  8
	Train Loss: 0.999
	 Val. Loss: 1.020
Epoch:  9
	Train Loss: 1.007
	 Val. Loss: 0.757
Epoch:  10
	Train Loss: 0.982
	 Val. Loss: 1.074
Epoch:  11
	Train Loss: 0.953
	 Val. Loss: 0.743
Epoch:  12
	Train Loss: 0.954
	 Val. Loss: 1.122
Epoch:  13
	Train Loss: 0.959
	 Val. Loss: 0.778
Epoch:  14
	Train Loss: 0.969
	 Val. Loss: 0.956
Epoch:  15
	Train Loss: 0.936
	 Val. Loss: 0.836
Epoch:  16
	Train Loss: 0.935
	 Val. Loss: 0.724
Epoch:  17
	Train Loss: 0.918
	 Val. Loss: 0.734
Epoch:  18
	Train Loss: 0.922
	 Val. Loss: 0.751
Epoch:  19
	Train Loss: 0.889
	 Val. Loss: 0.715
Epoch:  20
	Train Loss: 0.893


In [None]:
model_r.load_state_dict(torch.load("saved_weights_3.pt"))