<a href="https://colab.research.google.com/github/Ravio1i/ki-lab/blob/master/3_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://github.com/tblock/10kGNAD/raw/master/train.csv train.csv
!wget https://github.com/tblock/10kGNAD/raw/master/test.csv test.csv

--2021-04-26 15:06:53--  https://github.com/tblock/10kGNAD/raw/master/train.csv
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv [following]
--2021-04-26 15:06:53--  https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24405789 (23M) [text/plain]
Saving to: ‘train.csv.1’


2021-04-26 15:06:54 (47.2 MB/s) - ‘train.csv.1’ saved [24405789/24405789]

--2021-04-26 15:06:54--  http://train.csv/
Resolving train.csv (train.csv)... failed: Name or service not known.
wget: unable to resolve host address ‘train.csv’

# Preprocess

In [2]:
import string
import csv

In [6]:
valid_chars = string.ascii_letters + "ÄÖÜäöüß–" + string.punctuation + string.digits + string.whitespace 

def prepare(csvfile: str) -> list:
    data = []
    labels = []
    with open(csvfile, "r", encoding='utf-8',) as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if row[0] not in ["Sport", "Wirtschaft"]:
                continue
            if not all(c in valid_chars for c in row[1]):
                continue
            labels.append(row[0])
            data.append(" ".join(row[1:]))
    return data, labels

train_data, train_labels = prepare("train.csv")
test_data, test_labels = prepare("test.csv")

print(len(train_data))
print(len(test_data))

2163
246


# Text Preparation

Divide the valid training data lines into tokens using SpaCy.

In [7]:
!python -m spacy download de
import spacy
from collections import Counter
nlp = spacy.load("de")

train_token = [nlp.tokenizer(x) for x in train_data]
test_token = [nlp.tokenizer(x) for x in test_data]

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


Determine the 5000 most common tokens in the training data.

In [9]:
doc = nlp.tokenizer(' '.join(train_data))
words = [token.text for token in doc 
         if not token.is_stop and not token.is_punct and token.text != ' ']
word_freq = Counter(words)
common_words = word_freq.most_common(5000)

print(common_words[:10])

[('Prozent', 2195), ('Euro', 2080), ('Österreich', 1117), ('Wien', 1063), ('Millionen', 874), ('STANDARD', 680), ('Austria', 587), ('Milliarden', 573), ('Unternehmen', 567), ('Spiel', 549)]


For those tokens, create the word_to_ix dictionary

In [10]:
tag_to_ix = {'Wirtschaft': 0, 'Sport': 1}
word_to_ix = {}
for word in common_words:
    word_to_ix[word[0]] = len(word_to_ix)

print(word_to_ix)
print(tag_to_ix)

{'Prozent': 0, 'Euro': 1, 'Österreich': 2, 'Wien': 3, 'Millionen': 4, 'STANDARD': 5, 'Austria': 6, 'Milliarden': 7, 'Unternehmen': 8, 'Spiel': 9, 'Bank': 10, 'Deutschland': 11, 'steht': 12, 'Sonntag': 13, 'Salzburg': 14, 'Saison': 15, 'USA': 16, '2014': 17, 'Platz': 18, 'FC': 19, '2015': 20, 'Sieg': 21, 'zuletzt': 22, 'Rapid': 23, 'Punkte': 24, 'laut': 25, '2016': 26, 'Minute': 27, 'Samstag': 28, 'Trainer': 29, 'Montag': 30, 'Donnerstag': 31, 'Mannschaft': 32, 'Freitag': 33, 'Mittwoch': 34, 'Wiener': 35, 'Dienstag': 36, '1': 37, 'liegt': 38, 'APA': 39, 'Minuten': 40, 'Geld': 41, 'Europa': 42, 'Tore': 43, 'geben': 44, 'Mio.': 45, 'erklärte': 46, 'Land': 47, 'bzw.': 48, 'Regierung': 49, 'letzten': 50, 'Banken': 51, '20': 52, 'Team': 53, 'fast': 54, 'Dollar': 55, 'Runde': 56, 'derzeit': 57, 'Menschen': 58, 'Woche': 59, 'Griechenland': 60, 'knapp': 61, 'Spieler': 62, '1:0': 63, 'EU': 64, '15': 65, 'Tor': 66, 'stehen': 67, 'League': 68, 'deutlich': 69, 'deutschen': 70, 'sieht': 71, 'Folge':

# LSTM

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from time import time

embedding_dim = 128 #@param {type:"integer"}
hidden_dim = 256 #@param {type:"integer"}
lr = 0.1 #@param {type:"number"}

In [13]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[str(token)] for token in seq if str(token) in to_ix]
    return torch.tensor(idxs, dtype=torch.long)

In [15]:
def test():
    correct = 0
    total = 0
    with torch.no_grad():
        for i, sentence in enumerate(test_token):
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[test_labels[i]]], dtype=torch.long)

            tag_scores = model(sentence_in)

            total += 1
            target = targets[0].item()
            # [score1=Wirtschaft, score2=Sport]
            # tag_scores[-1][target].item() [-1000, -0.1]
            # tag_scores[-1][1-target].item()
            if(tag_scores[-1][target].item() > tag_scores[-1][1-target].item()):
                correct += 1
        return (100 * correct / total)

In [17]:
def test_sample():
    for i in range(10):
        with torch.no_grad():
            print(train_labels[i])
            inputs = prepare_sequence(train_token[i], word_to_ix)
            tag_scores = model(inputs)
            print(tag_scores)

In [18]:
def train():
    correct = 0
    total = 0
    for i, sentence in enumerate(train_token):
        model.zero_grad()

        # Get our inputs ready for the network, that is, turn them into
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[train_labels[i]]], dtype=torch.long)
        if len(sentence_in) < 1:
            continue
        # Run our forward pass.
        tag_scores = model(sentence_in)
        
        # Check if bigger last tag_score is the correct label 
        total += 1
        target = targets[0].item()
        if(tag_scores[-1][target].item() > tag_scores[-1][1-target].item()):
            correct += 1

        # Compute the loss, gradients, and update the parameters by
        loss = loss_function(tag_scores[-1:], targets)
        loss.backward()
        optimizer.step()

    return (100 * correct / total), loss.item()


In [19]:
device = torch.device("cuda:0")
model = LSTMTagger(embedding_dim, hidden_dim, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [21]:
time_start = time()
n_epochs = 5
for epoch in range(n_epochs):
    train_acc, train_loss = train()
    test_acc = test()
    time_compl = (time()-time_start)/60
    print("Epoch {} | Training Time (in minutes) = {}".format(epoch+1, time_compl))
    print("     Acc: {}%(train) | Loss: {}".format(train_acc, train_loss))
    print("     Acc: {}%(test)".format(test_acc))


Epoch 1 | Training Time (in minutes) = 3.5178045789400736
     Acc: 61.702127659574465%(train) | Loss: 0.06582174450159073
     Acc: 58.53658536585366%(test)
Epoch 2 | Training Time (in minutes) = 6.390918668111166
     Acc: 77.33580018501388%(train) | Loss: 0.0051244106143713
     Acc: 56.50406504065041%(test)
Epoch 3 | Training Time (in minutes) = 9.228682001431784
     Acc: 88.25161887141536%(train) | Loss: 0.12573286890983582
     Acc: 56.09756097560975%(test)
Epoch 4 | Training Time (in minutes) = 12.080551358064016
     Acc: 85.8926919518964%(train) | Loss: 0.016221914440393448
     Acc: 82.11382113821138%(test)
Epoch 5 | Training Time (in minutes) = 14.901530996958416
     Acc: 94.63459759481961%(train) | Loss: 8.702239938429557e-06
     Acc: 95.9349593495935%(test)
