<a href="https://colab.research.google.com/github/Ravio1i/ki-lab/blob/master/3_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!wget https://github.com/tblock/10kGNAD/raw/master/train.csv train.csv
!wget https://github.com/tblock/10kGNAD/raw/master/test.csv test.csv

--2021-04-23 07:36:19--  https://github.com/tblock/10kGNAD/raw/master/train.csv
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv [following]
--2021-04-23 07:36:20--  https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24405789 (23M) [text/plain]
Saving to: ‘train.csv.2’


2021-04-23 07:36:20 (291 MB/s) - ‘train.csv.2’ saved [24405789/24405789]

--2021-04-23 07:36:20--  http://train.csv/
Resolving train.csv (train.csv)... failed: Name or service not known.
wget: unable to resolve host address ‘train.csv’


# Preprocess

In [48]:
import string
import csv

In [53]:
# Characters an article may contain to be kept: ASCII letters, German umlauts and
# sharp s, the en dash, ASCII punctuation, digits and whitespace.
valid_chars = string.ascii_letters + "ÄÖÜäöüß–" + string.punctuation + string.digits + string.whitespace

def prepare(csvfile: str, categories=("Sport", "Wirtschaft")) -> tuple:
    """Load article texts and labels for the selected categories from a CSV file.

    Each row is read as ``label;text`` with ``'`` as the quote character.
    A row is kept only if its label is in ``categories`` and every character
    of its text occurs in ``valid_chars``. Rows with fewer than two columns
    (e.g. blank lines) are skipped instead of raising ``IndexError``.

    Args:
        csvfile: Path of the UTF-8 encoded CSV file to read.
        categories: Labels to keep. Defaults to the two classes used in this
            notebook.

    Returns:
        A pair ``(data, labels)`` of parallel lists: the kept article texts and
        their labels, in file order.
    """
    # Set membership is O(1) per character; the result is the same as testing
    # against the string itself.
    allowed = set(valid_chars)
    data = []
    labels = []
    # newline="" is what the csv module asks for so that it can handle line
    # endings inside quoted fields itself.
    with open(csvfile, "r", encoding="utf-8", newline="") as handle:
        for row in csv.reader(handle, delimiter=';', quotechar='\''):
            if len(row) < 2:
                continue
            label, text = row[0], row[1]
            if label not in categories:
                continue
            if not allowed.issuperset(text):
                continue
            labels.append(label)
            data.append(text)
    return data, labels

# Bind the training texts to `train_data`: that is the name the size check below
# and all later cells (tokenization, vocabulary) read.
train_data, train_labels = prepare("train.csv")
train_texts = train_data  # alias kept for anything still using the old name

test_data, test_labels = prepare("test.csv")

print(len(train_data))
print(len(test_data))

2154
244


# Text Preparation

Split the valid training and test texts into tokens using spaCy.

In [371]:
!python -m spacy download de
import spacy
from collections import Counter
nlp = spacy.load("de")

test_token = [nlp.tokenizer(x) for x in test_data]
train_token = [nlp.tokenizer(x) for x in train_data]

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


Determine the 5000 most common tokens in the training data.

In [71]:
# Tokenize the whole training corpus as one space-joined text.
doc = nlp.tokenizer(' '.join(train_data))

# Collect the surface form of every token that is neither a stop word,
# nor punctuation, nor a single-space token.
words = []
for token in doc:
    if token.is_stop or token.is_punct or token.text == ' ':
        continue
    words.append(token.text)

# (token, count) pairs for the 5000 most frequent tokens, most frequent first.
word_freq = Counter(words)
common_words = word_freq.most_common(5000)

print(common_words[:5])

[('Prozent', 2176), ('Euro', 2045), ('Österreich', 1111), ('Wien', 1057), ('Millionen', 864)]


For those tokens, create the `word_to_ix` dictionary, which maps each token to an integer index.

In [75]:
# Class label -> class index (used as the loss target during training).
tag_to_ix = {'Wirtschaft': 0, 'Sport': 1}

# Token -> index, numbered in the order of `common_words` (most frequent first).
# `common_words` holds (token, count) pairs with each token appearing once.
word_to_ix = {token: position for position, (token, _count) in enumerate(common_words)}

print(word_to_ix)
print(tag_to_ix)

{'Prozent': 0, 'Euro': 1, 'Österreich': 2, 'Wien': 3, 'Millionen': 4, 'STANDARD': 5, 'Austria': 6, 'Milliarden': 7, 'Unternehmen': 8, 'Spiel': 9, 'Bank': 10, 'Deutschland': 11, 'steht': 12, 'Sonntag': 13, 'Salzburg': 14, 'Saison': 15, 'Platz': 16, '2014': 17, 'FC': 18, 'Sieg': 19, 'USA': 20, '2015': 21, 'zuletzt': 22, 'Rapid': 23, 'Punkte': 24, '2016': 25, 'laut': 26, 'Minute': 27, 'Samstag': 28, 'Trainer': 29, 'Montag': 30, 'Donnerstag': 31, 'Mannschaft': 32, 'Freitag': 33, 'Mittwoch': 34, 'Wiener': 35, 'Dienstag': 36, '1': 37, 'liegt': 38, 'APA': 39, 'Minuten': 40, 'Geld': 41, 'Tore': 42, 'geben': 43, 'Europa': 44, 'erklärte': 45, 'Land': 46, 'bzw.': 47, 'letzten': 48, 'Regierung': 49, 'Mio.': 50, '20': 51, 'Team': 52, 'Dollar': 53, 'fast': 54, 'Banken': 55, 'Runde': 56, 'derzeit': 57, 'Menschen': 58, 'Woche': 59, 'knapp': 60, 'Griechenland': 61, 'Spieler': 62, '1:0': 63, '15': 64, 'Tor': 65, 'EU': 66, 'League': 67, 'stehen': 68, 'deutlich': 69, 'deutschen': 70, 'Folge': 71, 'sieht':

# LSTM

In [316]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from time import time

# Hyperparameters (exposed as Colab form fields).
embedding_dim = 128 #@param {type:"integer"}
hidden_dim = 256 #@param {type:"integer"}
lr = 0.1 #@param {type:"number"}
# The training loop reads `n_epochs`, which no cell defined before. The value is
# a placeholder default, not the setting of the original run; adjust as needed.
n_epochs = 10 #@param {type:"integer"}

In [168]:
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear model producing log-probabilities over tags.

    One row of tag log-probabilities is produced per input token.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the three layers.

        Args:
            embedding_dim: Size of each token embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of distinct token indices.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table from token index to embedding vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embeddings and emits one hidden state of size
        # hidden_dim per token.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects every hidden state onto the tag scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return a (len(sentence), tagset_size) tensor of log-probabilities.

        Args:
            sentence: Tensor of token indices for one text.
        """
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (n_tokens, 1, features): the sentence is fed as a batch of one.
        hidden_states, _ = self.lstm(embedded.view(n_tokens, 1, -1))
        scores = self.hidden2tag(hidden_states.view(n_tokens, -1))
        # Normalise across the tag dimension.
        return F.log_softmax(scores, dim=1)

In [187]:
def prepare_sequence(seq, to_ix):
    """Convert a token sequence into a 1-D long tensor of vocabulary indices.

    Each token is looked up by its ``str()`` form; tokens that are not keys of
    ``to_ix`` are dropped, so the result may be shorter than ``seq`` (or empty).

    Args:
        seq: Iterable of tokens.
        to_ix: Mapping from token string to integer index.
    """
    indices = []
    for token in seq:
        key = str(token)
        if key in to_ix:
            indices.append(to_ix[key])
    return torch.tensor(indices, dtype=torch.long)

In [412]:
def test():
    """Evaluate the global ``model`` on the tokenized test set.

    For every article the tag scores of the last token are used as the
    prediction; it counts as correct when the score of the true class is
    strictly greater than that of the other class (two classes are assumed).

    Articles without any in-vocabulary token are skipped, as in ``train()``:
    the model cannot process an empty index sequence.

    Returns:
        Accuracy in percent over the scored articles, or ``0.0`` if no
        article could be scored.
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for sentence, label in zip(test_token, test_labels):
            sentence_in = prepare_sequence(sentence, word_to_ix)
            if len(sentence_in) < 1:
                continue
            target = tag_to_ix[label]

            tag_scores = model(sentence_in)

            total += 1
            # Compare the true class against the other class at the last token.
            if tag_scores[-1][target].item() > tag_scores[-1][1 - target].item():
                correct += 1
    if total == 0:
        return 0.0
    return 100 * correct / total

In [406]:
def test_sample(n_samples=10):
    """Print the label and the model's tag scores for the first training articles.

    Args:
        n_samples: Maximum number of training articles to show (default 10).
            Fewer are shown if the training set is smaller.

    Articles without any in-vocabulary token cannot be fed to the model; for
    those only the label and a short note are printed.
    """
    with torch.no_grad():
        for label, tokens in zip(train_labels[:n_samples], train_token[:n_samples]):
            print(label)
            inputs = prepare_sequence(tokens, word_to_ix)
            if len(inputs) < 1:
                print("(no in-vocabulary tokens)")
                continue
            tag_scores = model(inputs)
            print(tag_scores)

In [407]:
def train():
    """Run one training epoch over the tokenized training set, one article per step.

    Uses the global ``model``, ``loss_function`` and ``optimizer``. The loss is
    computed on the tag scores of the last token only. Accuracy is measured on
    the forward pass made before each parameter update.

    Articles without any in-vocabulary token are skipped.

    Returns:
        ``(accuracy, mean_loss)``: accuracy in percent and the loss averaged
        over all articles trained on in this epoch. If no article could be
        used, returns ``(0.0, nan)``.
    """
    correct = 0
    total = 0
    loss_sum = 0.0
    for sentence, label in zip(train_token, train_labels):
        # Turn the tokens into a tensor of word indices; skip articles that
        # contain no known word, since the model cannot process an empty input.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        if len(sentence_in) < 1:
            continue
        targets = torch.tensor([tag_to_ix[label]], dtype=torch.long)

        # Clear gradients left over from the previous step, then run the forward pass.
        model.zero_grad()
        tag_scores = model(sentence_in)

        # The prediction is correct if, at the last token, the true class
        # scores higher than the other class.
        total += 1
        target = targets[0].item()
        if tag_scores[-1][target].item() > tag_scores[-1][1 - target].item():
            correct += 1

        # Compute the loss on the last token's scores, backpropagate and
        # update the parameters.
        loss = loss_function(tag_scores[-1:], targets)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    if total == 0:
        return 0.0, float("nan")
    return (100 * correct / total), loss_sum / total


In [409]:
# Fix the seed so that weight initialisation is repeatable across runs.
torch.manual_seed(42)

# Fall back to the CPU when no GPU is attached to the runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): `device` is not used by the cells visible here. The model and the
# tensors built in train()/test() are never moved to it, so moving only the model
# would cause a device mismatch. Confirm before adding `.to(device)`.
model = LSTMTagger(embedding_dim, hidden_dim, len(word_to_ix), len(tag_to_ix))
# NLLLoss matches the log_softmax output of the model.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Train for n_epochs epochs; after each one, evaluate on the test set and
# report accuracy, loss and the elapsed time.
time_start = time()
for epoch in range(n_epochs):
    train_acc, train_loss = train()
    test_acc = test()
    # Time since the loop started, in minutes.
    time_compl = (time() - time_start) / 60
    report_lines = [
        "Epoch {} | Training Time (in minutes) = {}".format(epoch + 1, time_compl),
        "     Acc: {}%(train) | Loss: {}".format(train_acc, train_loss),
        "     Acc: {}%(test)".format(test_acc),
    ]
    print("\n".join(report_lines))


# Bonus

In [None]:
# TODO(bonus): train in mini-batches with `torch.utils.data.DataLoader`.
# The previous bare `DataLoader()` call could not run: the class was never
# imported, and its constructor requires a dataset argument.