In [None]:
import pandas as pd
import string
import spacy
import re
import numpy as np
import spacy.cli
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

torch.manual_seed(1)
spacy.cli.download("de_core_news_md")
spacy.prefer_gpu() #returns true if GPU/Cuda is available

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_md')


True

In [None]:
nlp = spacy.load('de_core_news_md')

In [None]:
test_url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/test.csv'
train_url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv'
df_train = pd.read_csv(train_url, error_bad_lines=False, delimiter=';', quotechar='\'', header=None,names=['Topic', 'Text'])
df_test = pd.read_csv(test_url, error_bad_lines=False, delimiter=';', quotechar='\'',  header=None, names=['Topic', 'Text'])

In [None]:
 all_tokens = []

In [None]:
df_train.info

<bound method DataFrame.info of               Topic                                               Text
0             Sport  21-Jähriger fällt wohl bis Saisonende aus. Wie...
1            Kultur  Erfundene Bilder zu Filmen, die als verloren g...
2               Web  Der frischgekürte CEO Sundar Pichai setzt auf ...
3        Wirtschaft  Putin: "Einigung, dass wir Menge auf Niveau vo...
4            Inland  Estland sieht den künftigen österreichischen P...
...             ...                                                ...
9240         Inland  Bernd Saurer war Bridge-Juniorenweltmeister un...
9241  International  Sandhere soll in vergangener Woche bei Luftang...
9242     Wirtschaft  Derzeit Konzeptgruppe in Berlin – Kein Komment...
9243         Inland  Landeshauptmann will den vierten Regierungssit...
9244       Panorama  Er ist einer von Millionen syrischen Flüchtlin...

[9245 rows x 2 columns]>

In [None]:
df_test.info

<bound method DataFrame.info of               Topic                                               Text
0        Wirtschaft  Die Gewerkschaft GPA-djp lanciert den "All-in-...
1             Sport  Franzosen verteidigen 2:1-Führung – Kritische ...
2               Web  Neues Video von Designern macht im Netz die Ru...
3             Sport  23-jähriger Brasilianer muss vier Spiele pausi...
4     International  Aufständische verwendeten Chemikalie bei Gefec...
...             ...                                                ...
1023            Web  Programm "Ebay Plus" soll Ende September start...
1024         Inland  Fehlerhafte Bilanz 2013, Partei will berufen –...
1025          Sport  Als erster Olympia-Champion auch Hawaii-Sieger...
1026          Sport  Der Außenseiter aus Denver gewinnt die 50. Sup...
1027            Web  Wohl keine Einschränkungen bei Google-Services...

[1028 rows x 2 columns]>

In [None]:
##The number of valid lines for training and test should be 2154 and 244.
valid_chars = string.ascii_letters + "ÄÖÜäöüß?–" + string.punctuation + string.digits + string.whitespace 

def check(str, pattern):
    # _matching the strings
    if re.search(pattern, str):
        #print("Valid String")
        return True
    else:
        #print("Invalid String")
        return False

regex = "^["+valid_chars+"]+$"

#filter train data frame
df_train = df_train[(df_train['Topic'] == 'Sport') | (df_train['Topic'] == 'Wirtschaft')] 
pattern = re.compile(regex)
mask = df_train['Text'].apply(lambda x: check(x, pattern))
df_train = df_train[mask]
df_train = df_train.reset_index(drop=True)
print(df_train.info)

#filter test data frame
df_test = df_test[(df_test['Topic'] == 'Sport') | (df_test['Topic'] == 'Wirtschaft')] 
pattern = re.compile(regex)
mask = df_test['Text'].apply(lambda x: check(x, pattern))
df_test = df_test[mask]
df_test = df_test.reset_index(drop=True)
print(df_test.info)

<bound method DataFrame.info of            Topic                                               Text
0          Sport  21-Jähriger fällt wohl bis Saisonende aus. Wie...
1     Wirtschaft  Putin: "Einigung, dass wir Menge auf Niveau vo...
2     Wirtschaft  Der Welser Stempelhersteller verbreitert sich ...
3          Sport  Traditionsklub setzt sich gegen den FC Utrecht...
4          Sport  Abschiedstournee für Guardiola beginnt beim HS...
...          ...                                                ...
2149  Wirtschaft  Austria Glas Recycling appelliert an Bevölkeru...
2150  Wirtschaft  Kein Kommentar, ob Raffinerie in Schwechat ode...
2151  Wirtschaft  Günter Geyer zieht nach wie vor die Fäden – El...
2152  Wirtschaft  Der heimische Baukonzern zieht einen Großauftr...
2153  Wirtschaft  Derzeit Konzeptgruppe in Berlin – Kein Komment...

[2154 rows x 2 columns]>
<bound method DataFrame.info of           Topic                                               Text
0    Wirtschaft  Die Gewerk

In [None]:
def tokenize(lines, nlp, train=False):
  tokens = []
  for line in lines:
    line_tokens = []
    doc = nlp(line)
    for token in doc:
      line_tokens.append(token.text)
      if train:
        all_tokens.append(token.text)
    tokens.append(line_tokens)
  return tokens

train_tokens = tokenize(df_train.loc[:, "Text"], nlp, train=True)
test_tokens = tokenize(df_test.loc[:, "Text"], nlp)

In [None]:
#create dictionary of 5000 most common words in trainingsdata
unique_list, counts = np.unique(np.array(all_tokens), return_counts=True)
arr = np.asarray((unique_list, counts)).T
sortedArr = arr[arr[:,1].argsort()]
sortedArr = np.flip(sortedArr)
top_fiveK = sortedArr[:5000]

In [None]:
top_fiveK[:3]

array([['99', 'Gastgeber'],
       ['99', 'Aussendung'],
       ['99', 'solche']], dtype='<U88')

In [None]:
tag_to_ix = {"Wirtschaft": 0, "Sport": 1}
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for counter, token in top_fiveK:
  if token not in word_to_ix:  # word has not been assigned an index yet
    word_to_ix[token] = len(word_to_ix)  # Assign each word with a unique index
print(word_to_ix)

{'Gastgeber': 0, 'Aussendung': 1, 'solche': 2, 'Abschluss': 3, 'Geschäft': 4, 'Verein': 5, 'Noch': 6, 'Sport': 7, 'lang': 8, 'geplant': 9, 'du': 10, '80': 11, 'geführt': 12, 'vielleicht': 13, 'verdient': 14, 'Sein': 15, 'Über': 16, 'Coach': 17, 'Beispiel': 18, 'Schweden': 19, '35': 20, 'Niederösterreicher': 21, 'Rolle': 22, 'Linz': 23, 'nächste': 24, 'Dazu': 25, 'Ersatz': 26, 'Zentralbank': 27, 'genug': 28, 'sorgte': 29, 'findet': 30, 'Preis': 31, 'Außerdem': 32, 'Training': 33, 'gegangen': 34, 'Heta': 35, 'Viertelfinale': 36, 'Partien': 37, 'Politik': 38, 'Staatsanwaltschaft': 39, 'Zinsen': 40, 'richtig': 41, 'ihnen': 42, 'Insgesamt': 43, 'Staat': 44, '2018': 45, 'gehe': 46, 'stärker': 47, 'Investitionen': 48, 'Nun': 49, 'EM': 50, 'beste': 51, 'berichtet': 52, 'starken': 53, 'stellen': 54, 'legte': 55, 'hohen': 56, '!': 57, 'Jahr': 58, 'beim': 59, 'seit': 60, 'wieder': 61, 'Karriere': 62, 'weltweit': 63, 'zahlen': 64, 'möchte': 65, 'Verkauf': 66, 'holte': 67, '2:2': 68, 'Parlament': 6

In [None]:
#format train data
#neccessary structure for train data: (['The', 'dog', 'ate', 'the', 'apple'], ['Sport'])

train_data = train_tokens.copy()
for idx, tokens in enumerate(train_data):
  
  label = df_train.loc[idx, "Topic"]
  train_data[idx] = (tokens, [label])

print(train_data[0])

#format test data
test_data = test_tokens.copy()
for idx, tokens in enumerate(test_data):
  
  label = df_test.loc[idx, "Topic"]
  test_data[idx] = (tokens, [label])

print(test_data[0])

(['21-Jähriger', 'fällt', 'wohl', 'bis', 'Saisonende', 'aus', '.', 'Wien', '–', 'Rapid', 'muss', 'wohl', 'bis', 'Saisonende', 'auf', 'Offensivspieler', 'Thomas', 'Murg', 'verzichten', '.', 'Der', 'im', 'Winter', 'aus', 'Ried', 'gekommene', '21-Jährige', 'erlitt', 'beim', '0:4-Heimdebakel', 'gegen', 'Admira', 'Wacker', 'Mödling', 'am', 'Samstag', 'einen', 'Teilriss', 'des', 'Innenbandes', 'im', 'linken', 'Knie', ',', 'wie', 'eine', 'Magnetresonanz-Untersuchung', 'am', 'Donnerstag', 'ergab', '.', 'Murg', 'erhielt', 'eine', 'Schiene', ',', 'muss', 'aber', 'nicht', 'operiert', 'werden', '.', 'Dennoch', 'steht', 'ihm', 'eine', 'mehrwöchige', 'Pause', 'bevor', '.'], ['Sport'])
(['Die', 'Gewerkschaft', 'GPA-djp', 'lanciert', 'den', '"', 'All-in-Rechner', '"', 'und', 'findet', ',', 'dass', 'die', 'Vertragsform', 'auf', 'die', 'Führungsebene', 'beschränkt', 'gehört', '.', 'Wien', '–', 'Die', 'Gewerkschaft', 'GPA-djp', 'sieht', 'Handlungsbedarf', 'bei', 'sogenannten', 'All-in-Verträgen', '.', 'F

In [None]:
def prepare_sequence(seq, to_ix):
  idxs = []
  for word in seq:
    #skip not existing words in dict
    if word in to_ix:
      idxs.append(to_ix[word])
  return torch.tensor(idxs, dtype=torch.long)

In [None]:
#testing if sequence mapping for tags and sentence works
for sentence, tags in train_data[:1]:
  print(prepare_sequence(sentence, word_to_ix))

tensor([1943, 3205, 1978, 3311, 4872, 3246, 3211, 2733,   59,   90, 3246, 4844,
         586, 1978, 4990, 3242])


In [None]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
      ### mapping from 5000 Vocabs to 128 embedding dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores   ###outputs scores for each word. The last score represents the score for the whole sentence
        
correct = 0
total = 0
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
model.to(device)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
# with torch.no_grad():
    # inputs = prepare_sequence(train_data[0][0], word_to_ix)
    # tag_scores = model(inputs)
    # print(tag_scores)

for epoch in range(5):  
    training_time = 0
    for sentence, topic in train_data:
        #start logging the training time
        start = time.perf_counter()

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
          
        sentence_in = prepare_sequence(sentence, word_to_ix)   ##words ->embeddings
        targets = prepare_sequence(topic, tag_to_ix)   ## targets->embeddings
        sentence_in.to(device)
        targets.to(device)

        # # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
        #get last prediction
        output = tag_scores[-1]   #takes only the last score

        prediction = (output.argmax(0) == targets).sum().item() ##takes the index with higher prob.
        correct += prediction
        total += 1

        output = torch.reshape(output, (1, -1)) #reshape into loss function format

        # Step 4. Compute the loss, gradients, and update the parameters by
        loss = loss_function(output, targets)
        logged_loss = loss.data

        loss.backward()
        optimizer.step()

        #end logging the training time
        end = time.perf_counter()
        training_time += (end-start)
###accuracy is calculated for each epoche
    acc =  100 * correct / total
    print(f"Epoch: {epoch+1} | trained in {round(training_time/60., 2)} minutes")
    print(f"Train Accuracy: {acc}% \t Loss: {logged_loss}")
    


    test_correct = 0
    test_total = 0
    for sentence, topic in test_data:
      sentence_in = prepare_sequence(sentence, word_to_ix)
      targets = prepare_sequence(topic, tag_to_ix)
      sentence_in.to(device)
      targets.to(device)
      tag_scores = model(sentence_in)
      output = tag_scores[-1]

      prediction = (output.argmax(0) == targets).sum().item()
      test_correct += prediction
      test_total += 1

    test_acc =  100 * test_correct / test_total
    print(f"Test Accuracy: {test_acc}%\n")    

Running on cuda:0
Epoch: 1 | trained in 0.51 minutes
Train Accuracy: 67.50232126276694% 	 Loss: 0.3359502851963043
Test Accuracy: 77.45901639344262%

Epoch: 2 | trained in 0.51 minutes
Train Accuracy: 72.16805942432683% 	 Loss: 0.08834531158208847
Test Accuracy: 82.37704918032787%

Epoch: 3 | trained in 0.51 minutes
Train Accuracy: 77.01949860724234% 	 Loss: 0.011496486142277718
Test Accuracy: 84.8360655737705%

Epoch: 4 | trained in 0.51 minutes
Train Accuracy: 80.55942432683379% 	 Loss: 0.009496986865997314
Test Accuracy: 84.42622950819673%

Epoch: 5 | trained in 0.51 minutes
Train Accuracy: 83.31476323119777% 	 Loss: 0.0013167529832571745
Test Accuracy: 82.78688524590164%

