In [2]:
%pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87

In [3]:
from datasets import load_dataset

import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Dict, List

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


*Basic Model, definition and loaded*

In [4]:
from tqdm.auto import tqdm
from collections import defaultdict
import pickle


class Morph():

    """ Trainable class that goes through a corpus and first saves the
        available words in a dictionary. It then reiterates the corpus and
        saves endings of the corpus by checking if each subword exists in the
        corpus. """

    def __init__(self):
        self.counts = defaultdict(lambda: 0)
        self.endings = defaultdict(lambda: 0)

        self.probable_endings = []
        self.is_trained = False

        self.vocab = set()

    def train(self, vocab):
        """ Training function.

            param: vocab: func -> generator

            Takes a function that returns a generator which is used twice in
            order to first create the counts dictionary, and then the
            endings-dictionary
        """
        v, l = vocab()
        bar = tqdm(range(l))
        for d in v:
            for sen in d:
                for word in sen.split():
                    word = word.lower()
                    self.vocab.add(word)
                    self.counts[word] += 1
            bar.update()

        v, l = vocab()
        bar = tqdm(range(l))
        for d in v:
            for sen in d:
                for word in sen.split():
                    word = word.lower()
                    self.process_word(word)
            bar.update()

        self.is_trained = True
        self.num_tokens = len(self.vocab)

    def process_word(self, word):
        """ Processes an input word.

            param: word: str

            Take and looks at each subword. Adds the ending to the
            endings-dictionary if the stem exits in the vocab. """

        stem = ''
        scores = {}

        for char in word.lower():
            stem += char
            ending = word[len(stem):]
            if self.counts[stem] and ending != '':
                self.endings[ending] += 1

    def evaluate_word(self, word):
        stem = ''
        endscores = {}
        scores = {}

        # Get counts
        for char in word.lower():
            stem += char
            ending = word[len(stem):]
            scores[stem] = self.counts[stem]
            endscores[ending] = self.endings[ending]

        print(scores)
        print(endscores)

        # Normalize
        sum_stem = 0
        sum_end = 0
        for score, endscore in zip(scores.values(), endscores.values()):
          sum_end += endscore
          sum_stem += score

        if sum_stem:
          stemprobs = {key: value / sum_stem  for key, value in scores.items()}
        else:
          stemprobs = {key: value for key, value in scores.items()}

        if sum_end:
          endprobs = {key: value / sum_end  for key, value in endscores.items()}
        else:
          endprobs = {key: value for key, value in endscores.items()}

        if not sum_end and not sum_stem:
          return (word, '')

        #print(stemprobs)

        highest_split = 0
        stem = ''
        pair = (word, '')
        total_prob = 0
        for char in word:
            stem += char
            ending = word[len(stem):]

            if stemprobs[stem]: # Only include splits were stem is in the vocab
              prob = ((1 * stemprobs[stem]) + (1 * endprobs[ending])) / 2
              total_prob += prob
            else:
              prob = 0
              total_prob += prob

            if highest_split < prob:
                highest_split = prob
                pair = (stem, ending)

        return pair

    def evaluate_sentence(self, sentence):
        tokens = sentence.split()
        splits = [ self.evaluate_word(word.lower()) for word in tokens ]

        processed_sentence = ""
        for split in splits:

            stem = split[0]
            end = split[1]

            word = stem

            if end != '' and stem != '':
                word = stem + '-' + end
            elif word != '' and end == '':
                word = stem
            else:
                word = end

            processed_sentence += word + ' '

        return processed_sentence

    def get_probable_endings(self, n=20):
        e, p = list(self.endings.keys()), list(self.endings.values())

        a = [x for _, x in sorted(zip(p, e))]

        self.probable_endings = a[-n:]
        return self.probable_endings

    def get_endings(self):
        if self.is_trained:
            return self.probable_endings
        else:
            print("Model has not been trained. ")
            return False

    def load_model(self, path, drive=False):
        if drive:
            file_path = '/content/gdrive/MyDrive/' + path
        else:
            file_path = path

        with open(file_path + '_counts.pickle', 'rb') as handle:
            self.counts = pickle.load(handle)

        with open(file_path + '_endings.pickle', 'rb') as handle:
            self.endings = pickle.load(handle)

        self.counts = defaultdict(int, self.counts)
        self.endings = defaultdict(int, self.endings)

        self.is_trained = True
        self.get_probable_endings()

        self.vocab = set(list(self.counts.keys()))
        self.num_tokens = len(self.vocab)

        return True

    def save_model(self, path, drive=False):
        if drive:
            file_path = '/content/gdrive/MyDrive/' + path
        else:
            file_path = path

        with open(file_path + '_counts.pickle', 'wb') as handle:
            pickle.dump(dict(self.counts), handle, protocol=pickle.HIGHEST_PROTOCOL)

        with open(file_path + '_endings.pickle', 'wb') as handle:
            pickle.dump(dict(self.endings), handle, protocol=pickle.HIGHEST_PROTOCOL)

        return True

    def __call__(self, sentence):
        return self.evaluate_sentence(sentence)

m = Morph()

In [33]:
m.load_model('swedish_model', drive=True)

True

*Evaluate the basic model*

In [36]:
m("barnen stabilt ditt sked? badkar spelad skedar spelat")

{'b': 4, 'ba': 1, 'bar': 1, 'barn': 16, 'barne': 0, 'barnen': 0}
{'arnen': 0, 'rnen': 0, 'nen': 8, 'en': 811, 'n': 2639, '': 0}
{'s': 9, 'st': 3, 'sta': 0, 'stab': 0, 'stabi': 0, 'stabil': 0, 'stabilt': 0}
{'tabilt': 0, 'abilt': 0, 'bilt': 0, 'ilt': 0, 'lt': 12, 't': 2270, '': 0}
{'d': 0, 'di': 0, 'dit': 0, 'ditt': 63}
{'itt': 131, 'tt': 2283, 't': 2270, '': 0}
{'s': 9, 'sk': 0, 'ske': 0, 'sked': 0, 'sked?': 0}
{'ked?': 0, 'ed?': 9, 'd?': 43, '?': 4278, '': 0}
{'b': 4, 'ba': 1, 'bad': 4, 'badk': 0, 'badka': 0, 'badkar': 0}
{'adkar': 0, 'dkar': 0, 'kar': 7, 'ar': 527, 'r': 1785, '': 0}
{'s': 9, 'sp': 0, 'spe': 0, 'spel': 9, 'spela': 12, 'spelad': 0}
{'pelad': 0, 'elad': 0, 'lad': 0, 'ad': 1415, 'd': 158, '': 0}
{'s': 9, 'sk': 0, 'ske': 0, 'sked': 0, 'skeda': 0, 'skedar': 0}
{'kedar': 0, 'edar': 0, 'dar': 4, 'ar': 527, 'r': 1785, '': 0}
{'s': 9, 'sp': 0, 'spe': 0, 'spel': 9, 'spela': 12, 'spelat': 1}
{'pelat': 1, 'elat': 0, 'lat': 0, 'at': 36, 't': 2270, '': 0}


'barn-en s-tabilt ditt s-ked? bad-kar spel-ad s-kedar spela-t '

In [35]:
m.get_probable_endings(n=10)

['ör', 'an', 'ad', 'r', 'g', 't', 'tt', 'n', 'a', '?']

*Preparing Data Generators for Training of Basic Model, don't run unless you want to train new model*

In [8]:
### English data
data = load_dataset('xsum')

def get_vocab():
    le = 1000
    batch_size = 10
    return ([data['train']['document'][i] for i in range(batch_size)] for _ in range(le)), le



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
### Swedish data
sv = load_dataset('KBLab/overlim', 'qqp_sv')

def get_vocab():
    le = 300000
    batch_size = 50
    return (sv['train']['text_a'][i*batch_size:(i + 1)*batch_size] for i in range(0, le, batch_size)), int(le / batch_size)


Downloading builder script:   0%|          | 0.00/22.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.26k [00:00<?, ?B/s]

Downloading and preparing dataset overlim/qqp_sv (download: 20.98 MiB, generated: 55.33 MiB, post-processed: Unknown size, total: 76.31 MiB) to /root/.cache/huggingface/datasets/KBLab___overlim/qqp_sv/1.0.2/6ca5c4db42719c22d2dfbc40ce443cc3ffe94db893063ec66686be25549c7e7b...


Downloading data:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/323419 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40427 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Dataset overlim downloaded and prepared to /root/.cache/huggingface/datasets/KBLab___overlim/qqp_sv/1.0.2/6ca5c4db42719c22d2dfbc40ce443cc3ffe94db893063ec66686be25549c7e7b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

*Prepare for semi-supervised learning by picking out labels based on the basic model*

In [42]:
def get_gold_labels(model):
    e, p = list(model.endings.keys()), list(model.endings.values())

    a = [x for _, x in sorted(zip(p, e))]
    end = a[-30:] # Only picking words with most common endings

    print(end)

    gold = {}
    for word in list(model.vocab):
        if model.counts[word] < 0.001*model.num_tokens: # Hyperparameter

            pair = model.evaluate_word(word)
            ending = pair[1]

            if ending in end or ending == '':
              gold[word] = ending

    return gold

labeled_data = get_gold_labels(m)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'l': 1, 'lå': 0, 'låt': 7, 'låts': 0, 'låtsa': 0, 'låtsas': 2, 'låtsasv': 0, 'låtsasvä': 0, 'låtsasvän': 0, 'låtsasvänn': 0, 'låtsasvänne': 0, 'låtsasvänner': 0, 'låtsasvänner?': 1}
{'åtsasvänner?': 1, 'tsasvänner?': 0, 'sasvänner?': 1, 'asvänner?': 0, 'svänner?': 0, 'vänner?': 1, 'änner?': 5, 'nner?': 1, 'ner?': 8, 'er?': 118, 'r?': 74, '?': 4278, '': 0}
{'b': 4, 'br': 1, 'bro': 0, 'bror': 4, 'brors': 1, 'brorso': 0}
{'rorso': 0, 'orso': 0, 'rso': 0, 'so': 1, 'o': 72, '': 0}
{'i': 1572, 'im': 1, 'imi': 0, 'imin': 0, 'imino': 0, 'iminod': 0, 'iminodi': 0, 'iminodiä': 0, 'iminodiät': 0, 'iminodiätt': 0, 'iminodiätti': 0, 'iminodiättik': 0, 'iminodiättiks': 0, 'iminodiättiksy': 0, 'iminodiättiksyr': 0, 'iminodiättiksyra': 0}
{'minodiättiksyra': 0, 'inodiättiksyra': 0, 'nodiättiksyra': 0, 'odiättiksyra': 0, 'diättiksyra': 0, 'iättiksyra': 0, 'ättiksyra': 0, 'ttiksyra': 0, 'tiksyra': 0, 'iksyra': 0, 'ksyra': 0, 'syra': 0, 'y

*Character-based tokenizer for transformer*

In [24]:
class Tokenizer():
    def __init__(self, morph: Morph):
        text = ' '
        for word in morph.vocab:
            text += word
        self.padding_token = '<PAD>'
        self.chars = [self.padding_token] + sorted(list(set(text)))
        self.i2w = { i: char for i, char in enumerate(self.chars) }
        self.w2i = { char: i for i, char in enumerate(self.chars) }

        self.longest_word = max(list(morph.vocab), key=len)

    def encode(self, string):
        encoding = [self.w2i[c] for c in string]
        padding = [self.w2i[self.padding_token] for _ in range(len(self.longest_word) - len(string))]
        mask = [1 for _ in encoding] + [0 for _ in padding]
        return encoding + padding, mask

    def decode(self, idx):
        return ''.join([self.i2w[c] for c in idx if self.i2w[c] != self.padding_token])


*Transformer with one bidirectional-attention block*

In [25]:
class Transformer(nn.Module):

    def __init__(self, input_dim, embed_dim, max_len):
        super(Transformer, self).__init__()
        self.embed_dim = embed_dim
        self.max_len = max_len

        self.embed = nn.Embedding(input_dim, embed_dim)
        self.keys = nn.Linear(embed_dim, embed_dim, bias=False)
        self.queries = nn.Linear(embed_dim, embed_dim, bias=False)
        self.values = nn.Linear(embed_dim, embed_dim, bias=False)

        self.output = nn.Linear(embed_dim*max_len, max_len)


    def forward(self, x, mask):

        x = self.embed(x)

        q = self.queries(x)
        k = self.keys(x)
        v = self.values(x)

        a = self.attention(q, k, v, mask)

        out = F.softmax(self.output(a.view(-1, self.max_len * self.embed_dim)), dim=1)

        return out

    def attention(self, q, k, v, mask):

        wei = q @ k.transpose(-2, -1)

        wei = wei.masked_fill(mask[:, :, None] == 0, float('-inf'))
        score = F.softmax(wei, dim=1)

        return score @ v

    def get_pseudo_labels(self, x):
        return


*Dataset classes for training supervised and semi-supervised*

In [46]:
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ratio = 10

class LabeledSet(Dataset):

    def __init__(self, data: Dict, tok: Tokenizer):
        self.tok = tok

        self.words = list(data.keys())
        self.labels = list(data.values())

    def __getitem__(self, idx):
        word = self.words[idx]
        ending = self.labels[idx]

        encoded_word, mask = self.tok.encode(word)
        data, mask = torch.tensor(encoded_word, dtype=torch.int32, device=device), torch.tensor(mask, dtype=torch.int32, device=device)
        label = torch.tensor(len(word) - len(ending), device=device, dtype=torch.int64)

        return data, mask, label

    def __len__(self):
        return len(self.words)

class UnLabaledSet(Dataset):

    def __init__(self, data: Dict, tok: Tokenizer, num_batches: int):
        self.tok = tok

        self.words = list(data.keys())[:batch_size*ratio*batch_size]

    def __getitem__(self, idx):

        word = self.words[idx]
        encoded_word, mask = self.tok.encode(word)
        data, mask = torch.tensor(encoded_word, dtype=torch.int32, device=device), torch.tensor(mask, dtype=torch.int32, device=device)

        return data, mask

    def __len__(self):
        return len(self.words)


batch_size = 4

t = Tokenizer(m)

unlabeled_data = dict(m.counts)

L = LabeledSet(labeled_data, t)

num_batches = len(L) // batch_size

UL = UnLabaledSet(unlabeled_data, t, num_batches)

unlbd_dl = DataLoader(UL, shuffle=True, batch_size=batch_size*ratio)
lbd_dl = DataLoader(L, shuffle=True, batch_size=batch_size)




SEMI-SUPERVISED TRAINING, skip unless you want to train from scratch


In [14]:
from torch.optim import AdamW
from tqdm.auto import tqdm

def alpha(t):
    T1 = 15
    T2 = 95
    alpha_f = 3

    if t < T1:
        return 0
    elif T1 <= t < T2:
        return ((t - T1) / (T2 - T1)) * alpha_f
    else:
        return alpha_f

model = Transformer(len(t.chars), 32, len(t.longest_word))
model = model.to(device)
opt = AdamW(model.parameters(), lr=0.001)
epochs = 100
crit = nn.CrossEntropyLoss()
bar = tqdm(range(epochs))

losses = []
iter_dl = iter(unlbd_dl)

for epoch in range(epochs):
  total_loss = 0
  total_unlbd = 0
  total_lbd = 0

  for data, mask, labels in lbd_dl:

      pred = model(data, mask)

      lbd_loss = F.cross_entropy(pred, labels) / pred.shape[0]

      for data, mask in unlbd_dl:
          break

      pred = model(data, mask)

      pseudo_labels = torch.argmax(pred, dim=1).detach()

      unlbd_loss = F.cross_entropy(pred, pseudo_labels) / pred.shape[0]

      loss = lbd_loss + (alpha(epoch) * unlbd_loss)

      loss.backward()
      opt.step()
      opt.zero_grad()

      total_loss += loss.item()
      total_lbd += lbd_loss.item()
      total_unlbd += unlbd_loss.item()

  bar.set_description(f"Alpha = {round(alpha(epoch), 3)} Unlab = {round(total_unlbd)} Lab = {round(total_lbd)} Loss: {round(total_loss)}")
  bar.update()
  losses.append(total_loss)

  semi_model = model


  0%|          | 0/100 [00:00<?, ?it/s]

*Loading pre-trained model, skip if training model from scratch*

In [53]:
semi_model = Transformer(len(t.chars), 32, len(t.longest_word))
semi_model.to(device)
semi_model.load_state_dict(torch.load('/content/gdrive/MyDrive/semi_model.pt'))

<All keys matched successfully>

*Evaluation of semi-supervised model*

In [54]:


words = ["barnen", 'stabilt', 'ditt', 'sked?', 'badkar', 'spelad', 'skedar', 'spelat']
masks = torch.zeros(8, len(t.longest_word), dtype=torch.int64).to(device)
tens = torch.zeros(8, len(t.longest_word), dtype=torch.int64).to(device)

for i, word in enumerate(words):
  en, mask = t.encode(word)
  ten = torch.tensor(en, device=device, dtype=torch.int64)
  mask = torch.tensor(mask, device=device, dtype=torch.int64)
  tens[i, :] = ten
  masks[i, :] = mask

preds = torch.argmax(semi_model(tens, masks), dim=1).cpu().detach().numpy()
print(preds)
for i, word in enumerate(words):
  print(word[:preds[i]] + '-' + word[preds[i]:])

[4 6 3 4 5 4 5 5]
barn-en
stabil-t
dit-t
sked-?
badka-r
spel-ad
skeda-r
spela-t


SUPERVISED TRAINING, skip unless you want to train from scratch

In [47]:
from torch.optim import AdamW
from tqdm.auto import tqdm


model = Transformer(len(t.chars), 32, len(t.longest_word))
model = model.to(device)
opt = AdamW(model.parameters(), lr=0.0005)
epochs = 100
crit = nn.CrossEntropyLoss()
bar = tqdm(range(epochs))

losses = []
for epoch in range(epochs):
  total_loss = 0
  for data, mask, labels in lbd_dl:

      pred = model(data, mask)

      lbd_loss = F.cross_entropy(pred, labels)

      loss = lbd_loss

      loss.backward()
      opt.step()
      opt.zero_grad()
      total_loss += loss.item()

  bar.set_description(f"Loss: {round(total_loss, 3)}")
  bar.update()
  losses.append(total_loss)

  0%|          | 0/100 [00:00<?, ?it/s]

*Pre-trained model, skip if training from scratch*

In [55]:
model = Transformer(len(t.chars), 32, len(t.longest_word))
model.to(device)
model.load_state_dict(torch.load('/content/gdrive/MyDrive/model.pt'))

<All keys matched successfully>

*Evaluation of supervised model*

In [56]:
words = ["barnen", 'stabilt', 'ditt', 'sked?', 'badkar', 'spelad', 'skedar', 'spelat']
masks = torch.zeros(8, len(t.longest_word), dtype=torch.int64).to(device)
tens = torch.zeros(8, len(t.longest_word), dtype=torch.int64).to(device)

for i, word in enumerate(words):
  en, mask = t.encode(word)
  ten = torch.tensor(en, device=device, dtype=torch.int64)
  mask = torch.tensor(mask, device=device, dtype=torch.int64)
  tens[i, :] = ten
  masks[i, :] = mask

preds = torch.argmax(model(tens, masks), dim=1).cpu().detach().numpy()
print(preds)
for i, word in enumerate(words):
  print(word[:preds[i]] + '-' + word[preds[i]:])

[4 6 4 4 5 6 4 4]
barn-en
stabil-t
ditt-
sked-?
badka-r
spelad-
sked-ar
spel-at


In [37]:
model

Transformer(
  (embed): Embedding(81, 32)
  (keys): Linear(in_features=32, out_features=32, bias=False)
  (queries): Linear(in_features=32, out_features=32, bias=False)
  (values): Linear(in_features=32, out_features=32, bias=False)
  (output): Linear(in_features=1248, out_features=39, bias=True)
)

In [39]:
semi_model

Transformer(
  (embed): Embedding(81, 32)
  (keys): Linear(in_features=32, out_features=32, bias=False)
  (queries): Linear(in_features=32, out_features=32, bias=False)
  (values): Linear(in_features=32, out_features=32, bias=False)
  (output): Linear(in_features=1248, out_features=39, bias=True)
)

In [None]:
m = Transformer()