In [7]:
# Load the whole training corpus into memory as a single string
data_path = './../filtered_training_data/Finnish/train_data_kalevala.txt'
with open(data_path, encoding='utf-8') as file:
    content = file.read()

# Peek at the opening 100 characters, then report the corpus size in characters
print(content[:100])
print(len(content))

Kalevala 1849

Kalevalan ensimmäinen painos ilmestyi vuonna 1835. Teos syntyi Elias Lönnrotin
(1802 
569255


In [8]:
# Build the character-level vocabulary: every distinct symbol in the corpus, in sorted order
chars = sorted(set(content))
vocab_size = len(chars)

# Show the vocabulary as one string, followed by its size
print(''.join(chars))
print(vocab_size)



 !"'(),.01234589:;?@AEHIJKLMNOPRSTUVYadefghijklmnoprstuvyÄäö
61


In [9]:
# Lookup tables between characters and integer ids (an id is the character's
# position in the sorted vocabulary `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encoded(s):
    """Encode a string into a list of integer ids, one per character."""
    return [stoi[ch] for ch in s]


def decoded(l):
    """Decode a list of integer ids back into a string."""
    return ''.join(itos[i] for i in l)


# Round-trip the first 100 characters: print the ids, then the text decoded from them
print(encoded(content[:100]))
print(decoded(encoded(content[:100])))

[26, 38, 47, 40, 56, 38, 47, 38, 1, 10, 15, 13, 16, 0, 0, 26, 38, 47, 40, 56, 38, 47, 38, 49, 1, 40, 49, 53, 44, 48, 48, 59, 44, 49, 40, 49, 1, 51, 38, 44, 49, 50, 53, 1, 44, 47, 48, 40, 53, 54, 57, 44, 1, 56, 55, 50, 49, 49, 38, 1, 10, 15, 12, 14, 8, 1, 34, 40, 50, 53, 1, 53, 57, 49, 54, 57, 44, 1, 22, 47, 44, 38, 53, 1, 27, 60, 49, 49, 52, 50, 54, 44, 49, 0, 5, 10, 15, 9, 11, 1]
Kalevala 1849

Kalevalan ensimmäinen painos ilmestyi vuonna 1835. Teos syntyi Elias Lönnrotin
(1802 


In [10]:
# Encode the entire content
import torch

# Encode the full corpus and wrap the resulting ids in an int64 tensor in one step
encoded_content = torch.tensor(encoded(content), dtype=torch.long)

# Show the (truncated) tensor representation
print(encoded_content)

tensor([26, 38, 47,  ..., 53, 38,  8])


In [11]:
# Summary statistics of the encoded corpus (token ids)
as_float = encoded_content.float()  # one float copy shared by the mean and std computations
summary = (
    ('Min:', encoded_content.min().item()),
    ('Max:', encoded_content.max().item()),
    ('Mean:', as_float.mean().item()),
    ('Std:', as_float.std().item()),
    ('Length:', len(encoded_content)),
)
for label, value in summary:
    print(label, value)

Min: 0
Max: 60
Mean: 38.12738037109375
Std: 19.195058822631836
Length: 569255


In [12]:
# Hold out the tail of the token stream for validation: the first 80% is used
# for training, the remaining 20% for validation
train_size = int(0.8 * len(encoded_content))
train_data, val_data = encoded_content[:train_size], encoded_content[train_size:]

In [13]:
# Seed PyTorch's global RNG before any batches are sampled (get_batch below draws
# its start offsets with torch.randint)
torch.manual_seed(42)
batch_size = 32 # number of sequences drawn per batch (rows of x and y in get_batch)
block_size = 128 # length of each sampled sequence, i.e. the maximum context length

# Create a function to generate batches of data
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape ``(batch_size, block_size)``.
        ``y`` holds the same windows as ``x`` shifted one position to the right,
        so ``y[b, t]`` is the token that follows ``x[b, t]`` in the source data.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per sequence; the exclusive upper bound leaves room
    # for the target window, which ends one position later than the input window.
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    # Broadcast the offsets against 0..block_size-1 to index every window at once.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    x = source[positions]
    y = source[positions + 1]
    return x, y

# Draw one training batch and show the shape and contents of inputs, then targets
xb, yb = get_batch('train')
for batch in (xb, yb):
    print(batch.shape)
    print(batch)

torch.Size([32, 128])
tensor([[56, 44, 47,  ..., 38, 43, 38],
        [49,  7,  0,  ..., 56, 44, 54],
        [43, 57, 56,  ..., 50, 46, 53],
        ...,
        [51, 44, 50,  ..., 44,  1, 56],
        [55, 53,  7,  ..., 56, 38, 47],
        [27, 40, 49,  ..., 49,  1, 53]])
torch.Size([32, 128])
tensor([[44, 47, 44,  ..., 43, 38, 49],
        [ 7,  0,  1,  ..., 44, 54, 54],
        [57, 56, 59,  ..., 46, 53, 44],
        ...,
        [44, 50, 49,  ...,  1, 56, 59],
        [53,  7,  0,  ..., 38, 47, 45],
        [40, 49, 54,  ...,  1, 53, 38]])


In [14]:
# Walk through every (context, target) training pair contained in the batch.
# yb is xb shifted by one position (see get_batch), so the target yb[b, t] is
# the token that follows xb[b, t]; its context is therefore everything up to
# and including position t.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1]  # inclusive slice; xb[b, :t] would drop the most recent token
        target = yb[b, t]
        # Printing is left disabled: batch_size * block_size pairs would flood the output.
        # print(f'Context: {context}')
        # print(f'Target: {target}')

In [15]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table; row ``i``
    holds the unnormalised scores (logits) for every token that may follow token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size x vocab_size`` logit lookup table."""
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the training loss.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)``.
            targets: Optional integer tensor of next-token ids, shape ``(B, T)``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape ``(B, T, C)``
            and ``loss`` is ``None``. With targets, ``logits`` is flattened to
            ``(B * T, C)`` (the layout ``F.cross_entropy`` is called with) and
            ``loss`` is the cross-entropy over all ``B * T`` positions.
        """
        logits = self.token_embedding(idx)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (rather than view) also accepts non-contiguous target tensors
        loss = F.cross_entropy(logits, targets.reshape(-1))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph on every step
    def generate(self, idx, max_tokens=100):
        """Autoregressively sample ``max_tokens`` new tokens after ``idx``.

        Args:
            idx: Integer tensor of shape ``(B, T)`` holding the starting context.
            max_tokens: Number of tokens to append to each sequence.

        Returns:
            Integer tensor of shape ``(B, T + max_tokens)``: the input context
            followed by the sampled continuation.
        """
        for _ in range(max_tokens):
            logits, _loss = self(idx)  # (B, T, C); loss is None without targets
            logits = logits[:, -1, :]  # keep only the prediction at the last position
            probs = F.softmax(logits, dim=-1)  # normalise over the vocabulary axis
            idx_new = torch.multinomial(probs, num_samples=1)  # one sampled id per sequence
            idx = torch.cat([idx, idx_new], dim=1)  # extend along the time axis
        return idx


# Instantiate the model and run one forward pass on the current batch as a sanity check
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 1000 tokens from the not-yet-trained model, starting from a single token with id 0
idx = torch.zeros(1, 1, dtype=torch.long)
generated = m.generate(idx, max_tokens=1000)
print(decoded(generated[0].tolist()))

torch.Size([4096, 61])
tensor(4.6249, grad_fn=<NllLossBackward0>)

0A1J@@Ti,faOLUIdTeP.1K)oi;U:a.hKUHgMeo03MmUHP@2@"9luAKNYTiEmNy8Ä)4TäMh3uS0s8SKLh8ÄYa,S@
yöoK2PEkujuj,f2 "(ÄLpH8ÄölKf3äI8g,'t?iO";'aHLRöLP.NR0ytSKpfnN2ÄtiO?(";ldÄÄ,yR,dNg83
5i1pni5KTiL3ähP h(2OÄ94y?esniÄOdaa1 AäU;ekHM?8gaö;äU3ukMn
9K29nR,
3l8,n 2gMtnRMYö14y8sJhY.MaOöJsAao8PP4nHs,o5
jä45ÄdeTuOSkp"MSOPSKV"8(vMh3@H3tvL
VK,)4)4röujt4gHLi;'rjT"e05uO)(O
S(?Tetiu9pvE5(8:
 (tvvOOti!!ilHU5EmN2j4gg)sla
KsNA!nR?2mtö:d0;3tiu(uV0öuO!HsfKr.!sSMeoaII1@mm,nHOOj !) Tu!jAöAKAU)ju;äö8,Yr84fRv"VRkSH4ÄYja)jSpumöujhUHrKj0"gldU"fÄ:3nR2ÄM(eäU?f@0i,KpM9K2Uptö4EK;,oSpkv2;YJI?i,3ä@rYoeo5aNhO2L3LoRf"y''dT8MO!Sp)('gOeR4V"es"0@dHUa4'HÄaOuJJdV";m1,4VAdTf2OkmaR5@rUH3LoueM18ÄÄOSAäsEm?ÄyäÄL50,V')vök;ökRäu!SÄMgMu(g@,8 (dväETAH(I i5,g02V(diSä:3Pyj.g3Oö4o@AhKLiudrj0TuNÄUj,sff!
4V9@khyYp:R!S imi5.eYg)j1ö:0v14öd)R5uKr..äjKr S4jtiIAdU;YnJI@rTp08AHk.@3Li!!sjNdUVA3Lmlans.eT5vOnsE:3u2rP.äoMgET8d0uO 'y8V;lP5vp4UNnR2O4p
IöL
Kj5K@rjäILiÄd?8e0T84:9jKS5@la5Ä@!4TeTHYSa

In [16]:
from tqdm import tqdm

# Adam optimiser over all model parameters
opt = torch.optim.Adam(m.parameters(), lr=1e-3)

# Optimisation loop: one freshly sampled training batch per step
for steps in tqdm(range(10000), desc="Training", unit="step"):
    opt.zero_grad(set_to_none=True)  # clear gradients left over from the previous step
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    opt.step()

# Loss of the final mini-batch only (a single-batch value, not an average)
print(f'Loss: {loss.item()}')

Training: 100%|██████████| 10000/10000 [00:23<00:00, 424.34step/s]

Loss: 2.3332417011260986





In [17]:
# Sample 1000 tokens from the trained model, starting from a single token with id 0
idx = torch.zeros(1, 1, dtype=torch.long)
sampled_ids = m.generate(idx, max_tokens=1000)[0].tolist()
print(decoded(sampled_ids))


 kstajataissiki usunuttojasatänt mahoutäinnän,
 va,
 akit yvaaummevilluon hei,
 staulkoreisinutonelmalotä hamikahinelöykstsen oaiese' atyvälka, seta,
 kkinan Mana jä "Läillesa,
 "Ma kan vi ovelika pa etäpiren keropytaja vi.
 m9llon vöertä harola,

 yNä' va.
  jaulangelallu Vise panei.
 t:
 o5tosurjahalÄsenent lauisetä n umäsäimäse llarrtan Sa,
 ken, evai:
 ha llelain,
 hata,
 kuui:
 Vellevisehomikatt mäna,
 mestäpihtsauulusasitisttesa,
 pokonrvojoikän pämen psen vavare vatin vot on kaienetunnsä isi:
 en si roittä sstsuotitikamevä,
 eni:
 lÄä, kitema,
 lläihina;
 Uni sesttervalehjasanäää.
 loino ppähatäinelele,
 lmisoaika llhtei,
 lltena,
 lemeran alPo se Sisusihtantssise Katä "Oi,
 pi,
 llituahitskon lliksuoikeluovähaleniri!
 vanahaiveroi ta,
 pä pitkeneup'ohoisisteitävi kuta?
 La visäisinärerjoni mähine aahi vama,
 Loisyn aap'isimine va vin, Kytäisi ke väsuosehätetoinuhehessehotyöyyöirrmökonehahetttvisima lilapin
 Aja.
 tän petssujahana, kippo ta mmpipiltisapä,
 tut kuokullluakarella