In [61]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
import os

# hyperparameters
batch_size = 64 
block_size = 256 
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

print(block_size)
torch.manual_seed(166045)

# Load text data
with open('pan-tadeusz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split, seq_sampling=False):
    data = train_data if split == 'train' else val_data
    if seq_sampling:
        start_idx = torch.randint(0, len(data) - block_size, (1,)).item()
        x = data[start_idx:start_idx + block_size].unsqueeze(0).repeat(batch_size, 1)
        y = data[start_idx + 1:start_idx + block_size + 1].unsqueeze(0).repeat(batch_size, 1)
    else:
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i + block_size] for i in ix])
        y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = x + self.dropout(self.sa(self.ln1(x)))
        x = x + self.dropout(self.ffwd(self.ln2(x)))
        return x

class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


def calculate_perplexity(logits, targets):
    # Przekształć logits na logarytmy prawdopodobieństw (softmax + log)
    log_probs = F.log_softmax(logits, dim=-1)

    # Spłaszcz targets do wymiaru 1D, aby pasował do logits
    targets = targets.view(-1)

    # Wybierz logarytmy prawdopodobieństw odpowiadające poprawnym tokenom (targets)
    log_probs_target = log_probs[torch.arange(logits.size(0)), targets]

    # Oblicz średni logarytm prawdopodobieństwa na token
    mean_log_prob = log_probs_target.mean()

    # Oblicz perplexity na podstawie wzoru
    perplexity = torch.exp(-mean_log_prob).item()

    return perplexity





# model = GPTModel().to(device)
# print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# # Lists to store metrics
# train_losses, val_losses, perplexities_train, perplexities_val = [], [], [], []

# # Training loop
# for iter in range(max_iters):
#     # Collect training metrics for every iteration
#     xb, yb = get_batch('train')
#     logits, loss = model(xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()

#     # Save training loss and perplexity for every iteration
#     train_losses.append(loss.item())
#     perplexities_train.append(calculate_perplexity(logits, yb))

#     # Collect validation metrics for every iteration
#     X_val, Y_val = get_batch('val')
#     logits_val, _ = model(X_val, Y_val)
#     val_loss = F.cross_entropy(logits_val.view(-1, logits_val.size(-1)), Y_val.view(-1)).item()
#     val_losses.append(val_loss)
#     perplexity_val = calculate_perplexity(logits_val, Y_val)
#     perplexities_val.append(perplexity_val)

#     # Print metrics at eval_interval or the last iteration
#     if iter % eval_interval == 0 or iter == max_iters - 1:
#         print(f"step {iter}: train loss {train_losses[-1]:.4f}, val loss {val_losses[-1]:.4f}, "
#               f"train perplexity {perplexities_train[-1]:.4f}, val perplexity {perplexity_val:.4f}")
        
# # Create the directory if it doesn't exist
# output_dir = "wykresy"
# os.makedirs(output_dir, exist_ok=True)

# # Plot training and validation perplexity on the same graph
# plt.figure()
# plt.plot(perplexities_train, label='Nieokreśloność zbiór treningowy', color='blue')
# plt.plot(perplexities_val, label='Nieokreśloność zbiór walidacyjny', color='orange')
# plt.legend()
# plt.xlabel('Epoka')
# plt.ylabel('Nieokreśloność')
# plt.title('Zbiór treningowy vs walidacyjny')
# # Save the plot
# plt.savefig(os.path.join(output_dir, f'nieokreslonosc_{batch_size}.png'))
# plt.close()

# # Plot training loss
# plt.figure()
# plt.plot(train_losses, label='Strata zbiór treningowy', color='blue')
# plt.legend()
# plt.xlabel('Iteracja')
# plt.ylabel('Strata')
# plt.title('Strata treningowa w czasie')
# # Save the plot
# plt.savefig(os.path.join(output_dir, f'strata_treningowa_{batch_size}.png'))
# plt.close()

# # Plot validation loss
# plt.figure()
# plt.plot(val_losses, label='Strata zbiór walidacyjny', color='orange')
# plt.legend()
# plt.xlabel('Iteracja')
# plt.ylabel('Strata')
# plt.title('Strata walidacyjna w czasie')
# # Save the plot
# plt.savefig(os.path.join(output_dir, f'strata_walidacyjna_{batch_size}.png'))
# plt.close()

# # Generate text
# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated_text = decode(model.generate(context, max_new_tokens=2000)[0].tolist())

# # Save generated text to a file
# with open(f'wygenerowany_tekst_{batch_size}_{n_embd}_dropout_{dropout}.txt', 'w', encoding='utf-8') as f:
#     f.write(generated_text)



256


In [62]:
import os
import itertools
import matplotlib.pyplot as plt
import csv

torch.manual_seed(166045)

# Hiperparametry do przetestowania
grid_params = {
    "n_embd": [384],
    "n_head": [6],
    "n_layer": [6],
    "dropout": [0.2],
}

# Utworzenie przestrzeni hiperparametrów
param_combinations = list(itertools.product(
    grid_params["n_embd"],
    grid_params["n_head"],
    grid_params["n_layer"],
    grid_params["dropout"]
))

# Główna pętla Grid Search
# Lista do przechowywania wyników
output_dir = "wyniki_grid_search"
os.makedirs(output_dir, exist_ok=True)

for params in param_combinations:
    results = []
    n_embd, n_head, n_layer, dropout = params
    print(f"Training with config: n_embd={n_embd}, n_head={n_head}, n_layer={n_layer}, dropout={dropout}, learning rate = {learning_rate}, podzial = {0.9}")

    # Tworzenie katalogu na wyniki tej konfiguracji
    #config_dir = f"testy_tekstow_epoka_{max_iters}_model_koncowy"
    config_dir = f"{output_dir}/n_embd_{n_embd}_n_head_{n_head}_n_layer_{n_layer}_dropout_{dropout}_lr{learning_rate}_0.9tr"
    os.makedirs(config_dir, exist_ok=True)
    
    # Inicjalizacja modelu z bieżącą konfiguracją
    model = GPTModel().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    # Listy na wyniki
    train_losses, val_losses, perplexities_train, perplexities_val = [], [], [], []

    # Pętla treningowa
    for iter in range(max_iters):
        # Dane treningowe
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Zapis strat i perplexity
        train_losses.append(loss.item())
        perplexities_train.append(calculate_perplexity(logits, yb))

        # Dane walidacyjne
        X_val, Y_val = get_batch('val')
        logits_val, _ = model(X_val, Y_val)
        val_loss = F.cross_entropy(logits_val.view(-1, logits_val.size(-1)), Y_val.view(-1)).item()
        val_losses.append(val_loss)
        perplexity_val = calculate_perplexity(logits_val, Y_val)
        perplexities_val.append(perplexity_val)

        # Drukowanie metryk
        if iter % eval_interval == 0 or iter == max_iters - 1:
            print(f"step {iter}: train loss {train_losses[-1]:.4f}, val loss {val_losses[-1]:.4f}, "
                  f"train perplexity {perplexities_train[-1]:.4f}, val perplexity {perplexity_val:.4f}")
        # Zapis wyników
        result = {
            "n_embd": n_embd,
            "n_head": n_head,
            "n_layer": n_layer,
            "dropout": dropout,
            "final_train_loss": train_losses[-1],
            "final_val_loss": val_losses[-1],
            "final_train_perplexity": perplexities_train[-1],
            "final_val_perplexity": perplexities_val[-1],
        }
        results.append(result)

    # Zapis wykresów
    plt.figure()
    plt.plot(perplexities_train, label='Nieokreśloność zbiór treningowy', color='blue')
    plt.plot(perplexities_val, label='Nieokreśloność zbiór walidacyjny', color='orange')
    plt.legend()
    plt.xlabel('Iteracja')
    plt.ylabel('Nieokreśloność')
    plt.title('Nieokreśloność zbiór treningowy vs walidacyjny')
    plt.savefig(os.path.join(config_dir, 'perplexity.png'))
    plt.close()

    plt.figure()
    plt.plot(train_losses, label='Strata zbiór treningowy', color='blue')
    plt.legend()
    plt.xlabel('Iteracja')
    plt.ylabel('Strata')
    plt.title('Strata treningowa w czasie')
    plt.savefig(os.path.join(config_dir, 'train_loss.png'))
    plt.close()

    plt.figure()
    plt.plot(val_losses, label='Strata zbiór walidacyjny', color='orange')
    plt.legend()
    plt.xlabel('Iteracja')
    plt.ylabel('Strata')
    plt.title('Strata walidacyjna w czasie')
    plt.savefig(os.path.join(config_dir, 'val_loss.png'))
    plt.close()

    # Zapis wygenerowanego tekstu
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated_text = decode(model.generate(context, max_new_tokens=200)[0].tolist())
    with open(os.path.join(config_dir, 'generated_text.txt'), 'w', encoding='utf-8') as f:
        f.write(generated_text)

    # Zapis wyników do pliku CSV
    csv_file = os.path.join(config_dir, "grid_search_results.csv")
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=results[0].keys())
        writer.writeheader()
        
        writer.writerows(results)

Training with config: n_embd=384, n_head=6, n_layer=6, dropout=0.2, learning rate = 0.0003, podzial = 0.9
step 0: train loss 4.5507, val loss 3.8120, train perplexity 94.6974, val perplexity 45.2394
step 500: train loss 2.3749, val loss 2.3709, train perplexity 10.7496, val perplexity 10.7067
step 1000: train loss 2.0450, val loss 2.0854, train perplexity 7.7295, val perplexity 8.0482
step 1500: train loss 1.7935, val loss 1.9023, train perplexity 6.0103, val perplexity 6.7011
step 2000: train loss 1.6051, val loss 1.7849, train perplexity 4.9783, val perplexity 5.9588
step 2500: train loss 1.5077, val loss 1.7637, train perplexity 4.5165, val perplexity 5.8340
step 3000: train loss 1.4181, val loss 1.7456, train perplexity 4.1293, val perplexity 5.7295
step 3500: train loss 1.3451, val loss 1.7923, train perplexity 3.8384, val perplexity 6.0033
step 4000: train loss 1.2490, val loss 1.8403, train perplexity 3.4870, val perplexity 6.2981
step 4500: train loss 1.1732, val loss 1.8887, t

In [38]:
def generate_text_from_indices(model, fragment, stoi, itos, max_new_tokens=200, device='cpu'):
    # Zamiana fragmentu tekstu na indeksy
    input_indices = torch.tensor([stoi[c] for c in fragment], dtype=torch.long, device=device).unsqueeze(0)

    # Generowanie tokenów
    with torch.no_grad():
        output_indices = model.generate(input_indices, max_new_tokens=max_new_tokens)[0]

    # Zamiana indeksów na tekst
    generated_text = ''.join([itos[idx] for idx in output_indices.tolist()])
    return generated_text

# Przykład użycia
fragment = "Wpół drogi żyzny łąk i pszenic wyroślejsze"
generated_text = generate_text_from_indices(model, fragment, stoi, itos, max_new_tokens=300, device=device)

print("Wygenerowany tekst:")
print(generated_text)


Wygenerowany tekst:
Wpół drogi żyzny łąk i pszenic wyroślejsze,
Nieraz książę pocheształ i Rzeczpią procesowale,
Że ją każe. Lecz NauVoncik, bóg ptaszek łaskawie:
Już wtenczas żykie spojrze nań w dalę śniadzwania:
Dlacz mowych pada, szlachcica, świszcze sztuczkę z z dworze,
Biegł u wtenczas, zna gałkę szlachta i źliwości,
Zcię byłem do wsparli bitw powmiesz os
