In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import re

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data loading and preprocessing
def load_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    data_pairs = []
    for line in lines:
        parts = line.strip().split("\t")
        if len(parts) >= 2:
            source, target = parts[0].lower(), parts[1].lower()
            data_pairs.append((source, target))
    return data_pairs

class Vocabulary:
    def __init__(self, texts, min_freq=2):
        word_counter = Counter()
        for text in texts:
            tokens = self.tokenize(text)
            word_counter.update(tokens)

        self.words = ["<pad>", "<sos>", "<eos>", "<unk>"] + [word for word, freq in word_counter.items() if freq >= min_freq]
        self.word2idx = {word: idx for idx, word in enumerate(self.words)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    @staticmethod
    def tokenize(text):
        return re.findall(r'\b\w+\b', text)

    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.word2idx.get(token, self.word2idx["<unk>"]) for token in tokens]

    def decode(self, indices):
        return " ".join([self.idx2word.get(idx, "<unk>") for idx in indices])

class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src_encoded = [self.src_vocab.word2idx["<sos>"]] + self.src_vocab.encode(src) + [self.src_vocab.word2idx["<eos>"]]
        tgt_encoded = [self.tgt_vocab.word2idx["<sos>"]] + self.tgt_vocab.encode(tgt) + [self.tgt_vocab.word2idx["<eos>"]]
        return torch.tensor(src_encoded), torch.tensor(tgt_encoded)

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, n_heads, n_layers, ff_dim, dropout=0.1):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_dim)
        self.transformer = nn.Transformer(d_model=embed_dim, nhead=n_heads, num_encoder_layers=n_layers, \
                                          num_decoder_layers=n_layers, dim_feedforward=ff_dim, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(embed_dim, tgt_vocab_size)

    def forward(self, src, tgt):
        src_emb = self.src_embedding(src)
        tgt_emb = self.tgt_embedding(tgt)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)
        output = self.transformer(src=src_emb, tgt=tgt_emb, tgt_mask=tgt_mask)
        return self.fc_out(output)

def train_model(model, data_loader, optimizer, criterion, n_epochs):
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)

            optimizer.zero_grad()

            output = model(src, tgt[:, :-1])

            output = output.reshape(-1, output.size(-1))
            tgt_output = tgt[:, 1:].reshape(-1)

            loss = criterion(output, tgt_output)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {total_loss:.4f}")

def collate_fn(batch):
    src_batch = [item[0] for item in batch]
    tgt_batch = [item[1] for item in batch]
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_vocab.word2idx["<pad>"])
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tgt_vocab.word2idx["<pad>"])
    return src_padded, tgt_padded

if __name__ == "__main__":
    data_path = "ukr.txt"
    data_pairs = load_dataset(data_path)

    src_texts = [pair[0] for pair in data_pairs]
    tgt_texts = [pair[1] for pair in data_pairs]

    src_vocab = Vocabulary(src_texts)
    tgt_vocab = Vocabulary(tgt_texts)

    dataset = TranslationDataset(data_pairs, src_vocab, tgt_vocab)
    data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    model = Transformer(len(src_vocab.words), len(tgt_vocab.words), embed_dim=256, n_heads=8, n_layers=4, ff_dim=512).to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.word2idx["<pad>"])

    train_model(model, data_loader, optimizer, criterion, n_epochs=10)


KeyboardInterrupt: 

In [32]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Завантаження токенізатора та моделі для перекладу з французької на українську
model_name = "Helsinki-NLP/opus-mt-en-uk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Текст французькою
input_text = "Bonjour, comment ça va?"

# Токенізація вхідного тексту
inputs = tokenizer(input_text, return_tensors="pt")

# Генерація перекладу
outputs = model.generate(**inputs)

# Декодування перекладу
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Translated text:", translated_text)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Параметри
batch_size = 128
learning_rate = 1e-3
epochs = 20
latent_dim = 20

# Завантаження даних
transform = transforms.Compose([
    transforms.ToTensor(),  # Значення будуть в діапазоні [0, 1]
])

dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Модель VAE
class VAE(nn.Module):
    def __init__(self, latent_dim):
        super(VAE, self).__init__()
        
        # Енкодер
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, 400),
            nn.ReLU(),
            nn.Linear(400, 2 * latent_dim)  # Мю та сигма в одному шарі
        )
        
        # Декодер
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 28 * 28),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        # Енкодинг
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)
        z = self.reparameterize(mu, log_var)
        
        # Декодинг
        recon_x = self.decoder(z)
        return recon_x, mu, log_var

# Функція втрат
def loss_function(recon_x, x, mu, log_var):
    x = (x + 1) / 2  # Перетворення [-1, 1] -> [0, 1]
    BCE = nn.functional.binary_cross_entropy(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return BCE + KLD

# Ініціалізація моделі, оптимізатора
vae = VAE(latent_dim).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

# Тренування
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vae.to(device)

for epoch in range(epochs):
    vae.train()
    train_loss = 0

    for batch_idx, (data, _) in enumerate(data_loader):
        data = data.to(device)
        data = data.view(-1, 28 * 28)

        optimizer.zero_grad()
        recon_batch, mu, log_var = vae(data)
        loss = loss_function(recon_batch, data, mu, log_var)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {train_loss / len(dataset):.4f}')

# Збереження моделі
torch.save(vae.state_dict(), 'vae_fashion_mnist.pth')

# Візуалізація
import matplotlib.pyplot as plt
vae.eval()

with torch.no_grad():
    z = torch.randn(64, latent_dim).to(device)
    samples = vae.decoder(z).view(-1, 1, 28, 28).cpu()

    plt.figure(figsize=(8, 8))
    for i in range(64):
        plt.subplot(8, 8, i + 1)
        plt.imshow(samples[i].squeeze(), cmap='gray')
        plt.axis('off')
    plt.show()

Epoch 1, Loss: 477.9832
Epoch 2, Loss: 472.0933
Epoch 3, Loss: 471.2002
Epoch 4, Loss: 470.6733
Epoch 5, Loss: 470.3130


KeyboardInterrupt: 