In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from collections import Counter
import numpy as np
import re
import random

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess the data
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    pairs = []
    for line in lines:
        parts = line.split("\t")
        if len(parts) >= 2:
            source = parts[0].strip().lower()
            target = parts[1].strip().lower()
            # Clean target from attribution metadata
            target = re.sub(r"cc-by.*$", "", target).strip()
            pairs.append((source, target))

    return pairs

# Tokenizer and Vocabulary
class Tokenizer:
    def __init__(self, text_list, min_freq=2):
        self.counter = Counter()
        for text in text_list:
            tokens = self.tokenize(text)
            self.counter.update(tokens)

        self.tokens = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(
            [word for word, freq in self.counter.items() if freq >= min_freq]
        )
        self.token2idx = {token: idx for idx, token in enumerate(self.tokens)}
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text)

    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.token2idx.get(token, self.token2idx["<UNK>"]) for token in tokens]

    def decode(self, indices):
        return " ".join([self.idx2token.get(idx, "<UNK>") for idx in indices])

# Custom Dataset
class TranslationDataset(Dataset):
    def __init__(self, pairs, source_tokenizer, target_tokenizer):
        self.pairs = pairs
        self.source_tokenizer = source_tokenizer
        self.target_tokenizer = target_tokenizer

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        source_encoded = [self.source_tokenizer.token2idx["<BOS>"]] + self.source_tokenizer.encode(source) + [self.source_tokenizer.token2idx["<EOS>"]]
        target_encoded = [self.target_tokenizer.token2idx["<BOS>"]] + self.target_tokenizer.encode(target) + [self.target_tokenizer.token2idx["<EOS>"]]
        return torch.tensor(source_encoded), torch.tensor(target_encoded)

# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, embed_size, num_heads, num_layers, ff_hidden_dim, dropout=0.1):
        super().__init__()
        self.source_embedding = nn.Embedding(source_vocab_size, embed_size)
        self.target_embedding = nn.Embedding(target_vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=ff_hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(embed_size, target_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, source, target):
        source = self.dropout(self.source_embedding(source))
        target = self.dropout(self.target_embedding(target))

        # Generate masks for the target
        tgt_seq_len = target.size(1)
        tgt_mask = torch.triu(torch.ones(tgt_seq_len, tgt_seq_len) * float('-inf'), diagonal=1).to(device)

        # Forward pass through transformer
        output = self.transformer(src=source, tgt=target, tgt_mask=tgt_mask)
        return self.fc_out(output)


# Training and Evaluation
def train_model(model, dataloader, optimizer, criterion, epochs=1):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for source, target in dataloader:
            source, target = source.to(device), target.to(device)
            target_input = target[:, :-1]
            target_output = target[:, 1:]

            optimizer.zero_grad()

            # Pass source and target_input through the model
            output = model(source, target_input)

            # Reshape output and target_output for loss calculation
            output = output.reshape(-1, output.shape[-1])
            target_output = target_output.reshape(-1)

            loss = criterion(output, target_output)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")


def translate_sentence(model, sentence, source_tokenizer, target_tokenizer, max_len=50):
    model.eval()

    # Encode source sentence
    source_encoded = [source_tokenizer.token2idx["<BOS>"]] + source_tokenizer.encode(sentence) + [source_tokenizer.token2idx["<EOS>"]]
    source_tensor = torch.tensor(source_encoded).unsqueeze(0).to(device)

    # Start with <BOS> token for target
    target_tensor = torch.tensor([[target_tokenizer.token2idx["<BOS>"]]]).to(device)

    for _ in range(max_len):
        # Pass source and current target tensor to the model
        output = model(source_tensor, target_tensor)

        # Get the predicted next token
        next_token = output[:, -1, :].argmax(-1).item()

        # Append the predicted token to the target sequence
        target_tensor = torch.cat([target_tensor, torch.tensor([[next_token]]).to(device)], dim=1)

        # Stop if <EOS> token is predicted
        if next_token == target_tokenizer.token2idx["<EOS>"]:
            break

    # Decode the output sequence
    translated_sentence = target_tokenizer.decode(target_tensor.squeeze(0).cpu().numpy())
    return translated_sentence.replace("<BOS>", "").replace("<EOS>", "").strip()


def collate_fn(batch):
    sources = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    sources_padded = nn.utils.rnn.pad_sequence(sources, batch_first=True, padding_value=source_tokenizer.token2idx["<PAD>"])
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=target_tokenizer.token2idx["<PAD>"])

    return sources_padded, targets_padded

# Main

# Load dataset
pairs = load_data(r"/content/drive/MyDrive/ukr/ukr.txt")

# Tokenizers
source_texts = [pair[0] for pair in pairs]
target_texts = [pair[1] for pair in pairs]
source_tokenizer = Tokenizer(source_texts)
target_tokenizer = Tokenizer(target_texts)

# Use only 10% of the dataset
subset_size = int(len(pairs) * 0.1)
subset_indices = random.sample(range(len(pairs)), subset_size)
subset_pairs = [pairs[i] for i in subset_indices]

# Dataset and DataLoader
dataset = TranslationDataset(subset_pairs, source_tokenizer, target_tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Model, Optimizer, Criterion
model = TransformerModel(len(source_tokenizer.tokens), len(target_tokenizer.tokens), embed_size=256, num_heads=8, num_layers=4, ff_hidden_dim=512).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=target_tokenizer.token2idx["<PAD>"])

  # Train for 1 epoch
train_model(model, dataloader, optimizer, criterion, epochs=1)


KeyboardInterrupt: 

In [32]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Завантаження токенізатора та моделі для перекладу з французької на українську
model_name = "Helsinki-NLP/opus-mt-en-uk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Текст французькою
input_text = "Bonjour, comment ça va?"

# Токенізація вхідного тексту
inputs = tokenizer(input_text, return_tensors="pt")

# Генерація перекладу
outputs = model.generate(**inputs)

# Декодування перекладу
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Translated text:", translated_text)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Параметри
batch_size = 128
learning_rate = 1e-3
epochs = 20
latent_dim = 20

# Завантаження даних
transform = transforms.Compose([
    transforms.ToTensor(),  # Значення будуть в діапазоні [0, 1]
])

dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Модель VAE
class VAE(nn.Module):
    def __init__(self, latent_dim):
        super(VAE, self).__init__()
        
        # Енкодер
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, 400),
            nn.ReLU(),
            nn.Linear(400, 2 * latent_dim)  # Мю та сигма в одному шарі
        )
        
        # Декодер
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 28 * 28),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        # Енкодинг
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)
        z = self.reparameterize(mu, log_var)
        
        # Декодинг
        recon_x = self.decoder(z)
        return recon_x, mu, log_var

# Функція втрат
def loss_function(recon_x, x, mu, log_var):
    x = (x + 1) / 2  # Перетворення [-1, 1] -> [0, 1]
    BCE = nn.functional.binary_cross_entropy(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return BCE + KLD

# Ініціалізація моделі, оптимізатора
vae = VAE(latent_dim).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

# Тренування
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vae.to(device)

for epoch in range(epochs):
    vae.train()
    train_loss = 0

    for batch_idx, (data, _) in enumerate(data_loader):
        data = data.to(device)
        data = data.view(-1, 28 * 28)

        optimizer.zero_grad()
        recon_batch, mu, log_var = vae(data)
        loss = loss_function(recon_batch, data, mu, log_var)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {train_loss / len(dataset):.4f}')

# Збереження моделі
torch.save(vae.state_dict(), 'vae_fashion_mnist.pth')

# Візуалізація
import matplotlib.pyplot as plt
vae.eval()

with torch.no_grad():
    z = torch.randn(64, latent_dim).to(device)
    samples = vae.decoder(z).view(-1, 1, 28, 28).cpu()

    plt.figure(figsize=(8, 8))
    for i in range(64):
        plt.subplot(8, 8, i + 1)
        plt.imshow(samples[i].squeeze(), cmap='gray')
        plt.axis('off')
    plt.show()

Epoch 1, Loss: 477.9832
Epoch 2, Loss: 472.0933
Epoch 3, Loss: 471.2002
Epoch 4, Loss: 470.6733
Epoch 5, Loss: 470.3130


KeyboardInterrupt: 