In [2]:
!nvidia-smi  # Если выдаёт ошибку — доступ закрыт

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
!pip install --upgrade torch torchvision openssh-client



In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
torch.backends.cudnn.benchmark = True
torch.set_num_threads(os.cpu_count())

Датасет -- 1. Загрузка рубленного датасета с диска ---> 2. Предобработка и Токенизация ---> 3. Создание DataLoader

In [None]:
# 1. ЗАГРУЗКА ДАННЫХ
from google.colab import drive
drive.mount('/content/drive')

all_data = []
for i in range(1, 51):
    file_path = f"/content/drive/MyDrive/dataset/subtitles_text_{i}.csv"
    try:
        chunk = pd.read_csv(file_path)
        chunk['en'] = chunk['en'].astype(str).str.strip()
        chunk['ru'] = chunk['ru'].astype(str).str.strip()
        chunk = chunk[(chunk['en'] != '') & (chunk['ru'] != '')]
        all_data.append(chunk)
    except Exception as e:
        print(f"Ошибка при загрузке {file_path}: {str(e)}")

if not all_data:
    raise ValueError("Не загружено ни одного файла!")

full_data = pd.concat(all_data, ignore_index=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print(full_data.head)

<bound method NDFrame.head of                                                          en  \
0                                      - Get away from her!   
1                                      - Get away from her!   
2                                     She can't leave here.   
3                              - Her heart stopped beating.   
4                                        - She can't leave.   
...                                                     ...   
24999995   Who's gonna remember them if we don't film them?   
24999996  The battle of Grunwald was never filmed, but s...   
24999997                   - But you'd watch it? - I would.   
24999998                           They're playing Mazurek.   
24999999                         Do you know what it means?   

                                                         ru  
0                                          Отойдите от неё.  
1                                        - Отойдите от неё.  
2                          

In [None]:
# 2. СОЗДАНИЕ СЛОВАРЕЙ
def build_vocab(texts, max_vocab_size=50000, min_word_count=3):
    """
    Создает словарь с фильтрацией редких слов

    :param texts: входные тексты
    :param max_vocab_size: максимальный размер словаря
    :param min_word_count: минимальная частота слова для включения
    :return: словарь {слово: индекс}
    """
    # Специальные токены
    vocab = {
        "<pad>": 0,
        "<unk>": 1,
        "<sos>": 2,
        "<eos>": 3
    }

    # Подсчет частот слов
    word_counts = defaultdict(int)
    total_words = 0

    for text in texts:
        text = str(text).strip()
        if not text or text.lower() == 'nan':
            continue
        for word in text.split():
            word_counts[word] += 1
            total_words += 1

    # Фильтрация редких слов и сортировка по частоте
    filtered_words = [(word, count) for word, count in word_counts.items() if count >= min_word_count]
    sorted_words = sorted(filtered_words, key=lambda x: x[1], reverse=True)

    # Заполнение словаря
    for word, _ in sorted_words[:max_vocab_size - 4]:
        if word not in vocab:
            vocab[word] = len(vocab)

    # Анализ покрытия
    covered_words = sum(count for word, count in word_counts.items() if word in vocab)
    coverage = covered_words / total_words * 100

    print(f"Словарь: {len(vocab)} слов")
    print(f"Покрытие текста: {coverage:.2f}%")
    print(f"Отброшено редких слов (<{min_word_count} вхождений): {len(word_counts) - len(filtered_words)}")

    return vocab

vocab_en = build_vocab(full_data['en'], min_word_count=5)
vocab_ru = build_vocab(full_data['ru'], min_word_count=3)

Словарь: 50000 слов
Покрытие текста: 94.39%
Отброшено редких слов (<5 вхождений): 1582755
Словарь: 50000 слов
Покрытие текста: 84.92%
Отброшено редких слов (<3 вхождений): 2623982


In [None]:
print(f"EN словарь: {len(vocab_en)} слов")
print(f"RU словарь: {len(vocab_ru)} слов")

EN словарь: 50000 слов
RU словарь: 50000 слов


In [None]:
train_data, test_data = train_test_split(full_data, test_size=0.1, random_state=42)

In [None]:
# 3. СОЗДАНИЕ DATALOADER
class TranslationDataset(Dataset):
    def __init__(self, en_texts, ru_texts, vocab_en, vocab_ru, max_length=64):
        self.en_texts = en_texts.tolist()
        self.ru_texts = ru_texts.tolist()
        self.vocab_en = vocab_en
        self.vocab_ru = vocab_ru
        self.max_length = max_length

    def __len__(self):
        return len(self.en_texts)

    def __getitem__(self, idx):
        en_text = self.en_texts[idx]
        ru_text = self.ru_texts[idx]

        # Токенизация EN
        en_tokens = [self.vocab_en['<sos>']]
        en_tokens += [self.vocab_en.get(word, self.vocab_en['<unk>'])
                      for word in en_text.split()[:self.max_length-2]]
        en_tokens.append(self.vocab_en['<eos>'])
        en_tokens = en_tokens[:self.max_length]
        en_tokens += [self.vocab_en['<pad>']] * (self.max_length - len(en_tokens))

        # Токенизация RU
        ru_tokens = [self.vocab_ru['<sos>']]
        ru_tokens += [self.vocab_ru.get(word, self.vocab_ru['<unk>'])
                      for word in ru_text.split()[:self.max_length-2]]
        ru_tokens.append(self.vocab_ru['<eos>'])
        ru_tokens = ru_tokens[:self.max_length]
        ru_tokens += [self.vocab_ru['<pad>']] * (self.max_length - len(ru_tokens))

        return (
            torch.tensor(en_tokens, dtype=torch.long),
            torch.tensor(ru_tokens, dtype=torch.long)
        )

train_dataset = TranslationDataset(train_data['en'], train_data['ru'], vocab_en, vocab_ru)
test_dataset = TranslationDataset(test_data['en'], test_data['ru'], vocab_en, vocab_ru)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    pin_memory=False,
    num_workers=4
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    pin_memory=False,
    num_workers=4
)



Модель --> 1. Определение архитектуры --> 2. Определение модели --> Тестирование модели

In [None]:
# 4. МОДЕЛЬ
class Seq2Seq(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, hidden_size):
        super().__init__()
        self.encoder_embed = nn.Embedding(input_vocab_size, hidden_size, padding_idx=0)
        self.encoder_lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.decoder_embed = nn.Embedding(output_vocab_size, hidden_size, padding_idx=0)
        self.decoder_lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, src, trg):
        # Энкодер
        src_embedded = self.encoder_embed(src)
        _, (hidden, cell) = self.encoder_lstm(src_embedded)

        # Декодер
        trg_embedded = self.decoder_embed(trg[:, :-1])  # Убираем последний токен
        outputs = []

        for i in range(trg_embedded.size(1)):
            lstm_out, (hidden, cell) = self.decoder_lstm(
                trg_embedded[:, i].unsqueeze(1),
                (hidden, cell)
            )
            outputs.append(self.fc(lstm_out.squeeze(1)))

        return torch.stack(outputs, dim=1)

In [None]:
model = Seq2Seq(len(vocab_en), len(vocab_ru), 128)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

Seq2Seq(
  (encoder_embed): Embedding(50000, 128, padding_idx=0)
  (encoder_lstm): LSTM(128, 128, batch_first=True)
  (decoder_embed): Embedding(50000, 128, padding_idx=0)
  (decoder_lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=50000, bias=True)
)


Обучение -- 1. Определение оптимайзера --> 2. Проверка вычислительных потерь

In [None]:
#ГРАФИКИ
import matplotlib.pyplot as plt

def plot_training_history(train_losses, val_losses):
    plt.figure(figsize=(10, 5))

    # График потерь
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # График точности (если есть)
    plt.subplot(1, 2, 2)
    plt.plot([1/x for x in train_losses], label='Train PPL')
    plt.plot([1/x for x in val_losses], label='Val PPL')
    plt.title('Perplexity')
    plt.xlabel('Epoch')
    plt.ylabel('PPL')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Ячейка загрузки чекпоинтов
import glob

def load_checkpoint(checkpoint_dir, model, optimizer):
    checkpoints = glob.glob(os.path.join(checkpoint_dir, "epoch_*.pt"))
    if not checkpoints:
        print("No checkpoints found. Starting from scratch.")
        return {
            'start_epoch': 0,
            'best_val_loss': float('inf'),
            'best_epoch': 0,
            'train_losses': [],
            'val_losses': []
        }

    latest_checkpoint = max(checkpoints, key=os.path.getctime)
    print(f"Loading checkpoint: {latest_checkpoint}")
    checkpoint = torch.load(latest_checkpoint)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Загружаем лучшую модель если существует
    best_model_path = os.path.join(checkpoint_dir, "best_model.pt")
    if os.path.exists(best_model_path):
        best_checkpoint = torch.load(best_model_path)
        best_val_loss = best_checkpoint['val_loss']
        best_epoch = best_checkpoint['epoch']
    else:
        best_val_loss = checkpoint['val_loss']
        best_epoch = checkpoint['epoch']

    return {
        'start_epoch': checkpoint['epoch'],
        'best_val_loss': best_val_loss,
        'best_epoch': best_epoch,
        'train_losses': checkpoint.get('train_loss_history', []),
        'val_losses': checkpoint.get('val_loss_history', [])
    }

# Использование:
checkpoint_data = load_checkpoint("checkpoints", model, optimizer)
start_epoch = checkpoint_data['start_epoch']
best_val_loss = checkpoint_data['best_val_loss']
best_epoch = checkpoint_data['best_epoch']
train_losses = checkpoint_data['train_losses']
val_losses = checkpoint_data['val_losses']

No checkpoints found. Starting from scratch.


In [None]:
# 5. ОБУЧЕНИЕ
import torch
import os
import glob
from tqdm import tqdm

checkpoint_dir = "checkpoints"  # ← Вот здесь определяется!
os.makedirs(checkpoint_dir, exist_ok=True)

# Данные для визуализации
train_losses, val_losses = [], []
max_validation_batches = 500  # Ограничение батчей при валидации


for epoch in range(start_epoch, start_epoch + 3):  # Начинаем с start_epoch
    # Фаза обучения
    model.train()
    train_loss = 0
    processed_batches = 0

    train_pbar = tqdm(train_dataloader,
                     desc=f"Epoch {epoch+1}/{start_epoch+3} [Train]",
                     unit="batch",
                     bar_format="{l_bar}{bar:20}{r_bar}")

    for src, trg in train_pbar:
      if processed_batches >= 1000:
          train_pbar.close()
          break

      # Основной тренинг (ДОЛЖЕН ВЫПОЛНЯТЬСЯ ВСЕГДА)
      optimizer.zero_grad()
      output = model(src, trg)
      loss = criterion(
          output.reshape(-1, output.size(-1)),
          trg[:, 1:].contiguous().reshape(-1)
      )
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      processed_batches += 1
      train_pbar.set_postfix({"train_loss": f"{loss.item():.4f}"})

    avg_train_loss = train_loss / processed_batches
    train_losses.append(avg_train_loss)

    # Фаза валидации
    model.eval()
    test_loss = 0
    val_batches_processed = 0

    val_pbar = tqdm(test_dataloader,
                   desc=f"Epoch {epoch+1}/{start_epoch+3} [Val]",
                   unit="batch",
                   bar_format="{l_bar}{bar:20}{r_bar}")

    with torch.no_grad():
        for src, trg in val_pbar:
            if val_batches_processed >= max_validation_batches:
                val_pbar.close()
                break

            output = model(src, trg)
            loss = criterion(
                output.reshape(-1, output.size(-1)),
                trg[:, 1:].contiguous().reshape(-1)
            )
            test_loss += loss.item()
            val_batches_processed += 1
            val_pbar.set_postfix({"val_loss": f"{loss.item():.4f}"})

    avg_test_loss = test_loss / val_batches_processed
    val_losses.append(avg_test_loss)

    print(f"\nEpoch {epoch+1} Results:")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_test_loss:.4f}")
    print("-" * 50)

    # Сохранение чекпоинта
    if (epoch % 3 == 0):
        checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch+1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'val_loss': avg_test_loss,
            'train_loss_history': train_losses,
            'val_loss_history': val_losses,
        }, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

    # Обновление лучшей модели
    if avg_test_loss < best_val_loss:
        best_val_loss = avg_test_loss
        best_epoch = epoch + 1
        best_model_path = os.path.join(checkpoint_dir, "best_model.pt")
        torch.save({
            'epoch': best_epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'val_loss': best_val_loss,
            'train_loss_history': train_losses,
            'val_loss_history': val_losses,
        }, best_model_path)

print("\nTraining completed!")
plot_training_history(train_losses, val_losses)

Epoch 1/3 [Train]:   0%|                    | 1/703125 [00:09<1886:19:37,  9.66s/batch, train_loss=10.8138]

Инференс -- Преобразование результата в слова

In [None]:
# 6. ТЕСТИРОВАНИЕ
def translate(model, sentence, vocab_en, vocab_ru, max_length=32):
    model.eval()

    # Подготовка входных токенов
    tokens = [vocab_en['<sos>']]
    tokens += [vocab_en.get(word.lower(), vocab_en['<unk>']) for word in str(sentence).split()]
    tokens.append(vocab_en['<eos>'])
    tokens = tokens[:max_length]
    tokens += [vocab_en['<pad>']] * (max_length - len(tokens))

    src = torch.LongTensor(tokens).unsqueeze(0)  # [1, seq_len]

    # Энкодинг
    with torch.no_grad():
        src_embedded = model.encoder_embed(src)
        _, (hidden, cell) = model.encoder_lstm(src_embedded)

        # Декодинг
        trg = torch.LongTensor([[vocab_ru['<sos>']]])  # [1, 1]
        output_sentence = []

        for _ in range(max_length):
            trg_embedded = model.decoder_embed(trg)
            output, (hidden, cell) = model.decoder_lstm(trg_embedded, (hidden, cell))
            output = model.fc(output.squeeze(1))
            next_token = output.argmax(-1).item()

            if next_token == vocab_ru['<eos>']:
                break

            output_sentence.append(next_token)
            trg = torch.LongTensor([[next_token]])

    # Преобразование индексов в слова
    ru_vocab_rev = {v: k for k, v in vocab_ru.items()}
    return ' '.join([ru_vocab_rev[idx] for idx in output_sentence])

In [None]:
test_phrases = ["hello world", "how are you", "I love machine learning"]
for phrase in test_phrases:
    print(f"'{phrase}' -> '{translate(model, phrase, vocab_en, vocab_ru)}'")

In [None]:
print("Есть ли 'hello' в vocab_en?", 'hello' in vocab_en)
print("Есть ли 'world' в vocab_en?", 'world' in vocab_en)
print("Первые 20 слов vocab_en:", list(vocab_en.keys())[:20])
print("Первые 20 слов vocab_ru:", list(vocab_ru.keys())[:20])