In [42]:
from __future__ import unicode_literals, print_function, division

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from io import open
import unicodedata
import re
import random

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, Dataset

# Run on the GPU when CUDA reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [43]:
# Load the CSV into a DataFrame; the 'text' and 'title' columns are read
# further down to build the vocabularies.
dataset = pd.read_csv(filepath_or_buffer='data/data_tokenize.csv')

In [44]:
# Indices of the special sequence markers. They must agree with the table
# hard-coded in Vocab.__init__ ("<UNK>" -> 0, "SOS" -> 1, "EOS" -> 2);
# the previous values (0 and 1) aliased SOS with "<UNK>" and EOS with "SOS".
SOS_token = 1
EOS_token = 2

In [45]:
MAX_VOCAB_SIZE = 100_000  # Default cap on vocabulary size, special tokens included


class Vocab:
    """Word <-> index vocabulary built incrementally from whitespace-split text.

    Indices 0, 1 and 2 are reserved for the special tokens "<UNK>", "SOS" and
    "EOS". New words are registered in first-seen order until ``max_size``
    entries exist; after that, every word that is not already registered is
    only tallied under "<UNK>". Note that the cap therefore keeps the *first*
    words encountered, not the most frequent ones.

    Attributes are plain dicts/ints and are meant to be read directly:
    ``word2index``, ``index2word``, ``word2count`` and ``n_words``.
    """

    def __init__(self, name, max_size=MAX_VOCAB_SIZE):
        """Create a vocabulary holding only the three special tokens.

        Args:
            name: Label used in the string representation.
            max_size: Maximum number of entries (special tokens included).
                Defaults to ``MAX_VOCAB_SIZE``.
        """
        self.name = name
        self.max_size = max_size
        self.word2index = {"<UNK>": 0, "SOS": 1, "EOS": 2}
        self.word2count = {"<UNK>": 0}
        self.index2word = {0: "<UNK>", 1: "SOS", 2: "EOS"}
        self.n_words = 3

    def addText(self, text: str):
        """Register every whitespace-separated word of ``text``."""
        for word in text.split():
            self.addWord(word)

    def addWord(self, word):
        """Count one occurrence of ``word``, registering it if there is room.

        A known word has its counter incremented. An unknown word gets the
        next free index while the vocabulary is below ``max_size``; once the
        vocabulary is full, the occurrence is added to the "<UNK>" counter.
        """
        if word in self.word2index:
            # .get(): "SOS"/"EOS" are indexed but start without a count entry.
            self.word2count[word] = self.word2count.get(word, 0) + 1
        elif self.n_words < self.max_size:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            # Vocabulary is full: out-of-vocabulary words are folded into <UNK>.
            self.word2count["<UNK>"] += 1

    def __str__(self):
        """Return a short summary such as ``Vocab(name='text', n_words=42)``."""
        return f"Vocab(name='{self.name}', n_words={self.n_words})"

In [46]:
# Two independent vocabularies, labelled "text" and "title".
text_vocab = Vocab("text")
title_vocab = Vocab("title")

In [47]:
# Feed each dataset column into its vocabulary: first every 'text' row,
# then every 'title' row.
for column, vocab in (('text', text_vocab), ('title', title_vocab)):
    for entry in dataset[column]:
        vocab.addText(entry)

In [48]:
# Show the summary string of both vocabularies.
str(text_vocab), str(title_vocab)

("Vocab(name='text', n_words=100000, ", "Vocab(name='title', n_words=19534, ")

In [49]:
class EncoderLSTM(nn.Module):
    """Embedding -> dropout -> single-layer LSTM encoder.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the embedding vectors and the LSTM state.
        dropout_p: Dropout probability applied to the embedded input.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the LSTM input width, so the embedded
        # tokens can be fed to the LSTM without a projection in between.
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=hidden_size,
        )
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of index sequences.

        Args:
            input: Integer index tensor; the LSTM is batch-first, so the
                leading dimension is the batch.

        Returns:
            ``(output, (hidden, cell))`` exactly as produced by ``nn.LSTM``:
            the per-step outputs and the final hidden and cell states.
        """
        token_vectors = self.embedding(input)
        token_vectors = self.dropout(token_vectors)
        outputs, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return outputs, (hidden, cell)