In [2]:
pip install pymorphy3

/home/sypoo/mtuci/ml/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import nltk
import pymorphy3
import pandas as pd

from tqdm import tqdm
from nltk.stem.snowball import RussianStemmer
from sklearn.model_selection import train_test_split

import torch

from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

In [4]:
tqdm.pandas()
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sypoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
train_dataframe = pd.read_csv("./train.csv").iloc[:2000]
valid_dataframe = pd.read_csv("./test.csv").iloc[:400]

In [6]:
# train_dataframe

In [7]:
def preproc(text):
  text = text.lower()
  text = re.sub(r"([^\w\s]\s)", r" \1", text)
  text = re.sub(r"(\d)\.", r"\1 . ", text)
  text = re.sub(r"\s+", r" ", text)
  text = text.strip()
  text = text.split(" ")

  morph = pymorphy3.MorphAnalyzer()
  text = [morph.parse(word)[0].normal_form for word in text]

  # stemmer = RussianStemmer()
  # stemmed_words = [stemmer.stem(word) for word in text]

  return text

def slicingWindow(sequence):
  pairs = []
  for index, word in enumerate(sequence):
    for jndex in [-2, -1, 1, 2]:
      if 0 <= index+jndex < len(sequence):
        pairs.append((sequence[index], sequence[index+jndex]))
  return pairs

In [8]:
preproc("Волшебные фото Виктория Поплавская Евгения. 2.3.1")

['волшебный',
 'фото',
 'виктория',
 'поплавский',
 'евгений',
 '.',
 '2',
 '.',
 '3',
 '.',
 '1']

In [9]:
text = "   Волшебные фото Виктория Поплавская Евгения.   2.3.1.  "
sequence_of_words = preproc(text)
slicingWindow(sequence_of_words)

[('волшебный', 'фото'),
 ('волшебный', 'виктория'),
 ('фото', 'волшебный'),
 ('фото', 'виктория'),
 ('фото', 'поплавский'),
 ('виктория', 'волшебный'),
 ('виктория', 'фото'),
 ('виктория', 'поплавский'),
 ('виктория', 'евгений'),
 ('поплавский', 'фото'),
 ('поплавский', 'виктория'),
 ('поплавский', 'евгений'),
 ('поплавский', '.'),
 ('евгений', 'виктория'),
 ('евгений', 'поплавский'),
 ('евгений', '.'),
 ('евгений', '2'),
 ('.', 'поплавский'),
 ('.', 'евгений'),
 ('.', '2'),
 ('.', '.'),
 ('2', 'евгений'),
 ('2', '.'),
 ('2', '.'),
 ('2', '3'),
 ('.', '.'),
 ('.', '2'),
 ('.', '3'),
 ('.', '.'),
 ('3', '2'),
 ('3', '.'),
 ('3', '.'),
 ('3', '1'),
 ('.', '.'),
 ('.', '3'),
 ('.', '1'),
 ('.', '.'),
 ('1', '3'),
 ('1', '.'),
 ('1', '.'),
 ('.', '.'),
 ('.', '1')]

In [10]:
train_pairs = train_dataframe["text"].progress_apply(preproc).progress_apply(slicingWindow).explode().tolist()
valid_pairs = valid_dataframe["text"].progress_apply(preproc).progress_apply(slicingWindow).explode().tolist()

100%|██████████| 2000/2000 [01:44<00:00, 19.13it/s]
100%|██████████| 2000/2000 [00:00<00:00, 8252.33it/s]
100%|██████████| 400/400 [00:21<00:00, 18.86it/s]
100%|██████████| 400/400 [00:00<00:00, 20180.20it/s]


In [11]:
train_pairs[:10]

[('волшебный', 'фото'),
 ('волшебный', 'виктория'),
 ('фото', 'волшебный'),
 ('фото', 'виктория'),
 ('фото', 'поплавский'),
 ('виктория', 'волшебный'),
 ('виктория', 'фото'),
 ('виктория', 'поплавский'),
 ('виктория', 'евгениямедведев'),
 ('поплавский', 'фото')]

In [12]:
class MyDataset(torch.utils.data.Dataset):
  def __init__(self, pairs):
    self.pairs = pairs

  def __len__(self):
    return len(self.pairs)

  def __getitem__(self, index):
    return self.pairs[index][0], self.pairs[index][1]

In [13]:
batch_size = 16
train_dataset = MyDataset(train_pairs)
valid_dataset = MyDataset(valid_pairs)

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [15]:
class Tokenizer:
  def __init__(self, set_of_words):
    self.__word2index = {}
    self.__index2word = {}
    for index, value in enumerate(set_of_words):
      self.__word2index[value] = index
      self.__index2word[index] = value

  def __call__(self, sequence):
    tokens = []
    for word in sequence:
      try:
        tokens.append(self.__word2index[word.lower()])
      except:
        raise ValueError("Такого слова нет в токенизаторе")
    return torch.tensor(tokens)

  def decode(self, sequence):
    words = []
    for index in sequence:
      try:
        words.append(self.__index2word[index])
      except:
        raise ValueError("Такого токена нет в токенизаторе")
    return words

  def __len__(self):
    return len(self.__word2index)


In [16]:
tokenizer_dataframe = pd.concat([train_dataframe["text"], valid_dataframe["text"]])

In [17]:
tokenizer = Tokenizer(set(tokenizer_dataframe.progress_apply(preproc).explode().tolist()))

100%|██████████| 2400/2400 [02:08<00:00, 18.64it/s]


In [18]:
tokenizer(preproc("Привет меня зовут Руслан"))

tensor([15968,  4878, 11257, 12421])

In [19]:
len(tokenizer)

18728

In [20]:
class word2vec(nn.Module):
  def __init__(self, num_dict, hidden):
    super().__init__()
    self.encode = nn.Embedding(num_dict, hidden)
    self.decode = nn.Linear(hidden, num_dict)

  def forward(self, x):
    out = self.encode(x)
    out = self.decode(out)
    return out

In [21]:
num_dict = 18728
hidden = 300

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = word2vec(num_dict, hidden).to(device)
optim = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = torch.nn.CrossEntropyLoss()
epochs = range(10)

In [23]:
device

device(type='cpu')

In [24]:
train_loss_list = []
valid_loss_list = []
train_acc_list = []
valid_acc_list = []

In [25]:
for epoch in epochs:
  print(f"Epochs {epoch}")

  train_loss_mean = 0
  valid_loss_mean = 0
  train_acc_mean = 0
  valid_acc_mean = 0

  for data, target in (pbar := tqdm(train_dataloader)):
    optim.zero_grad()

    data = tokenizer(data).to(device)
    target = tokenizer(target).to(device)

    out = model(data)

    train_acc = float((out.argmax(dim=1) == target).sum()/batch_size)
    train_acc_mean += train_acc

    target = F.one_hot(target, num_classes=num_dict).float()
    loss = loss_fn(out, target)
    loss.backward()
    train_loss_mean += loss.item()

    optim.step()

    pbar.set_postfix({"Loss": loss.item(), "Accuracy": train_acc}, refresh=True)

  for data, target in (pbar := tqdm(valid_dataloader)):
    with torch.no_grad():
      data = tokenizer(data).to(device)
      target = tokenizer(target).to(device)

      out = model(data)
      valid_acc = float((out.argmax(dim=1) == target).sum()/batch_size)
      valid_acc_mean += valid_acc

      target = F.one_hot(target, num_classes=num_dict).float()
      loss = loss_fn(out, target)
      valid_loss_mean += loss.item()

      pbar.set_postfix({"Loss": loss.item(), "Accuracy": valid_acc}, refresh=True)

  train_loss_list.append(train_loss_mean/len(train_dataloader))
  valid_loss_list.append(valid_loss_mean/len(valid_dataloader))
  train_acc_list.append(train_acc_mean/len(train_dataloader))
  valid_acc_list.append(valid_acc_mean/len(valid_dataloader))

  print(f"Train_loss: {train_loss_mean/len(train_dataloader)}")
  print(f"Valid_loss: {valid_loss_mean/len(valid_dataloader)}")
  print(f"Train_acc: {train_acc_mean/len(train_dataloader)}")
  print(f"Valid_acc: {valid_acc_mean/len(valid_dataloader)}")

Epochs 0


 14%|█▍        | 4460/31061 [06:16<37:26, 11.84it/s, Loss=8.82, Accuracy=0]     


KeyboardInterrupt: 