In [0]:
import csv
import numpy as np
import sys
from tqdm import tqdm
import pandas as pd
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#import unidecode
import string
import random
import re

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Path of the lenta.csv news dump inside the mounted Google Drive.
filename = "/content/drive/My Drive/nlp/lenta.csv"
# NOTE(review): the two commented-out lines below are unused leftovers
# (reading the file as one lower-cased raw string); consider deleting them.
#raw_text = open(filename).read()
#raw_text = raw_text.lower()

In [0]:
# The CSV is read with ';' as the field separator instead of the default ','.
data = pd.read_csv(filename, sep = ';')

In [0]:
data.head()

Unnamed: 0,url,title,text,topic,tags
0,https://lenta.ru/news/2019/03/06/sofar/,Раскрыта уязвимость «Посейдона»,Перемещающийся со скоростью около 100 узлов (б...,Наука и техника,Оружие
1,https://lenta.ru/news/2019/03/05/sexists_gibdd/,ГИБДД помечтала о дороге без женщин,Госавтоинспекция Уфы помечтала о возможности и...,Интернет и СМИ,Интернет
2,https://lenta.ru/news/2019/03/06/vishinskiy/,Кириллу Вышинскому предъявили обвинения и собр...,Арестованному на Украине руководителю портала ...,Бывший СССР,Украина
3,https://lenta.ru/news/2019/03/06/champions_lea...,«Реал» впервые за 5 лет остался без четвертьфи...,Первыми четвертьфиналистами Лиги Чемпионов ста...,Спорт,Футбол
4,https://lenta.ru/news/2019/03/06/venezuela/,Определено будущее венесуэльских денег в России,Счета венесуэльских компаний в России переведу...,Экономика,


In [0]:
def preprocess_text(document):
    """Normalise a headline: drop punctuation and stray single Latin letters.

    Steps, in order:
      1. every non-word character (punctuation, quotes, whitespace) becomes a space;
      2. a single Latin letter surrounded by whitespace is removed;
      3. a single Latin letter at the very start of the text is replaced by a space;
      4. every run of whitespace collapses to one space.

    The result is not stripped: one leading and/or trailing space may remain
    where punctuation used to be.

    Args:
        document: raw text (str).

    Returns:
        The cleaned text (str).
    """
    document = re.sub(r'\W', ' ', document)
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # This pattern used to be r'\^[a-zA-Z]\s+', i.e. a *literal* caret. No caret
    # can survive step 1, so the substitution never fired; the start-of-string
    # anchor is what removes a leading single letter.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # After this collapse the only non-word characters left are single spaces,
    # so the former trailing r'[^\w]' -> ' ' pass was a no-op and is dropped
    # (as is the meaningless re.I flag on a letter-free pattern).
    document = re.sub(r'\s+', ' ', document)
    return document

In [0]:
# Clean every headline, write the result back into the frame, then keep only
# the title column (a Series) under the name `data` for the following cells.
cleaned_titles = data.title.apply(preprocess_text)
data.title = cleaned_titles
data = data.title
data.head()

0                       Раскрыта уязвимость Посейдона 
1                  ГИБДД помечтала о дороге без женщин
2    Кириллу Вышинскому предъявили обвинения и собр...
3     Реал впервые за 5 лет остался без четвертьфин...
4      Определено будущее венесуэльских денег в России
Name: title, dtype: object

In [0]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import itertools

In [0]:
# Marker placed in the target sequence right after the last character of a title.
SEQ_END = '@'
# Filler character used to pad input/target sequences up to a fixed length;
# evaluate() also stops generating when the model predicts it.
SEQ_TILDA = '~'

In [0]:
# Sequence-length constant used by create_batch() and as the generation budget
# in evaluate().
LEN = 83
# preprocess_text() can leave a single space where a leading/trailing quote or
# punctuation mark used to be. Strip only that whitespace: the previous
# title[1:-1] slice also cut the real first and last letters off every title
# that had no such padding. Titles are capped at LEN characters so that none
# exceeds the fixed sequence length.
titles = [title.strip()[:LEN] for title in data]

In [0]:
# Character inventory: every character that occurs in the titles (plus the
# joining space) and the two special symbols. Each character appears exactly
# once; previously SEQ_END and SEQ_TILDA were put into the set and then appended
# a second time, so len(vocab) over-counted by two and two indices were never
# produced by char_to_int.
# NOTE: this changes both len(vocab) and the index assignment, so a checkpoint
# trained with the old duplicated vocabulary will not fit a model built from
# this one.
vocab = sorted(set(' '.join(titles)) | {SEQ_END, SEQ_TILDA})
char_to_int = {ch: idx for idx, ch in enumerate(vocab)}
int_to_char = {idx: ch for idx, ch in enumerate(vocab)}

In [0]:
class Model(nn.Module):
    """Character-level sequence model: embedding -> two LSTM layers -> linear decoder.

    Args:
        input_size: number of distinct input symbols (rows of the embedding table).
        emb_size: width of the embedding vectors and of both LSTM hidden states.
        output_size: number of scores produced for every time step.
    """

    def __init__(self, input_size, emb_size, output_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        # The embedding width used to be `hidden_size`, a name that is neither a
        # parameter nor defined in this class, and the first LSTM was declared
        # with `input_size` input features although it is fed the embedding
        # output. Both now use emb_size so the layers actually fit together.
        self.emb = nn.Embedding(input_size, emb_size)
        # The attribute names (including the "ltsm" spelling) are kept on purpose:
        # they are the parameter-name prefixes stored in state_dict checkpoints.
        self.ltsm1 = nn.LSTM(emb_size, emb_size, batch_first=True)
        self.ltsm2 = nn.LSTM(emb_size, emb_size, batch_first=True)
        self.decoder = nn.Linear(emb_size, output_size)

    def forward(self, inp):
        """Score the next symbol for every position of every sequence.

        Args:
            inp: integer tensor of symbol indices, shape (batch, seq_len).

        Returns:
            Float tensor of unnormalised scores, shape (batch, seq_len, output_size).
        """
        embedded = self.emb(inp)
        hidden, _ = self.ltsm1(embedded)
        hidden, _ = self.ltsm2(hidden)
        return self.decoder(hidden)

In [0]:
def transform(titles, sequence_len=None):
    """Encode titles as fixed-length (input, next-character target) index lists.

    For a title t the input is t padded with SEQ_TILDA, and the target is t
    shifted left by one character, followed by SEQ_END and the same padding.
    Both are cut to exactly `sequence_len` symbols, so every row has the same
    length even for empty or over-long titles (previously such titles produced
    rows of differing lengths).

    Args:
        titles: iterable of strings whose characters are all keys of char_to_int.
        sequence_len: length of every encoded row. Defaults to the module-level
            LEN; the old code read an undefined global `sequence_len` here.

    Returns:
        (X, Y): two lists of lists of int indices, one row per title.

    Raises:
        KeyError: if a title contains a character missing from char_to_int.
    """
    if sequence_len is None:
        sequence_len = LEN

    X, Y = [], []
    for title in titles:
        # A negative repeat count yields '', so over-long titles get no padding.
        padding = SEQ_TILDA * (sequence_len - len(title))
        inputs = (title + padding)[:sequence_len]
        targets = (title[1:] + SEQ_END + padding)[:sequence_len]
        X.append([char_to_int[ch] for ch in inputs])
        Y.append([char_to_int[ch] for ch in targets])
    return X, Y

In [0]:
def create_batch(indxs, X, Y, batch_size):
    """Slice mini-batch number `indxs` out of X (and Y) and return it as LongTensors.

    Args:
        indxs: zero-based batch index; rows
            [indxs * batch_size, (indxs + 1) * batch_size) are taken.
        X: sequence of equal-length rows of int symbol indices (model inputs).
        Y: targets laid out like X, or None when only inputs are needed.
        batch_size: maximum number of rows per batch.

    Returns:
        (x, y): x is an int64 tensor of shape (rows, seq_len); y is the matching
        target tensor, or None when Y is None. The last batch may hold fewer
        than batch_size rows. X and Y are never modified.
    """
    start = batch_size * indxs
    stop = start + batch_size
    # The former "pad a short last batch" loop ran over range(<negative>) and so
    # never executed; it is removed instead of revived because its pad row was a
    # one-element list (it could not be stacked with real rows) and it appended
    # to the caller's lists.
    x = torch.tensor(X[start:stop], dtype=torch.long)
    y = None if Y is None else torch.tensor(Y[start:stop], dtype=torch.long)
    return x, y

In [0]:
# Encode all titles: X holds the padded input index rows, Y the shifted targets.
X, Y = transform(titles)

In [0]:
# File on the mounted Drive that holds the model's state_dict checkpoint.
model_path = '/content/drive/My Drive/nlp/model.torch'

In [0]:
# Training hyper-parameters.
batch_size = 10
embedding_size = 128
epoch_n = 5

# Input and output vocabularies are the same character set.
net = Model(len(vocab), embedding_size, len(vocab))

# Resume from an earlier checkpoint when possible. Loading stays best-effort
# (training simply starts from scratch on failure), but the reason is now
# reported instead of being swallowed by a bare `except: pass`, which also
# trapped KeyboardInterrupt.
try:
    net.load_state_dict(torch.load(model_path))
except FileNotFoundError:
    print('No checkpoint found at {!r}; training from scratch.'.format(model_path))
except Exception as err:
    print('Could not load checkpoint {!r}, training from scratch: {}'.format(model_path, err))

opt = torch.optim.Adam(net.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [0]:
# Only full batches are used; a trailing partial batch is skipped.
n_batches = len(X) // batch_size

for epoch in range(epoch_n):
    print('Epoch#', epoch)
    train_loss = 0.
    net.train(True)
    for i in tqdm(range(n_batches)):
        batch, y = create_batch(i, X, Y, batch_size)
        # Call the module rather than .forward() so registered hooks run.
        # (The per-batch argmax `pred` that used to be computed here was never
        # read and has been removed.)
        out = net(batch)

        # CrossEntropyLoss wants the class dimension second:
        # (batch, seq, classes) -> (batch, classes, seq).
        loss = criterion(out.transpose(2, 1), y)

        # Clear gradients before backward so nothing left over from an
        # interrupted earlier run leaks into this step.
        opt.zero_grad()
        loss.backward()
        opt.step()

        train_loss += loss.item()

        if i % 100 == 0:
            print(i, loss.item())

        # Frequent checkpoints so an interrupted session loses little work.
        if i % 10 == 0:
            torch.save(net.state_dict(), model_path)

    # Persist the final steps of the epoch as well.
    torch.save(net.state_dict(), model_path)
    # max(..., 1) avoids a ZeroDivisionError when there is no full batch.
    train_loss /= max(n_batches, 1)

    print('\nEpoch: {}, train loss: {}'.format(epoch, train_loss))


In [0]:
def evaluate(word):
    """Greedily extend a seed text one character at a time with the trained model.

    Args:
        word: a list whose first element is the seed string, e.g. ['Полиция'].
            Every seed character must be a key of char_to_int.

    Returns:
        A one-element list holding the seed followed by the generated characters.
        Generation stops after LEN - 1 characters, or as soon as the model
        predicts the padding symbol SEQ_TILDA (which is printed, not appended).

    Raises:
        ValueError: if `word` is empty or its seed string is empty.
        KeyError: if the seed contains a character missing from char_to_int.
    """
    if not word or not word[0]:
        raise ValueError('evaluate() needs a non-empty seed string, e.g. evaluate(["text"])')

    net.eval()
    text = word[0]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(1, LEN):
            # Build the single-row batch directly; going through create_batch
            # with Y=None made it subscript None.
            batch = torch.tensor([[char_to_int[ch] for ch in text]], dtype=torch.long)
            out = net(batch)
            # Most likely symbol after the last position of the only row.
            next_idx = out[0, -1].argmax().item()
            # Was `ind_to_char`, a name that does not exist (NameError).
            char = int_to_char[next_idx]
            # Compare by value; `is` only worked through string interning.
            if char == SEQ_TILDA:
                print(SEQ_TILDA)
                break
            text += char
    return [text]

In [171]:
evaluate(['Полиция'])

['Полиция пострадала на видео с полицейского по стрельбе с полицейского по стрельбе с полиц']