# Неделя 7: обработка естественного языка

## Классификация отзывов с помощью рекуррентных нейронных сетей

#### В сегодняшнем задании необходимо классифицировать [отзывы](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) с сайта IMDB и сравнить полученный результат со вчерашним (когда классификация проходила с помощью классических алгоритмов). 

Загрузи датасет, задай модель и попробуй улучшить результат, который был получен вчера. Для корректной обработки текста необходимо его представить в виде последовательности индексов, которую нужно пропустить через слой `Embedding`. 

* [документация](https://keras.io/api/layers/core_layers/embedding/) по слою в `keras`
* [документация](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) по слою в `pyTorch`

Если будешь использовать `pyTorch`, то можно применить `torchtext`: относительно новая библиотека для работы с текстом в `pyTorch`-стиле. Например, там есть собственный [токенизатор](https://pytorch.org/text/stable/data_utils.html#get-tokenizer): `get_tokenizer`. 

In [1]:
# импортируй библиотеки 
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder

# natural language toolkit 
import nltk

# regular expression
import re

import string

import tqdm
from tqdm.notebook import tqdm

from collections import Counter
from matplotlib import pyplot as plt

In [2]:
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [3]:
import torch
from torch import nn
from torch import optim

from torchsummary import summary
from torchmetrics import MeanSquaredError

In [4]:
class sentimentLSTM(nn.Module):
    """LSTM-based sentiment classifier.

    Token indices are embedded, run through a stacked LSTM, and every time
    step is passed through dropout, a linear layer and a sigmoid. Only the
    value produced at the last time step is returned for each sample.

    Args:
        vocab_size: Number of entries in the embedding table.
        output_size: Number of units of the final fully connected layer.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability applied between stacked LSTM layers.
    """

    def __init__(self,
                 vocab_size,
                 output_size,
                 embedding_dim,
                 hidden_dim,
                 n_layers,
                 drop_prob=0.5):

        super().__init__()

        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Dropout inside nn.LSTM acts only between stacked layers; with a
        # single layer it does nothing except emit a warning, so switch it off.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)

        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Score a batch of index sequences.

        Args:
            x: Integer tensor of token indices, batch dimension first.
            hidden: ``(h0, c0)`` tuple such as the one returned by
                :meth:`init_hidden`. ``None`` lets the LSTM start from a
                zero state.

        Returns:
            A ``(scores, hidden)`` tuple: ``scores`` holds one sigmoid value
            per sample (taken at the last time step) and ``hidden`` is the
            final LSTM state.
        """
        batch_size = x.size(0)

        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Stack all time steps of all samples so the head sees rows of
        # size hidden_dim.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.fc(self.dropout(lstm_out))
        sig_out = self.sigmoid(out)

        # Regroup the rows per sample and keep only the final value, i.e. the
        # prediction made after the whole sequence has been read.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return a zero-filled ``(h0, c0)`` pair for ``batch_size`` samples.

        The tensors are created with the device and dtype of the model's own
        parameters, so the state always matches the weights regardless of any
        module-level ``device`` variable.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        h0 = reference.new_zeros(shape)
        c0 = reference.new_zeros(shape)
        return (h0, c0)

In [5]:
# Architecture hyper-parameters are spelled out by name so they are easy to
# compare with the ones the saved weights were trained with.
model_loaded = sentimentLSTM(
    vocab_size=161203,
    output_size=1,
    embedding_dim=32,
    hidden_dim=16,
    n_layers=2,
)

In [6]:
# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# machine without CUDA; load_state_dict then copies the values into the
# model's existing parameters.
model_loaded.load_state_dict(torch.load('state_dict.pt', map_location='cpu'))

<All keys matched successfully>

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
# NOTE(review): first, abandoned attempt at single-review inference. Running
# this cell raises the NameError recorded right below it; the working version
# is the later cell that calls `model_loaded` instead of `model`.
# Names used here that no visible cell defines before this point:
#   - `model`            (the network is bound to `model_loaded`)
#   - `inputs`, `labels` (they only occur in the commented-out loader loop)
#   - `for_pred`         (assigned only in a later cell)
test_h1 = model.init_hidden(1)  # fails: `model` is undefined
# print(test_h1)

model_loaded.eval()
# for inputs, labels in test_loader:
    #     print(inputs)
# Take `.data` of each state tensor - presumably to drop any autograd history.
test_h = tuple([each.data for each in test_h1])

# Would also fail: `inputs` and `labels` are never assigned.
inputs, labels = inputs.to(device), labels.to(device)

output, test_h = model(for_pred, test_h)

# test_loss = criterion(output.squeeze(), labels.float())
# test_losses.append(test_loss.item())
# sm = torch.nn.Softmax()

pred = output.squeeze()
print(pred, pred.shape)  

NameError: name 'model' is not defined

In [97]:
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import csv

def _clean_review(text):
    """Lowercase *text*, replace digit runs and ``<...>`` tags with a space
    and delete ASCII punctuation."""
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    return text.translate(str.maketrans('', '', string.punctuation))


def _load_vocab(vocab_path):
    """Read a CSV whose first two columns are (word, index) into a dict.

    Index values are kept as text and converted only on lookup, so a row that
    is never looked up (for example a header) cannot break loading. Rows with
    fewer than two columns are skipped.
    """
    with open(vocab_path, mode='r', newline='') as infile:
        return {row[0]: row[1] for row in csv.reader(infile) if len(row) >= 2}


def _pad_left(sequences, seq_len):
    """Return an int array with one row of length *seq_len* per sequence.

    Shorter sequences are left-padded with zeros; longer ones keep only their
    first *seq_len* items.
    """
    features = np.zeros((len(sequences), seq_len), dtype=int)
    for row, ids in enumerate(sequences):
        kept = ids[:seq_len]
        if kept:  # an empty slice target would not accept an empty list
            features[row, seq_len - len(kept):] = kept
    return features


def LSTMpred(str: str, vocab_path: str = 'vocab.csv', seq_len: int = 50):
    """Convert one raw review into a padded array of vocabulary indices.

    Steps: clean the text, lemmatize every word with WordNet using the
    adjective part of speech, tokenize on runs of word characters, drop
    English stop words, map the remaining words to their indices (words
    missing from the vocabulary are skipped) and pad/truncate to *seq_len*.

    Args:
        str: Review text. The name shadows the builtin and is kept only so
            existing callers keep working.
        vocab_path: CSV file with the word in the first column and its
            integer index in the second.
        seq_len: Length of the returned index sequence.

    Returns:
        Integer ``numpy`` array of shape ``(1, seq_len)``.
    """
    cleaned = _clean_review(str)

    lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join(lemmatizer.lemmatize(word, 'a') for word in cleaned.split())

    # A set makes each stop-word test O(1) instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    tokens = [
        word
        for word in RegexpTokenizer(r'\w+').tokenize(lemmatized)
        if word not in stop_words
    ]

    vocab_to_int = _load_vocab(vocab_path)
    review_int = [int(vocab_to_int[word]) for word in tokens if word in vocab_to_int]

    return _pad_left([review_int], seq_len)
    

In [98]:
# Sample review used to try the preprocessing pipeline on unseen text.
sample_review = "Last week, I watched seasons 1-7 of GoT so those episodes are still very fresh in my mind and I am unaffected by rose tinted nostalgia. I have just watched episode 1 of House of the Dragon. How can I sum it up? So far, so good. The music is sufficiently different to be fresh while remaining recognizable. The production quality and cinematography are as expected - fantastic, epic and awe inspiring, aside from a couple of excessively dark scenes that were over and done with very quickly. There is gore aplenty. Nudity, vomit, sex and depravity. Incidental background humor. A promise of things to come. The casting seems to be on point - I saw no issue with any of the characters, and the leads were well chosen for their roles. The writing seems to be up to par. As an introduction, this episode was written and directed well. I want to see and know more - a good sign. So far I am pleased with this return to the land of Westeros. Valar morghulis!"
for_pred = LSTMpred(sample_review)
for_pred

  0%|          | 0/1 [00:00<?, ?it/s]

array([[  134,  1103,   178,  2209,    91,   560,    45,  1263,   230,
        21148,  2107, 20868,  4166,   178,   279,   218,  2878,  2761,
          128,     4,   105, 10608,   166,  1263,  3183,  5634,   250,
          369,   502,   718,   662,  1454,  4697,  3584,  1013,   256,
        11704,   301,    49,   124,   794,   516, 16499,   908,  6357,
          275, 11222,  7832,   811,   341]])

In [99]:
# Keep the model, its initial state and the input batch on one device;
# mixing CPU and CUDA tensors in a single forward pass raises an error.
model_loaded.to(device)
model_loaded.eval()  # switch dropout off for inference

test_h1 = model_loaded.init_hidden(1)  # zero state for a batch of one review

review_batch = torch.as_tensor(for_pred, dtype=torch.long, device=device)

with torch.no_grad():  # a prediction needs no autograd graph
    output, test_h = model_loaded(review_batch, test_h1)

# One review gives one score; .item() returns it as a Python float and works
# for CPU and CUDA tensors alike.
pred = output.squeeze().item()
# print(pred, pred.shape)  

In [100]:
pred

0.9733338952064514

In [101]:
# Present the score and its complement as percentages with two decimals.
class_shares = {'Positive': pred, 'Negative': 1 - pred}
output = {
    label: f'{format(share*100, ".2f")} %'
    for label, share in class_shares.items()
}
out_stl = pd.DataFrame(output, index=['Probability'])

In [102]:
out_stl

Unnamed: 0,Positive,Negative
Probability,97.33 %,2.67 %


##### Сформируй датафрейм, в котором по строкам будут расположены названия обученных моделей, а по столбцам значение метрики на валидации. 

|model_arch | val_accuracy |val_loss|
|-----------|:------------:|:------:|
|SimpleRNN  | ...          |...|
|LSTM       |     ...      |...|
|BiLSTM     | ...          |...|
|GRU        |  ...         |...|
|...        |...           |...|

## Вопросы


1. За счёт чего (кроме увеличения длины последовательности) можно улучшить модель?

> ответ тут

2. Какую предобработку применять? Нужно ли, например, удалять стоп-слова, как мы делали вчера? А знаки препинания? Объясни свой ответ. 

> ответ тут

3. Какова структура отзыва на фильм? Какая часть рецензии, как правило, выражает отношение пользователя к фильму? Что можно сделать, чтобы увеличить качество модели с учетом твоих предположений?

> ответ тут