In [1]:
import re, string, nltk, torch

from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [3]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

* токенизация

In [4]:
# применяем токенизацию
tokens = nltk.tokenize.word_tokenize(text)

# выводим результат
print(tokens)

['Select', 'your', 'preferences', 'and', 'run', 'the', 'install', 'command', '.', 'Stable', 'represents', 'the', 'most', 'currently', 'tested', 'and', 'supported', 'version', 'of', 'PyTorch', '.', 'Note', 'that', 'LibTorch', 'is', 'only', 'available', 'for', 'C++']


* стемминг / лемматизация

In [5]:
stemmer = SnowballStemmer("english")
words = ["playing", "played", "plays", "players", "collections"]
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)

['play', 'play', 'play', 'player', 'collect']


In [6]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

# пример лемматизации предложения
sentence = "Сегодня я сумел сделать одно из самых важных дел всей моей жизни — найти ту самую, именно ту, да..."
lemmas = []

for word in sentence.split():
    parsed_word = morph.parse(word)[0]
    lemmas.append(parsed_word.normal_form)
lemmatized_sentence = ' '.join(lemmas)

print(lemmatized_sentence)

сегодня я суметь сделать один из самый важный дело весь мой жизнь — найти тот самую, именно ту, да...


* удаление стоп-слов

In [7]:
stop_words = set(stopwords.words('english'))
print(stop_words, end="\n\n")

text = "This is an example sentence to demonstrate stop word removal."
tokens = text.split()

filtered_tokens = [token for token in tokens if not token in stop_words]
filtered_text = " ".join(filtered_tokens)

print(filtered_text)

{'hadn', 'from', 'over', 'needn', 'her', 'hasn', 'again', 'had', 'myself', 'but', 'whom', 'was', 'do', 'this', 'have', 'mightn', 'why', "hadn't", 'no', 'more', 'up', 'me', 'very', 'has', 'mustn', 'once', "you'd", 'himself', 're', 'his', 'few', 'because', 'on', 'to', 'were', "shan't", 'such', "hasn't", 'below', "haven't", "didn't", 'now', "that'll", 'ourselves', 'into', 'there', 'about', 'doesn', 'isn', "you'll", "don't", 'didn', 'other', "wasn't", 'herself', 'most', 'with', 'my', "mustn't", 'each', 'same', 'under', 'shouldn', 't', "you're", 'their', 'or', 'then', 'own', 'while', 'did', 'in', 'by', "doesn't", 'been', 'our', 'yours', 'for', 'doing', 'against', 'don', 'm', "needn't", 'having', 'which', 'she', 'who', 'some', 'until', 'above', 'its', 'just', "wouldn't", 'further', 'at', 'o', 'am', 'we', 'him', 'being', "mightn't", 'weren', 'an', 'before', 'off', 'than', 'won', 'is', 'both', "aren't", 'any', 'shan', 'can', "shouldn't", 'and', 'between', 'not', 'theirs', 'of', 'out', 'be', 'd

In [8]:
stop_words = set(stopwords.words('russian'))
stop_words

{'а',
 'без',
 'более',
 'больше',
 'будет',
 'будто',
 'бы',
 'был',
 'была',
 'были',
 'было',
 'быть',
 'в',
 'вам',
 'вас',
 'вдруг',
 'ведь',
 'во',
 'вот',
 'впрочем',
 'все',
 'всегда',
 'всего',
 'всех',
 'всю',
 'вы',
 'где',
 'да',
 'даже',
 'два',
 'для',
 'до',
 'другой',
 'его',
 'ее',
 'ей',
 'ему',
 'если',
 'есть',
 'еще',
 'ж',
 'же',
 'за',
 'зачем',
 'здесь',
 'и',
 'из',
 'или',
 'им',
 'иногда',
 'их',
 'к',
 'как',
 'какая',
 'какой',
 'когда',
 'конечно',
 'кто',
 'куда',
 'ли',
 'лучше',
 'между',
 'меня',
 'мне',
 'много',
 'может',
 'можно',
 'мой',
 'моя',
 'мы',
 'на',
 'над',
 'надо',
 'наконец',
 'нас',
 'не',
 'него',
 'нее',
 'ней',
 'нельзя',
 'нет',
 'ни',
 'нибудь',
 'никогда',
 'ним',
 'них',
 'ничего',
 'но',
 'ну',
 'о',
 'об',
 'один',
 'он',
 'она',
 'они',
 'опять',
 'от',
 'перед',
 'по',
 'под',
 'после',
 'потом',
 'потому',
 'почти',
 'при',
 'про',
 'раз',
 'разве',
 'с',
 'сам',
 'свою',
 'себе',
 'себя',
 'сейчас',
 'со',
 'совсем',
 'так

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [9]:
def preprocess_text(text: str) -> str:
    return ''.join([' ' if not char.isalpha() and char not in ['?', '!', '.', ','] else char for char in text.lower()])

preprocess_text(text)

'this is an example sentence to demonstrate stop word removal.'

In [10]:
preprocess_text('Начинается %новое; % **приключение** совсем скоро &&SAP&&')

'начинается  новое      приключение   совсем скоро   sap  '

1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [11]:
processed_item = preprocess_text(text)
processed_item

'this is an example sentence to demonstrate stop word removal.'

In [12]:
first_sentence = sent_tokenize(text)[0]

# Создаем словарь слов и присваиваем каждому уникальный индекс
dictionary = {word: i for i, word in enumerate(set(text.split()))}
print(dictionary)

sentence_t = torch.zeros(len(dictionary))

print(first_sentence)

for word in first_sentence.split():
    print(word)
    if word in dictionary:
        print(dictionary[word])
        sentence_t[dictionary[word]] = 1

print(sentence_t)

{'sentence': 0, 'word': 1, 'to': 2, 'This': 3, 'stop': 4, 'example': 5, 'an': 6, 'is': 7, 'removal.': 8, 'demonstrate': 9}
This is an example sentence to demonstrate stop word removal.
This
3
is
7
an
6
example
5
sentence
0
to
2
demonstrate
9
stop
4
word
1
removal.
8
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`.
2.2 Закодировать национальности числами, начиная с 0.

In [27]:
dataframe = pd.read_csv("./data/surnames.csv")
dataframe['nationality'], _ = pd.factorize(dataframe['nationality'])
dataframe

Unnamed: 0,surname,nationality
0,Woodford,0
1,Coté,1
2,Kore,0
3,Koury,2
4,Lebzak,3
...,...,...
10975,Quraishi,2
10976,Innalls,0
10977,Król,12
10978,Purvis,0


In [28]:
X = dataframe['surname'].str.lower()
y = dataframe['nationality']
n_classes = y.nunique()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

2.4 Реализовать класс `Vocab` (токен = __символ__)

In [29]:
class Vocab:
    def __init__(self, data):
        tokens = set()
        for item in data:
          tokens.update(item)
        self.idx_to_token = dict(enumerate(tokens))
        self.token_to_idx = {token: idx for idx, token in self.idx_to_token.items()}
        self.vocab_len = len(self.idx_to_token)

vocab = Vocab(dataframe)
vocab

<__main__.Vocab at 0x2bb586cf520>

In [30]:
surnames = dataframe["surname"].str.lower()
surnames

0        woodford
1            coté
2            kore
3           koury
4          lebzak
           ...   
10975    quraishi
10976     innalls
10977        król
10978      purvis
10979    messerli
Name: surname, Length: 10980, dtype: object

In [31]:
vocab = Vocab(surnames)
print(vocab.idx_to_token, vocab.token_to_idx, sep="\n\n")

{0: 'p', 1: 'w', 2: 'ä', 3: 'm', 4: 'ú', 5: 'à', 6: 'ż', 7: 'ç', 8: 'y', 9: 'ù', 10: 'd', 11: 'o', 12: 'c', 13: 'j', 14: 'í', 15: '1', 16: 'i', 17: 'q', 18: 's', 19: 'ß', 20: 'v', 21: 'u', 22: 'õ', 23: 'è', 24: '-', 25: 'ś', 26: 'ń', 27: 'f', 28: 'ö', 29: 'ą', 30: 'a', 31: 'z', 32: 'á', 33: 'k', 34: 'r', 35: ':', 36: 't', 37: 'x', 38: 'ì', 39: '/', 40: 'b', 41: 'ã', 42: 'g', 43: 'h', 44: 'ñ', 45: 'ò', 46: 'ł', 47: 'é', 48: 'e', 49: 'ó', 50: 'ü', 51: 'n', 52: "'", 53: 'l', 54: 'ê'}

{'p': 0, 'w': 1, 'ä': 2, 'm': 3, 'ú': 4, 'à': 5, 'ż': 6, 'ç': 7, 'y': 8, 'ù': 9, 'd': 10, 'o': 11, 'c': 12, 'j': 13, 'í': 14, '1': 15, 'i': 16, 'q': 17, 's': 18, 'ß': 19, 'v': 20, 'u': 21, 'õ': 22, 'è': 23, '-': 24, 'ś': 25, 'ń': 26, 'f': 27, 'ö': 28, 'ą': 29, 'a': 30, 'z': 31, 'á': 32, 'k': 33, 'r': 34, ':': 35, 't': 36, 'x': 37, 'ì': 38, '/': 39, 'b': 40, 'ã': 41, 'g': 42, 'h': 43, 'ñ': 44, 'ò': 45, 'ł': 46, 'é': 47, 'e': 48, 'ó': 49, 'ü': 50, 'n': 51, "'": 52, 'l': 53, 'ê': 54}


In [32]:
print(vocab.vocab_len)
print(len(surnames))

55
10980


2.5 Реализовать класс `SurnamesDataset`

In [55]:
class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, surname):
    '''Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)'''
    surname_t = torch.zeros(self.vocab.vocab_len)
    for token in surname:
      if surname_t[self.vocab.token_to_idx[token]] == 1:
        surname_t[self.vocab.token_to_idx[token]] += 1
      else:
        surname_t[self.vocab.token_to_idx[token]] = 1
    return surname_t

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.vectorize(self.X.iloc[idx]), self.y.iloc[idx]

2.3 Разбить датасет на обучающую и тестовую выборку

In [56]:
train_dataset = SurnamesDataset(X_train, y_train, vocab)
test_dataset = SurnamesDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

2.6. Обучить классификатор.
2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [57]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs):
    model.to(device)
    train_losses, test_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss, test_loss = 0, 0
        for inputs, labels in train_dataloader:
            x = inputs.to(device)
            y = labels.to(device)

            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # Валидация на val_loader
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()

        train_losses.append(train_loss/len(train_dataloader))
        test_losses.append(test_loss/len(test_dataloader))

        print(f'Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}')

In [58]:
def evaluate_model(model, dataloader):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for surnames, labels in dataloader:

            x = surnames.to(device)
            y = labels.to(device)

            logits = model(x)
            _, predicted = torch.max(logits, 1)

            correct += (predicted == y).sum().item()
            total += y.size(0)

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.5f}')

In [59]:
def predict(model, dataset, surname):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(surname)
        print(vectorized)
        tensor = vectorized.unsqueeze(0).to(device)
        logits = model(tensor)

        probs = torch.softmax(logits, dim=1).squeeze()

        top3_probs, top3_indices = torch.topk(probs, k=3)

        top3_nationalities = _[top3_indices.detach().cpu().numpy()]
        print(f'{surname}: {top3_nationalities.values[0]} ({top3_probs[0].item():.4f}), {top3_nationalities.values[1]} ({top3_probs[1].item():.4f}), {top3_nationalities.values[2]} ({top3_probs[2].item():.4f})')

In [60]:
print(vocab.vocab_len)

55


In [61]:
model = nn.Sequential(
    nn.Linear(vocab.vocab_len, 300),
    nn.ReLU(),
    nn.Linear(300, len(set(y_train)))
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

evaluate_model(model, test_dataloader)

Test Accuracy: 0.08607


In [62]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 22218


In [69]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 20)

Epoch 1, Train Loss: 1.1282, Test Loss: 1.1965
Epoch 2, Train Loss: 1.1144, Test Loss: 1.2103
Epoch 3, Train Loss: 1.1007, Test Loss: 1.2011
Epoch 4, Train Loss: 1.0910, Test Loss: 1.1727
Epoch 5, Train Loss: 1.0785, Test Loss: 1.1913
Epoch 6, Train Loss: 1.0692, Test Loss: 1.1388
Epoch 7, Train Loss: 1.0590, Test Loss: 1.1575
Epoch 8, Train Loss: 1.0473, Test Loss: 1.1434
Epoch 9, Train Loss: 1.0393, Test Loss: 1.1703
Epoch 10, Train Loss: 1.0278, Test Loss: 1.1347
Epoch 11, Train Loss: 1.0210, Test Loss: 1.1274
Epoch 12, Train Loss: 1.0143, Test Loss: 1.1391
Epoch 13, Train Loss: 1.0048, Test Loss: 1.1413
Epoch 14, Train Loss: 0.9976, Test Loss: 1.1587
Epoch 15, Train Loss: 0.9884, Test Loss: 1.1193
Epoch 16, Train Loss: 0.9838, Test Loss: 1.1161
Epoch 17, Train Loss: 0.9760, Test Loss: 1.1453
Epoch 18, Train Loss: 0.9688, Test Loss: 1.1079
Epoch 19, Train Loss: 0.9612, Test Loss: 1.1227
Epoch 20, Train Loss: 0.9567, Test Loss: 1.1050


In [70]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.67532


In [65]:
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)
X_batch, y_batch = next(iter(test_loader))
predictions = model(X_batch).argmax(dim=1).cpu().detach()
print(classification_report(y_batch, predictions))

              precision    recall  f1-score   support

           0       0.57      0.81      0.67       567
           1       0.36      0.11      0.17        36
           2       0.72      0.93      0.81       346
           3       0.71      0.78      0.75       482
           4       0.75      0.55      0.63       161
           5       0.39      0.61      0.47        36
           6       0.52      0.43      0.47       108
           7       0.22      0.06      0.10        81
           8       0.73      0.20      0.31        41
           9       0.61      0.24      0.34       118
          10       0.71      0.38      0.49        32
          11       0.67      0.21      0.32        57
          12       0.75      0.36      0.49        25
          13       0.75      0.12      0.21        49
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        15
          16       0.00      0.00      0.00        14
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
predict(model, train_dataset, "fox")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])
fox: English (0.3876), Chinese (0.2666), French (0.0572)


In [67]:
predict(model, train_dataset, "balinyan")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 1.,
        0.])
balinyan: Russian (0.8657), English (0.0593), Irish (0.0340)


In [68]:
predict(model, train_dataset, "valyaev")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0.])
valyaev: Russian (0.8590), English (0.0774), Czech (0.0384)


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [214]:
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = ''.join([' ' if not char.isalpha() and char not in ['.', ',', '!', '?', "'"] else char for char in text])

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

In [215]:
raw_train = pd.read_csv("data/yelp/raw_train.csv", names=["rating", "review"])
raw_train

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


In [148]:
raw_train[21691:21693]

Unnamed: 0,rating,review
21691,1,Horrible horrible horrible! Worst nail place E...
21692,1,I went in her for the first time today for a g...


In [149]:
# выбор 10% случайных строк
raw_train_10 = raw_train.sample(frac=0.005)
raw_train_10["review"] = raw_train_10["review"].apply(lambda x: preprocess_text(x))
raw_train_10

Unnamed: 0,rating,review
539138,2,great view and wonderful patio . service wa te...
318337,1,my husband and i stayed here during the memori...
437652,2,my husband and i were catching a show at the s...
130931,2,what 's really awesome with chase field ? ? ? ...
233880,1,they need to get rid of justin ... just plain ...
...,...,...
118665,1,my wife and i are sushi buff . after reading s...
259466,1,hate this airport the limited food option are ...
314501,2,my family stay at the holiday inn all the time...
242185,2,"fun place to go get drink , dance and enjoy th..."


In [150]:
raw_train_10['rating'], rating_labels = pd.factorize(raw_train_10['rating'])
raw_train_10

Unnamed: 0,rating,review
539138,0,great view and wonderful patio . service wa te...
318337,1,my husband and i stayed here during the memori...
437652,0,my husband and i were catching a show at the s...
130931,0,what 's really awesome with chase field ? ? ? ...
233880,1,they need to get rid of justin ... just plain ...
...,...,...
118665,1,my wife and i are sushi buff . after reading s...
259466,1,hate this airport the limited food option are ...
314501,0,my family stay at the holiday inn all the time...
242185,0,"fun place to go get drink , dance and enjoy th..."


In [151]:
rating_labels

Int64Index([2, 1], dtype='int64')

In [152]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = {}
    self.token_to_idx = {}
    self.vocab_len = 0

    # Получаем список всех слов в данных
    all_words = [word for sentence in data["review"] for word in word_tokenize(sentence)]

    # Строим словарь
    for word in all_words:
        if word not in self.token_to_idx:
            self.idx_to_token[self.vocab_len] = word
            self.token_to_idx[word] = self.vocab_len
            self.vocab_len += 1

vocab = Vocab(raw_train_10)
vocab.vocab_len

15925

In [153]:
vocab.idx_to_token

{0: 'great',
 1: 'view',
 2: 'and',
 3: 'wonderful',
 4: 'patio',
 5: '.',
 6: 'service',
 7: 'wa',
 8: 'terrible',
 9: ',',
 10: 'waitress',
 11: 'did',
 12: 'not',
 13: 'write',
 14: 'anything',
 15: 'down',
 16: 'we',
 17: 'had',
 18: 'a',
 19: 'table',
 20: 'of',
 21: 'u',
 22: 'she',
 23: 'screwed',
 24: 'everything',
 25: 'up',
 26: 'would',
 27: 'only',
 28: 'bring',
 29: 'drink',
 30: 'at',
 31: 'time',
 32: 'ordered',
 33: 'food',
 34: 'it',
 35: 'came',
 36: 'out',
 37: 'forgot',
 38: 'to',
 39: 'put',
 40: 'order',
 41: 'in',
 42: 'for',
 43: 'some',
 44: 'mediocre',
 45: 'best',
 46: 'thing',
 47: 'is',
 48: 'the',
 49: 'eat',
 50: 'there',
 51: 'again',
 52: 'but',
 53: 'have',
 54: 'hang',
 55: 'they',
 56: 'an',
 57: 'awesome',
 58: 'acoustic',
 59: 'guitar',
 60: 'player',
 61: 'shame',
 62: 'him',
 63: 'that',
 64: 'saw',
 65: 'people',
 66: 'leave',
 67: 'due',
 68: 'my',
 69: 'husband',
 70: 'i',
 71: 'stayed',
 72: 'here',
 73: 'during',
 74: 'memorial',
 75: 'day',

In [154]:
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    vec = torch.zeros(self.vocab.vocab_len)

    # Проходим по каждому слову в фамилии
    for word in word_tokenize(review):
      # Если слово есть в словаре, устанавливаем соответствующий бит в векторе
      if word in self.vocab.token_to_idx:
          vec[self.vocab.token_to_idx[word]] = 1

    return vec

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    vec = self.vectorize(self.X[idx])
    label = self.y[idx]
    return vec, label

In [155]:
X_train, X_test, y_train, y_test = train_test_split(raw_train_10['review'].to_numpy(), raw_train_10['rating'].to_numpy(), test_size=0.2)

In [156]:
train_dataset = ReviewDataset(X_train, y_train, vocab)
test_dataset = ReviewDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [157]:
model = nn.Sequential(
    nn.Linear(vocab.vocab_len, 1024),
    nn.ReLU(),
    nn.Linear(1024, 2),
    nn.LogSoftmax(dim=1),
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [158]:
vocab.vocab_len

15925

In [159]:
len(set(y_train))

2

In [160]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

model.to(device)

Sequential(
  (0): Linear(in_features=15925, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

In [161]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 16310274


In [162]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 5)

Epoch 1, Train Loss: 0.4011, Test Loss: 0.4024
Epoch 2, Train Loss: 0.0653, Test Loss: 0.3874
Epoch 3, Train Loss: 0.0136, Test Loss: 0.4331
Epoch 4, Train Loss: 0.0035, Test Loss: 0.4785
Epoch 5, Train Loss: 0.0014, Test Loss: 0.5232


In [163]:
rating_labels = ["Positive", "Negative"]

def predict(model, dataset, review):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(review)
        tensor = vectorized.unsqueeze(0).to(device)
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1).squeeze()
        print(probs)
        print(f'{rating_labels[probs.argmax()]} ({probs.max():.4f}), {rating_labels[probs.argmin()]} ({probs.min():.4f}) \n{review}')

In [164]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.87857


In [165]:
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)
X_batch, y_batch = next(iter(test_loader))
predictions = model(X_batch).argmax(dim=1).cpu().detach()
print(classification_report(y_batch, predictions))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88       290
           1       0.86      0.89      0.88       270

    accuracy                           0.88       560
   macro avg       0.88      0.88      0.88       560
weighted avg       0.88      0.88      0.88       560



In [166]:
predict(model, train_dataset, "I had a terrible experience at this restaurant. The staff was rude and the food was overpriced for the quality.")

tensor([6.1828e-04, 9.9938e-01])
Negative (0.9994), Positive (0.0006) 
I had a terrible experience at this restaurant. The staff was rude and the food was overpriced for the quality.


In [167]:
predict(model, train_dataset, "I can't say enough good things about this restaurant. It's the perfect place for a romantic dinner or a night out with friends.")

tensor([0.9953, 0.0047])
Positive (0.9953), Negative (0.0047) 
I can't say enough good things about this restaurant. It's the perfect place for a romantic dinner or a night out with friends.


In [168]:
predict(model, train_dataset, "The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.")

tensor([0.0221, 0.9779])
Negative (0.9779), Positive (0.0221) 
The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.
