# Глубокое обучение и обработка естественного языка

## Домашняя работа №4

Исходный набор данных - [Fake and real news dataset](https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset)
2. Реализовать классификацию двумя моделями: CNN, LSTM - 6 баллов = 3 + 3
3. Сравнить качество обученных моделей 1 балл
4. Обеспечена воспроизводимость решения: зафиксированы random_state, ноутбук воспроизводится от начала до конца без ошибок - 2 балла
5. Соблюден code style на уровне pep8 и [On writing clean Jupyter notebooks](https://ploomber.io/blog/clean-nbs/) - 1 балл

Примеры: [Using Convolution Neural Networks to Classify Text in PyTorch](https://tzuruey.medium.com/using-convolution-neural-networks-to-classify-text-in-pytorch-3b626a42c3ca), [LSTM in Pytorch](https://wandb.ai/sauravmaheshkar/LSTM-PyTorch/reports/Using-LSTM-in-PyTorch-A-Tutorial-With-Examples--VmlldzoxMDA2NTA5)

In [1]:
# установка torchmetrics
!pip install torchmetrics

In [2]:
# подключение библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchmetrics import F1Score

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download("punkt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
seed = 2023

np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7c5fe13f2ff0>

In [4]:
# функция, переводит текст в список id-слов
def text_to_sequence(text, maxlen, vocabulary):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]

    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))

    return padding + result[-maxlen:]

In [127]:
# функция обучения модели
def train(model, train_loader, epochs=10):
  model.train()

  f1 = F1Score(task="binary")
  optimizer = torch.optim.Adam(model.parameters(), lr=10e-3)
  criterion = nn.CrossEntropyLoss()

  for epoch in range(1, epochs + 1):
      print(f"Train epoch {epoch}/{epochs}")
      temp_loss = []
      temp_metrics = []
      for i, (data, target) in enumerate(train_loader):
          optimizer.zero_grad()
          output = model(data)

          loss = criterion(output, target)
          loss.backward()

          optimizer.step()
          temp_loss.append(loss.float().item())
          temp_metrics.append(f1(output.argmax(1), target).item())

      epoch_loss = np.array(temp_loss).mean()
      epoch_f1 = np.array(temp_metrics).mean()
      print(f'Loss: {epoch_loss}, f1 score: {epoch_f1}')

In [131]:
# функция оценки модели
def eval(model, val_loader):
  f1 = F1Score(task="binary")
  temp_metrics = []

  for i, (data, target) in enumerate(val_loader):
    output = model(data)
    temp_metrics.append(f1(output.argmax(1), target).item())

  f1_mean = np.array(temp_metrics).mean()
  print(f'F1 score: {f1_mean}')

In [7]:
# класс, обертка над данными
class TextDataWrapper(Dataset):
    def __init__(self, data, target=None, transform=None):
        self.data = torch.from_numpy(data).long()
        if target is not None:
            self.target = torch.from_numpy(target).long()
        else:
          self.target = None
        self.transform = transform

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index] if self.target is not None else -1

        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.data)

In [8]:
# класс, CNN
class ConvTextClassifier(nn.Module):
    def __init__(self, vocab_size=2000, embedding_dim=128, out_channel=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(out_channel, num_classes)

    def forward(self, x):
        output = self.embedding(x)
        output = output.permute(0, 2, 1) # bs, emb_dim, len
        output = self.conv(output)
        output = self.relu(output)
        output = torch.max(output, axis=2).values
        output = self.linear(output)
        return output

In [136]:
# класс, LSTM
# class LSTMTextClassifier(nn.Module):
#     def __init__(self, vocab_size=2000, embedding_dim=128, out_channel=128, num_classes=2):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
#         self.relu = nn.ReLU()
#         self.linear = nn.Linear(out_channel, num_classes)

#     def forward(self, x):
#         output = self.embedding(x)
#         output = output.permute(0, 2, 1) # bs, emb_dim, len
#         output = self.conv(output)
#         output = self.relu(output)
#         output = torch.max(output, axis=2).values
#         output = self.linear(output)
#         return output

## Загрузка данных


In [9]:
df_fake = pd.read_csv('Fake.csv')
df_true = pd.read_csv('True.csv')

In [10]:
df_fake['class'] = 0
df_true['class'] = 1

In [11]:
df = pd.concat([df_fake, df_true], axis=0)

In [12]:
df

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


Работа с полем `text`

In [13]:
df.drop(columns=['title', 'subject', 'date'], axis=0, inplace=True)

## Предобработка

Создание корпуса

In [14]:
train_corpus = list(df['text'])
tokens = []

for text in tqdm(train_corpus):
  tokens.extend(word_tokenize(text.lower()))
tokens_filtered = [word for word in tokens if word.isalnum()]

  0%|          | 0/44898 [00:00<?, ?it/s]

In [15]:
max_words = 2000
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [16]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

Создание `train` и `test`

In [17]:
batch_size = 256
max_len = 40

In [18]:
df_train, df_test = train_test_split(df, test_size=0.2)

x_train = np.array([text_to_sequence(text, max_len, vocabulary) for text in tqdm(df_train["text"])], dtype=np.int32)
x_test = np.array([text_to_sequence(text, max_len, vocabulary) for text in tqdm(df_test["text"])], dtype=np.int32)
y_train = np.array(df_train["class"])
y_test = np.array(df_test["class"])

  0%|          | 0/35918 [00:00<?, ?it/s]

  0%|          | 0/8980 [00:00<?, ?it/s]

In [20]:
train_dataset = TextDataWrapper(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TextDataWrapper(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

## Классификация. CNN и LSTM

In [44]:
epochs = 10

### CNN

In [128]:
model = ConvTextClassifier()
print(model)
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

ConvTextClassifier(
  (embedding): Embedding(2000, 128)
  (conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Parameters: 305538


In [129]:
train(model, train_loader, epochs=10)

Train epoch 1/10
Loss: 0.2512625794567115, f1 score: 0.885200818485402
Train epoch 2/10
Loss: 0.10137975609577293, f1 score: 0.9607159748990485
Train epoch 3/10
Loss: 0.04672349308425865, f1 score: 0.9827876200912692
Train epoch 4/10
Loss: 0.03587115235653118, f1 score: 0.9864212435188023
Train epoch 5/10
Loss: 0.06990057164604993, f1 score: 0.9780239137351936
Train epoch 6/10
Loss: 0.09111327671112347, f1 score: 0.9779906940798387
Train epoch 7/10
Loss: 0.03703764785706158, f1 score: 0.9899727435822182
Train epoch 8/10
Loss: 0.028893595638272932, f1 score: 0.9919149778413434
Train epoch 9/10
Loss: 0.015680854024378653, f1 score: 0.9954078759707458
Train epoch 10/10
Loss: 0.010409598990782856, f1 score: 0.9970900522056201


In [134]:
eval(model, test_loader)

F1 score: 0.9608665804068247


### LSTM