<a href="https://colab.research.google.com/github/ThePryanic/nlp_course/blob/main/HomeWork_%E2%84%961.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install torch

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import datasets
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [None]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Nothing in the class is specific to IMDb; it only indexes into whatever
    encodings and labels it is given.

    Args:
        encodings: Object exposing ``.items()`` that yields
            ``(feature_name, per_example_values)`` pairs, where
            ``per_example_values[idx]`` can be turned into a tensor.
        labels: Sequence with one label per example.
        device: Device the label tensor is moved to in ``__getitem__``.
            If a CUDA device is requested while CUDA is unavailable, the CPU
            is used instead.
    """

    def __init__(self, encodings, labels, device='cuda'):
        self.encodings = encodings
        self.labels = labels
        # The default assumes a GPU. Fall back to the CPU when there is none so
        # that indexing the dataset does not raise on CPU-only machines.
        if (device is not None
                and torch.device(device).type == 'cuda'
                and not torch.cuda.is_available()):
            device = 'cpu'
        self.device = device

    def __getitem__(self, idx):
        """Return a dict of feature tensors plus a ``'labels'`` tensor for example ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Only the label is moved to ``self.device``; the feature tensors are
        # created on the default device and left there.
        item['labels'] = torch.tensor(self.labels[idx]).to(self.device)
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

In [None]:
class LanguageModel(nn.Module):
    """RNN text classifier: embedding -> 2-layer RNN -> mean pooling -> MLP head.

    Args:
        vocab_size: Number of rows in the embedding table; token ids passed to
            ``forward`` must be smaller than this.
        vocab_dim: Width of the token embeddings. The same value is used as the
            RNN hidden size and as the width of the first linear layer.
        num_classes: Number of logits produced per example. Defaults to 4.
    """

    def __init__(self, vocab_size, vocab_dim, num_classes=4):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_dim)
        self.rnn = nn.RNN(input_size=vocab_dim, hidden_size=vocab_dim,
                          num_layers=2, batch_first=True, bidirectional=False)
        self.fc_1 = nn.Linear(vocab_dim, vocab_dim)
        self.fc_2 = nn.Linear(vocab_dim, num_classes)

    def forward(self, x):
        """Map integer token ids of shape (batch, seq) to logits of shape (batch, num_classes)."""
        embedding = self.embedding(x)
        x, _ = self.rnn(embedding)
        # Average the RNN outputs over the sequence axis. No mask is applied,
        # so every position in the input contributes equally to the mean.
        x = x.mean(dim=1)
        x = torch.tanh(x)
        x = self.fc_1(x)
        x = torch.tanh(x)
        x = self.fc_2(x)
        return x

In [None]:
def evaluate(model, val_loader, epoch, device='cuda'):
    """Compute and print the classification accuracy of ``model`` on ``val_loader``.

    Args:
        model: Module mapping a batch of ``input_ids`` to per-class logits.
            It is expected to already live on ``device``.
        val_loader: Iterable of batches, each a mapping holding ``"input_ids"``
            and ``"labels"`` tensors.
        epoch: Zero-based epoch index; used only in the progress-bar label.
        device: Device the inputs are moved to before the forward pass. If a
            CUDA device is requested while CUDA is unavailable, the CPU is used.

    Returns:
        The accuracy as a float, or ``nan`` when ``val_loader`` yields no batches.
    """
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        device = 'cpu'

    model.eval()
    predictions = []
    target = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} test: "):
            logits = model(batch["input_ids"].to(device))
            # Gather both sides on the CPU so the comparison below works no
            # matter which device the loader placed the labels on.
            predictions.append(logits.argmax(dim=1).cpu())
            target.append(batch['labels'].cpu())

    if predictions:
        matches = torch.cat(predictions) == torch.cat(target)
        accuracy = matches.float().mean().item()
    else:
        # torch.cat rejects an empty list; report "undefined" instead of raising.
        accuracy = float('nan')
    print()
    print(accuracy)
    return accuracy


In [None]:
def train_model(model, optimizer, criterion, train_loader, val_loader, epochs, scheduler, device='cuda'):
    """Train ``model`` for ``epochs`` epochs, evaluating after each one.

    Args:
        model: Module mapping a batch of ``input_ids`` to logits; expected to
            already live on ``device``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of training batches with ``"input_ids"`` and
            ``"labels"`` tensors.
        val_loader: Iterable of validation batches, passed to ``evaluate``.
        epochs: Number of passes over ``train_loader``.
        scheduler: Learning-rate scheduler; stepped once per epoch.
        device: Device the batches are moved to. If a CUDA device is requested
            while CUDA is unavailable, the CPU is used.
    """
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        device = 'cpu'

    for epoch in tqdm(range(epochs)):
        model.train()
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} train: "):
            optimizer.zero_grad()
            xb, yb = batch["input_ids"], batch["labels"]
            logits = model(xb.to(device))
            loss = criterion(logits, yb.to(device))
            loss.backward()
            optimizer.step()
        scheduler.step()

        # Evaluate on the same device that was used for training rather than
        # on evaluate()'s own default.
        evaluate(model, val_loader, epoch, device)

In [None]:
def main():
    """Train and evaluate the RNN classifier on the ``ag_news`` dataset.

    Downloads the dataset and the BERT tokenizer, holds out 10% of the training
    split for validation, and runs ``train_model`` on a GPU when one is
    available, otherwise on the CPU.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Fix the seeds so the split, the weight initialisation and the shuffling
    # are repeatable across runs.
    seed = 42
    torch.manual_seed(seed)

    # 1. Load the dataset and split it into train and val subsets
    dataset = datasets.load_dataset('ag_news')
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        dataset['train']['text'], dataset['train']['label'], test_size=.1,
        random_state=seed)

    # 2. Tokenize the subsets
    seq_size = 128
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=seq_size)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=seq_size)

    # Keep the labels on the same device as the model.
    train_dataset = IMDbDataset(train_encodings, train_labels, device=device)
    val_dataset = IMDbDataset(val_encodings, val_labels, device=device)

    batch_size = 256
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, batch_size=batch_size)

    eval_dataloader = DataLoader(
        val_dataset, shuffle=False, batch_size=batch_size)

    # Embedding width is a model hyperparameter, independent of the maximum
    # sequence length above.
    embedding_dim = 128
    model = LanguageModel(len(tokenizer.vocab), embedding_dim).to(device)

    epochs = 8
    lr = 1e-3

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # No ignore_index: class id 0 is a real category here (the model emits a
    # logit for it), so excluding it from the loss would leave it unlearnable.
    criterion = torch.nn.CrossEntropyLoss()
    # StepLR counts whole epochs, so step_size must be an integer; decay the
    # learning rate by `gamma` after every epoch.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)

    train_model(model, optimizer, criterion, train_dataloader, eval_dataloader, epochs, scheduler,
                device=device)


if __name__ == '__main__':
    main()

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 1 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6401666402816772


Epoch 2 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 2 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6884166598320007


Epoch 3 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 3 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6934166550636292


Epoch 4 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 4 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6995833516120911


Epoch 5 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 5 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6991666555404663


Epoch 6 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 6 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6859999895095825


Epoch 7 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 7 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.7005000114440918


Epoch 8 train:   0%|          | 0/422 [00:00<?, ?it/s]

LanguageModel(
  (embedding): Embedding(30522, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=4, bias=True)
)


Epoch 8 test:   0%|          | 0/47 [00:00<?, ?it/s]


0.6990000009536743
