In [None]:
# Mount Google Drive at /content/drive; the data and model paths used below are built on top of it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import get_scheduler, TrainingArguments, Trainer, DefaultDataCollator

from datasets import Dataset, load_metric, list_metrics

In [None]:
# Seed every random number generator the notebook relies on.
seed_val = 213
random.seed(seed_val)
np.random.seed(seed_val)
# torch drives the DataLoader shuffling and the random initialisation of the
# classification head, so it must be seeded too for runs to be repeatable.
torch.manual_seed(seed_val)

In [None]:
# Root of the mounted Drive, given as a relative path. Keep the trailing slash:
# every data/model path below is built by plain string concatenation onto it.
path_to_drive = "drive/MyDrive/"

# Preparing data

In [None]:
# loading data
# Missing titles / post bodies are replaced by empty strings in both splits.
train = pd.read_csv(
    path_to_drive + "nlp/data/restricted/aita_train_balanced.csv"
).fillna({"title": "", "text": ""})

test = pd.read_csv(
    path_to_drive + "nlp/data/restricted/aita_test_balanced.csv"
).fillna({"title": "", "text": ""})

In [None]:
# encoding labels
# Integer ids are assigned in the order in which each flair first appears in the training split.
possible_labels = train.flair.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}

# `map` performs a strict lookup: a flair missing from `label_dict` becomes NaN
# instead of silently staying a string (which is what `replace` would do), so
# the problem is reported here rather than deep inside the training code.
y_train = train.flair.map(label_dict)
y_test = test.flair.map(label_dict)

unmapped = test.flair[y_test.isna()].unique()
if len(unmapped) > 0:
    raise ValueError(
        "test split contains flairs that never occur in the training split: "
        + repr(list(unmapped))
    )
y_test = y_test.astype("int64")

In [None]:
# Display the flair -> integer id mapping built in the previous cell.
label_dict

{'asshole': 0,
 'everyone sucks': 1,
 'meta': 2,
 'no a-holes here': 3,
 'not enough info': 4,
 'not the a-hole': 5,
 'tl;dr': 6,
 'update': 7}

In [None]:
# collapsing title and selftext
# Each example becomes "<title> [SEP] <text>" (both columns are already NaN-free strings).
separator = ' [SEP] '
X_train = train['title'] + separator + train['text']
X_test = test['title'] + separator + test['text']

In [None]:
# Wrap the joined text and the encoded labels into `datasets.Dataset` objects.
# NOTE: this rebinds `train` / `test` from pandas DataFrames to Datasets, so the
# earlier cells that use DataFrame columns cannot be re-run without reloading the CSVs.
train = Dataset.from_dict({"text": X_train, "labels": y_train})
test = Dataset.from_dict({"text": X_test, "labels": y_test})

# Tokenization

In [None]:
# tokenizer
# Pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is asked for ``"max_length"`` padding and truncation, and
    whatever it returns is handed back unchanged.

    Args:
        examples: mapping that holds the raw strings under the ``"text"`` key.

    Returns:
        The tokenizer output for ``examples["text"]``.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)

In [None]:
# Tokenize both splits in batches of 100 examples; a smaller batch size may
# also help to avoid kernel crashes.
train_tokenized = train.map(tokenize_function, batched=True, batch_size=100)
# The test split must be derived from `test`; mapping `train` here would make
# every later evaluation run on the training data.
test_tokenized = test.map(tokenize_function, batched=True, batch_size=100)

  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/160 [00:00<?, ?ba/s]

In [None]:
# The raw strings are no longer needed once the token ids exist; drop them from both splits.
train_tokenized, test_tokenized = (
    split.remove_columns(["text"]) for split in (train_tokenized, test_tokenized)
)

In [None]:
# Persist both tokenized splits to Drive so they can be reloaded with `load_from_disk`
# instead of being tokenized again.
train_tokenized.save_to_disk(path_to_drive + "nlp/data/restricted/tokenized/aita_balanced/train/")
test_tokenized.save_to_disk(path_to_drive + "nlp/data/restricted/tokenized/aita_balanced/test/")

# TPU

In [None]:
import os
# `.get()` lets a missing variable reach the assertion and show its hint;
# indexing `os.environ[...]` would fail earlier with a bare KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
# Install the PyTorch/XLA stack for the TPU runtime (all versions pinned; the wheel is a cp37 build).
# NOTE(review): torch was already imported by an earlier cell — presumably the runtime
# has to be restarted for a reinstalled torch to take effect; confirm.
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-xla==1.11
  Using cached https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl (152.9 MB)


In [None]:
# imports pytorch
import torch

# imports the torch_xla package
import torch_xla
import torch_xla.core.xla_model as xm

In [None]:
# Device handle for the XLA (TPU) path of this notebook.
# NOTE(review): `device` is assigned again by the "if CUDA" cell further down —
# run only the cell that matches the selected runtime.
device = xm.xla_device()

In [None]:
# Show which device was selected.
device

device(type='xla', index=1)

In [None]:
import os
# A `!export ...` line runs in a throw-away subshell, so the variable never
# reaches the kernel process. Setting it through os.environ does.
# NOTE(review): torch_xla is imported by an earlier cell — confirm it still
# honours the variable when it is set at this point.
os.environ["XLA_USE_BF16"] = "1"

# If tokenization has already been done

In [None]:
# Reload the tokenized training split that was written with `save_to_disk`.
# Only the train split is loaded in this cell.
from datasets import load_from_disk
train_tokenized = load_from_disk(path_to_drive + "nlp/data/restricted/tokenized/aita_balanced/train/")

# Data Loader from torch

In [None]:
# Make the dataset return torch tensors so it can be fed to a torch DataLoader.
# The equivalent line for the test split is left disabled in this cell.
train_tokenized.set_format("torch")
#test_tokenized.set_format("torch")

In [None]:
from datasets import load_from_disk

# Shuffled mini-batches for training.
train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=16)

# The Eval section iterates over `eval_dataloader`, so it has to exist after a
# fresh "Restart & Run All". The test split is read back from the copy written
# by the tokenization section, which works on both the "tokenize now" and the
# "already tokenized" paths.
test_tokenized = load_from_disk(path_to_drive + "nlp/data/restricted/tokenized/aita_balanced/test/")
test_tokenized.set_format("torch")
eval_dataloader = DataLoader(test_tokenized, batch_size=8)

In [None]:
# Size the classification head from the label mapping instead of a hard-coded
# count: `label_dict` holds 8 flairs, so a fixed num_labels=10 would add two
# output classes that never occur in the data.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# unfinished model: resume from a previously saved checkpoint (replaces the freshly initialised model)
# SECURITY: torch.load unpickles a whole model object, which can execute arbitrary
# code — only load checkpoint files you created yourself.
# map_location="cpu" makes loading independent of the device the checkpoint was
# saved from; the model is moved to the active device by the `model.to(device)` cell.
model = torch.load(path_to_drive + "nlp/models/aita_balanced/1/epoch" + str(2), map_location="cpu")

In [None]:
# AdamW over all model parameters with a learning rate of 5e-5.
# NOTE(review): the optimizer is created before the later `model.to(device)` cell —
# confirm it still tracks the moved parameters, in particular on the XLA path.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Training length: the scheduler is stepped once per batch, so its horizon is
# (batches per epoch) x (number of epochs).
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# if CUDA
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Show which device was selected.
device

device(type='cuda')

In [None]:
# Move the model to the selected device; the batches are moved to the same device inside the loops below.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Fine-tuning loop: one optimizer step and one LR-scheduler step per batch,
# with a checkpoint written at the end of every epoch.

# Training resumes from the ".../epoch2" checkpoint loaded above; keep this in
# sync with that cell so checkpoint numbers continue from the right place.
resumed_from_epoch = 2

progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Tensors must live on the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print("epoch compeleted: " + str(epoch))
    # Number the checkpoint after the epoch that has just finished. With a
    # hard-coded suffix every epoch would overwrite the same file as soon as
    # num_epochs > 1; for num_epochs == 1 this still yields "epoch3".
    finished_epoch = resumed_from_epoch + epoch + 1
    # NOTE(review): checkpoints are written under "nlp/models/aita/" while the
    # resume cell reads from "nlp/models/aita_balanced/" — confirm which is intended.
    torch.save(model, path_to_drive + "nlp/models/aita/1/epoch" + str(finished_epoch))

  0%|          | 0/1000 [00:00<?, ?it/s]

epoch compeleted: 0


# Eval

In [None]:
# Switch the model to evaluation mode before running the metric loops.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Exploratory: list the metric names that `datasets` can load (not required by the evaluation itself).
list_metrics()

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'cer',
 'chrf',
 'code_eval',
 'comet',
 'competition_math',
 'coval',
 'cuad',
 'exact_match',
 'f1',
 'frugalscore',
 'glue',
 'google_bleu',
 'indic_glue',
 'mae',
 'mahalanobis',
 'matthews_correlation',
 'mauve',
 'mean_iou',
 'meteor',
 'mse',
 'pearsonr',
 'perplexity',
 'precision',
 'recall',
 'rl_reliability',
 'roc_auc',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'spearmanr',
 'squad',
 'squad_v2',
 'super_glue',
 'ter',
 'trec_eval',
 'wer',
 'wiki_split',
 'xnli',
 'xtreme_s',
 'jordyvl/ece',
 'lvwerra/aweeesoooome_metric',
 'lvwerra/test']

In [None]:
# Accuracy metric object; predictions are accumulated with add_batch() and scored with compute().
accuracy = load_metric("accuracy")

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
# Evaluation pass: accumulate predictions and references for the accuracy metric.
# The progress bar total has to come from the loader that is actually iterated.
progress_bar = tqdm(range(len(eval_dataloader)))
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # predicted class id per example
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)

  0%|          | 0/6000 [00:00<?, ?it/s]

In [None]:
# Accuracy over all batches added to the metric so far.
accuracy.compute()

{'accuracy': 0.75425}

In [None]:
# F1 metric object. The averaging mode is an argument of `compute()`
# (e.g. `f1b.compute(average=None)`), not of `load_metric`, so it is not passed here.
f1b = load_metric("f1")
#micro_f1 = load_metric("f1")

In [None]:
# Evaluation pass feeding the F1 metrics.
# `micro_f1` is created here so that the later `micro_f1.compute(average="micro")`
# cell has an object with accumulated predictions to score.
micro_f1 = load_metric("f1")

# The progress bar total has to come from the loader that is actually iterated.
progress_bar = tqdm(range(len(eval_dataloader)))
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # predicted class id per example
    f1b.add_batch(predictions=predictions, references=batch["labels"])
    micro_f1.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# F1 with average=None, i.e. no aggregation across labels is requested.
# NOTE(review): the saved output of this cell is a ValueError whose cause is not
# visible here — confirm batches were added to `f1b` before this cell ran.
f1b.compute(average=None)

ValueError: ignored

In [None]:
# Micro-averaged F1 over the batches previously added to `micro_f1`.
micro_f1.compute(average="micro")

ValueError: ignored