In [1]:
# Define hyper-params
import os

model_name = 'bert-base-uncased'
num_labels = 2
val_ratio = 0.2

# SECURITY: never commit access tokens to a notebook. The token is read from
# the environment instead (`HF_TOKEN` is the variable huggingface_hub itself
# recognises; `HUGGINGFACE_KEY` is accepted as a project-local alternative).
# The value is None when neither variable is set.
# NOTE(review): a real token was previously hardcoded on this line and must be
# treated as leaked -- revoke it in the Hugging Face account settings.
HUGGINGFACE_KEY = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_KEY")

# Restrict this process to the first GPU. This must be set before CUDA is
# initialised to have any effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# Load model & tokenizer
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Authenticate against the Hugging Face Hub with the key configured above.
login(token=HUGGINGFACE_KEY)

# Fetch the tokenizer and a sequence-classification model with `num_labels`
# output classes for the configured checkpoint.
# NOTE(review): the checkpoint name is an English uncased BERT, while the
# dataset file name and the sample sentences used for prediction look Korean.
# Confirm the vocabulary covers the data; a Korean or multilingual checkpoint
# may be more appropriate.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import json

import torch
from torch.utils.data import Dataset, DataLoader, Subset

# Load json data
# The encoding is given explicitly: without it `open()` uses the platform
# default, which is not guaranteed to be UTF-8 and can fail or corrupt
# non-ASCII text. The records are expected under the top-level "data" key.
with open("K-NCT_v1.4.json", encoding="utf-8") as f:
    raw_data = json.load(f)["data"]

class TorchDataset(Dataset):
    """Map-style dataset that yields two samples per record.

    Each record must provide the keys ``"error_sentence"`` and
    ``"correct_sentence"``. Even indices return the error sentence with
    label 0, odd indices return the correct sentence with label 1.

    Args:
        records: Sequence of record dicts. Defaults to the notebook-level
            ``raw_data`` so that ``TorchDataset()`` keeps working.
        text_tokenizer: Callable used to encode a sentence. Defaults to the
            notebook-level ``tokenizer``.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, records=None, text_tokenizer=None, max_length=256):
        # Fall back to the notebook globals only when nothing is passed in, so
        # the dataset can also be built (and tested) without hidden state.
        self.records = raw_data if records is None else records
        self.text_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (two per record)."""
        return len(self.records) * 2

    def __getitem__(self, index):
        """Return ``(inputs, label)`` for the sample at ``index``.

        ``inputs`` is a dict of tensors built from the tokenizer output and
        ``label`` is a scalar tensor holding 0 (error) or 1 (correct).
        """
        # index // 2 selects the record, index % 2 selects which sentence.
        idx, is_correct = divmod(index, 2)
        record = self.records[idx]
        sentence = record["correct_sentence"] if is_correct else record["error_sentence"]
        encoded = self.text_tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        inputs = {k: torch.tensor(v) for k, v in encoded.items()}
        return inputs, torch.tensor(is_correct)

dataset = TorchDataset()

# split train dataset and val dataset
# Samples come in (error, correct) pairs built from the same record, so the
# boundary is computed in whole records. Cutting at an odd sample index would
# put one half of a pair in train and the other half in validation.
n_pairs = len(dataset) // 2
# Round the training share up by one record (clamped to the total) so the
# training set is non-empty whenever any data exists.
n_train_pairs = min(n_pairs, int(n_pairs * (1 - val_ratio)) + 1)
n_trains = 2 * n_train_pairs
train_dataset = Subset(dataset, range(0, n_trains))
val_dataset = Subset(dataset, range(n_trains, len(dataset)))

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [4]:
# Make torch lightning trainer

import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class PLModel(pl.LightningModule):
    """Lightning wrapper around the notebook-level classifier ``model``.

    Training and validation both compute a cross-entropy loss between the
    classifier logits and the integer labels of the batch.
    """

    def __init__(self):
        super().__init__()

        # Wraps the module-level `model` object; no copy is made.
        self.model = model
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, inputs):
        """Return the logits produced by the wrapped model for ``inputs``."""
        model_output = self.model(**inputs)
        return model_output.logits

    def _batch_loss(self, batch):
        """Compute the cross-entropy loss for one ``(inputs, labels)`` batch."""
        features, targets = batch
        logits = self(features)
        return self.criterion(logits, targets)

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for the batch."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for the batch (nothing is returned)."""
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        return optim.Adam(self.parameters(), lr=5e-6, weight_decay=0.01)

# Build the Lightning module around the classifier loaded earlier.
pl_model = PLModel()

# Smoke test: push one training batch through the model and show its loss
# (the bare last expression is what the cell displays).
batch = next(iter(train_loader))
sample_inputs, sample_labels = batch
pred = pl_model(sample_inputs)
pl_model.criterion(pred, sample_labels)

tensor(0.7211, grad_fn=<NllLossBackward0>)

In [5]:
# Trainer settings: 10 epochs on a single device, logging every 10 steps and
# running validation after every epoch.
trainer_options = dict(
    max_epochs=10,
    devices=1,
    log_every_n_steps=10,
    check_val_every_n_epoch=1,
)
trainer = pl.Trainer(**trainer_options)

# Fit the model using the train/validation loaders built earlier.
trainer.fit(
    pl_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


/home/wooshik.myung/miniconda3/envs/llm/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/wooshik.myung/miniconda3/envs/llm/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/wooshik.myung/miniconda3/envs/llm/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [6]:
# Run one validation pass over the held-out loader; the bare name on the last
# line makes the notebook display the returned metrics.
val_results = trainer.validate(
    model=pl_model,
    dataloaders=val_loader,
)
val_results

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/wooshik.myung/miniconda3/envs/llm/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        val_loss           0.017359746620059013
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.017359746620059013}]

In [None]:
# Restore the fine-tuned weights from a saved Lightning checkpoint.
# NOTE(review): this path points at one specific local run (version_7) and
# will not exist after a fresh run -- confirm or make it configurable.
pl_model = PLModel.load_from_checkpoint("lightning_logs/version_7/checkpoints/epoch=9-step=6010.ckpt")

# Put the module in evaluation mode before predicting so that dropout is
# disabled; otherwise predictions can be non-deterministic.
pl_model.eval()

# Sample text for prediction
texts = ["알람을 꺼냈습니다.", "알람을 껐습니다."]

# Tokenize the input text
# Pad only to the longest sentence in this batch and cap the length at 256,
# the same limit used when building the training samples.
inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=256)

# Get model predictions
# NOTE(review): the saved output of this cell shows identical logits for the
# two different sentences. The checkpoint name is an English uncased BERT, so
# the Korean input may be mapped largely to unknown tokens -- confirm, and
# consider a Korean or multilingual checkpoint.
with torch.no_grad():
    # Move the inputs to the device of the module that actually runs them.
    inputs = {key: value.to(pl_model.device) for key, value in inputs.items()}
    outputs = pl_model(inputs)
    predictions = outputs.argmax(dim=-1)
    print("Predicted class labels:", predictions)

tensor([[-2.8130,  3.2046],
        [-2.8130,  3.2046]], device='cuda:0')
Predicted class labels: tensor([1, 1], device='cuda:0')
