In [2]:
# Define hyper-params
import os

# Restrict this process to the first GPU. This is set before any CUDA-using
# library is imported in the cells below so that it takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_name = 'bert-base-uncased'  # checkpoint used for both tokenizer and classifier
num_labels = 2                    # binary classification head
val_ratio = 0.2                   # fraction of examples held out for validation

# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable instead; when it is unset this is None and
# `huggingface_hub.login(None)` falls back to an interactive prompt.
# Any token that was previously stored in this cell must be treated as leaked
# and revoked in the Hugging Face account settings.
HUGGINGFACE_KEY = os.environ.get("HF_TOKEN")

In [2]:
# Load model & tokenizer
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Authenticate against the Hugging Face Hub before downloading anything.
login(token=HUGGINGFACE_KEY)

# Tokenizer and sequence-classification model both come from the same
# checkpoint; the classification head is sized by `num_labels`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import json
from datasets import Dataset, DatasetDict

# Load json data
# The encoding is given explicitly: without it `open()` decodes with the
# platform's locale encoding, which breaks Korean text on non-UTF-8 systems.
with open("K-NCT_v1.4.json", encoding="utf-8") as f:
    raw_data = json.load(f)["data"]

# Expand every raw record into two labelled examples:
#   labels = 0 -> the record's "error_sentence"
#   labels = 1 -> the record's "correct_sentence"
# All original fields of the record are carried over unchanged.
processed_data = []
for tmp_raw_data in raw_data:
    processed_data.append(
        {**tmp_raw_data, "sentence": tmp_raw_data["error_sentence"], "labels": 0}
    )
    processed_data.append(
        {**tmp_raw_data, "sentence": tmp_raw_data["correct_sentence"], "labels": 1}
    )


# split train dataset and val dataset
# NOTE(review): the split is purely positional (no shuffling, no seed), so the
# validation set is simply the tail of the file. If the source file is ordered
# (e.g. by error type) the validation set will not be representative - confirm
# and consider a seeded shuffle at the raw-record level.
n_trains = int(len(processed_data) * (1 - val_ratio)) + 1
train_data, val_data = processed_data[:n_trains], processed_data[n_trains:]
dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(val_data),
    }
)
# print(dataset)
# print(dataset["train"][0])
# print(dataset["test"][0])

In [4]:
# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Reads the 'sentence' column, truncates/pads every item to exactly 256
    tokens, and attaches the batch's 'labels' column to the tokenizer output.

    Args:
        examples: a batch mapping that provides 'sentence' and 'labels'.

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    tokenized = tokenizer(
        examples['sentence'],
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    tokenized['labels'] = examples['labels']
    return tokenized

# Run the tokenizer over both splits, one batch of examples at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Expose only the columns the model consumes, returned as PyTorch tensors.
model_input_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_dataset.set_format(type='torch', columns=model_input_columns)

Map:   0%|          | 0/4801 [00:00<?, ? examples/s]

Map:   0%|          | 0/1199 [00:00<?, ? examples/s]

In [6]:
from transformers import (
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments,
)

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration: evaluate once per epoch, checkpoints under ./results.
training_hyperparams = dict(
    output_dir='./results',
    evaluation_strategy='epoch',
    num_train_epochs=10,
    learning_rate=5e-6,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
training_args = TrainingArguments(**training_hyperparams)

# Wire model, data splits, tokenizer and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0516,0.01536
2,0.0004,0.018339
3,0.0002,0.020752
4,0.0001,0.022387
5,0.0,0.023749
6,0.0,0.024705
7,0.0,0.025872
8,0.0,0.026605
9,0.0,0.027425
10,0.0,0.027596


TrainOutput(global_step=6010, training_loss=0.004368727838943252, metrics={'train_runtime': 611.8633, 'train_samples_per_second': 78.465, 'train_steps_per_second': 9.822, 'total_flos': 6315980883916800.0, 'train_loss': 0.004368727838943252, 'epoch': 10.0})

In [7]:
# Evaluate the model on the Trainer's eval_dataset (the "test" split).
# No compute_metrics callback was given to the Trainer, so the returned dict
# carries the evaluation loss plus runtime/throughput statistics only.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.02759585715830326, 'eval_runtime': 4.0958, 'eval_samples_per_second': 292.738, 'eval_steps_per_second': 36.623, 'epoch': 10.0}


In [8]:
import torch

# Sample text for prediction
texts = ["알람을 꺼냈습니다.", "알람을 껐습니다."]

# Tokenize the input text.
# `padding=True` pads only to the longest sentence in this batch, and
# `max_length=256` matches the truncation limit used when the training data
# was tokenized. (`padding='max_length'` without `max_length` would pad every
# input to the model's maximum sequence length.)
inputs = tokenizer(
    texts,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=256,
)

# Get model predictions
# Label convention from the dataset construction: 0 = error sentence,
# 1 = correct sentence.
model.eval()  # make sure dropout is disabled regardless of prior cell order
with torch.no_grad():
    # Move every input tensor to the device the model lives on.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    print("Predicted class labels:", predictions)

Predicted class labels: tensor([1, 1], device='cuda:0')


In [9]:
# load base model w/ bnb, lora
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from huggingface_hub import login


login(HUGGINGFACE_KEY)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# get bnb base model
# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
base_model_id = "meta-llama/Meta-Llama-3-8B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# NOTE(review): this rebinds `model`, replacing the BERT classifier created
# earlier in the notebook - confirm nothing later still needs that model.
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

# get lora base model
# target_modules must name layers that exist in the Llama architecture.
# The previous list mixed in GPT-J/GPT-2 style names ("out_proj", "fc_in",
# "fc_out", "wte") that match nothing in LlamaForCausalLM, so adapters were
# attached to q/k/v only. The list below covers the attention projections and
# the MLP projections. The token embedding is intentionally not adapted.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, peft_config)
model.to(device)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l