In [1]:
# Install necessary libraries
!pip install -q -U transformers datasets accelerate evaluate

In [2]:
# Load the dataset
from datasets import load_dataset

# Load the `thainq107/ntc-scv` dataset by its identifier. Later cells read its
# "train" and "valid" splits and its "preprocessed_sentence" / "label" columns.
ds = load_dataset('thainq107/ntc-scv')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Tokenization
from transformers import AutoTokenizer

# Checkpoint name; the model-loading cell reuses it so tokenizer and model match.
# NOTE(review): this is an English, lower-casing checkpoint — confirm it suits
# the language of the dataset's text.
model_name = "distilbert-base-uncased"  # or "bert-base-uncased"

# Tokenizer belonging to that checkpoint (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Sequence length used for padding/truncation: 100 tokens, capped at the
# maximum length the tokenizer reports for the model
max_seq_length = min(100, tokenizer.model_max_length)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of examples and carry their labels along.

    Args:
        examples: mapping with a "preprocessed_sentence" entry (the texts to
            tokenize) and a "label" entry.

    Returns:
        The tokenizer's output for the texts, padded and truncated to
        ``max_seq_length``, with the batch's "label" entry added to it.
    """
    encoded = tokenizer(
        examples["preprocessed_sentence"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    # Copy the labels through unchanged so they sit next to the encoded inputs.
    encoded["label"] = examples["label"]
    return encoded

# Run the preprocessing function over the dataset, one batch of examples at a time
processed_dataset = ds.map(preprocess_function, batched=True, desc="Running tokenizer on dataset")

Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:01<00:00, 9339.47 examples/s]


In [4]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Two target classes for the classification head
num_labels = 2

# Configuration of the checkpoint, extended with the label count and task name
config = AutoConfig.from_pretrained(
    model_name, num_labels=num_labels, finetuning_task="text-classification"
)

# Pre-trained weights wrapped in a sequence-classification model built from `config`
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

  warn(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import numpy as np
import evaluate

# Load the "accuracy" metric through the `evaluate` library; the metrics
# function defined after this calls `metric.compute(...)` on it.
metric = evaluate.load("accuracy")

# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Score a set of evaluation predictions with the loaded metric.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores for each example, or a tuple whose first element
            does (some models return extra outputs next to the scores).
            ``labels`` holds the reference class ids.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids and
        the reference labels.
    """
    predictions, labels = eval_pred
    # A model that returns several outputs hands them over as a tuple; the
    # class scores come first, and argmax over the whole tuple would fail.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)  # highest-scoring class per example
    return metric.compute(predictions=predicted_classes, references=labels)

In [9]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters and checkpointing policy
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="save_model",
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=2e-5,
    # Batch sizes per device, for training and for evaluation
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate and save once per epoch, then reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize the Trainer with the model, its training arguments, the tokenized
# "train" / "valid" splits and the metrics function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # triggers a warning on recent transformers releases.
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(
  5%|▌         | 126/2350 [25:11<8:38:19, 13.98s/it]

KeyboardInterrupt: 