In [None]:
# ! pip uninstall torch torchvision torchaudio torchsummary torchtext -y
# ! pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu124

In [2]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Use the GPU when PyTorch reports that CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the chosen device is shown as the cell output.
device       

'cuda'

In [4]:
def load_data_from_path(folder_path):
    """Load a folder-per-label text corpus into a DataFrame.

    ``folder_path`` must contain one sub-directory per class; every regular
    file inside a sub-directory is read as one example.

    Args:
        folder_path: Directory whose sub-directories are named after labels.

    Returns:
        pandas.DataFrame with two columns:
        ``sentence`` - the file's lines joined with a single space, and
        ``label`` - ``0`` for the ``neg`` directory, ``1`` for ``pos``; any
        other directory name is kept unchanged as the label.
        The frame has these two columns even when no file was found.
    """
    label_ids = {"neg": 0, "pos": 1}
    examples = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for label_name in sorted(os.listdir(folder_path)):
        label_dir = os.path.join(folder_path, label_name)
        if not os.path.isdir(label_dir):
            # Stray files next to the label folders (e.g. .DS_Store) are not classes.
            continue
        # Resolve the label once per directory instead of overwriting the
        # loop variable while iterating over the files.
        label = label_ids.get(label_name, label_name)
        for file_name in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                sentence = " ".join(f.readlines())
            examples.append({'sentence': sentence, 'label': label})
    return pd.DataFrame(examples, columns=['sentence', 'label'])


# Where each split lives on disk. The validation split is read from the
# `test` folder inside `data_train`; the final test split has its own folder.
folder_paths = dict(
    train='./data/ntc-scv/data_train/train',
    valid='./data/ntc-scv/data_train/test',
    test='./data/ntc-scv/test',
)

# Load the three splits in a fixed order: train, valid, test.
train_df, valid_df, test_df = (
    load_data_from_path(folder_paths[split_name])
    for split_name in ('train', 'valid', 'test')
)

In [5]:
import re
import string

# Patterns and tables are built once here rather than on every call.

# Matches http(s):// links and bare www. links up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+')

# Matches a single HTML tag such as <br> or </div>.
_HTML_TAG_PATTERN = re.compile(r'<[^<>]+>')

# Maps every ASCII punctuation mark and digit to a space.
_PUNCT_DIGIT_TABLE = str.maketrans(
    {char: " " for char in string.punctuation + string.digits}
)

# Emoji and related symbols. Redundant sub-ranges of the original class were
# dropped; the set of matched code points is unchanged.
# NOTE: the U+24C2-U+1F251 range is very wide and also covers CJK characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001F926-\U0001F937"
    "\u200d"                 # zero-width joiner
    "\u2640-\u2642"
    "]+",
    flags=re.UNICODE,
)


def preprocess_text(text):
    """Clean one raw review string.

    Steps, in order: drop URLs, drop HTML tags, replace ASCII punctuation and
    digits with spaces, drop emoji, collapse runs of whitespace, lowercase.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string with single spaces between tokens and
        no leading or trailing whitespace.
    """
    # URLs must go first: once punctuation is stripped they can no longer be
    # recognised.
    text = _URL_PATTERN.sub(" ", text)
    text = _HTML_TAG_PATTERN.sub(" ", text)
    text = text.translate(_PUNCT_DIGIT_TABLE)
    text = _EMOJI_PATTERN.sub(" ", text)

    # split() with no argument splits on any whitespace run, so joining with
    # one space both normalizes and trims.
    text = " ".join(text.split())
    return text.lower()

In [6]:
# Attach the cleaned text as a new `preprocess_sentence` column.
# `assign` returns a new frame, so the raw `*_df` frames stay untouched.
train_df_vi = train_df.assign(
    preprocess_sentence=train_df['sentence'].apply(preprocess_text)
)

valid_df_vi = valid_df.assign(
    preprocess_sentence=valid_df['sentence'].apply(preprocess_text)
)

test_df_vi = test_df.assign(
    preprocess_sentence=test_df['sentence'].apply(preprocess_text)
)

### From pandas to DatasetDict

In [7]:
# Convert each pandas frame to a Dataset, in the order train, valid, test.
ds_train, ds_valid, ds_test = (
    Dataset.from_pandas(frame)
    for frame in (train_df_vi, valid_df_vi, test_df_vi)
)

# Group the splits under the keys used by the rest of the notebook.
ds = DatasetDict({"train": ds_train, "validation": ds_valid, "test": ds_test})

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence'],
        num_rows: 10000
    })
})

In [9]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer, the config and the model.
model_name = "distilbert-base-uncased"

# Ask for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [10]:
tokenizer.model_max_length

512

In [11]:
# Target length of 100 tokens, capped at the tokenizer's own maximum.
max_seq_length = min(100, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Mapping with a ``preprocess_sentence`` entry (the texts to
            tokenize) and a ``label`` entry.

    Returns:
        The tokenizer's output with the incoming ``label`` values stored under
        the ``label`` key.
    """
    tokenizer_options = dict(
        padding="max_length",   # pad every sequence up to max_seq_length
        max_length=max_seq_length,
        truncation=True,        # cut longer sequences down to max_seq_length
    )
    encoded = tokenizer(examples['preprocess_sentence'], **tokenizer_options)
    encoded["label"] = examples['label']
    return encoded

In [12]:
# Tokenize every split of `ds`; batched=True passes preprocess_function
# batches of rows rather than one row at a time.
processed_dataset = ds.map(preprocess_function, batched=True, desc="Running tokenizer on dataset")

Running tokenizer on dataset: 100%|██████████| 30000/30000 [00:03<00:00, 7776.71 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:01<00:00, 8653.71 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:01<00:00, 8937.06 examples/s]


In [13]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocess_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [14]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Two classes: the loader maps the `neg` folder to 0 and the `pos` folder to 1.
num_labels = 2

# Configuration for the checkpoint, resized to a `num_labels`-way classifier.
# The keyword is `finetuning_task`; the previous spelling `fineutning_task`
# was a typo, so the task name never reached the intended config field.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="text-classification"
)

# Load the checkpoint weights into a sequence-classification model built
# from that configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
)

Error while downloading from https://cdn-lfs.hf.co/distilbert-base-uncased/5e3f1108e3cb34ee048634875d8482665b65ac713291a7e32396fb18f6ff0063?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1735890701&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNTg5MDcwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC81ZTNmMTEwOGUzY2IzNGVlMDQ4NjM0ODc1ZDg0ODI2NjViNjVhYzcxMzI5MWE3ZTMyMzk2ZmIxOGY2ZmYwMDYzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=RxSwb1fiDd2VIdQ2AtkTRqvbr4l%7EqrXHrGUPMszgL2r2aX8tJXXAxkUfckSeIlubIzL1voP4jNlITFQJ3GStYbtOYuluka5CVHI7NvqedLaAsdazj6a6pnHwYsF7xsCHbpCJgp5JJ6wH2JMUAs3x9GBjNpvTu%7EhKxrQVxiHLLSxfLFsAgZPb6ajFqNBaUVnh9jUenr7F7vwKqoTXbZh8erwsylUF7xgdlNosMCZ2rUSsxlzOYcoAdioLyvpemYiQe5n6tHYaFaYNw1kZl8AhubYED7HbBfvHU0aUZgYB-RY2WRWaNDnYhjf%7ECtihzy%7Eqpgw0QNAlEZI-Y8%7EDtH8KfQ__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnec

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.

In [15]:
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with argmax over axis 1, so it is presumably a 2-D array
            of per-class scores - confirm against the caller.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class indices
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.20MB/s]


In [16]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen
# once per epoch, and the best checkpoint is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="ntc-scv-distilbert-base-uncased",
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data and metric into a Trainer.
# The evaluation split is stored under the key "validation" in the
# DatasetDict; the previous key "valid" does not exist and raised KeyError.
# NOTE(review): newer transformers releases prefer `processing_class` over
# `tokenizer` here - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)



NameError: name 'model' is not defined

In [None]:
import wandb
# Initialise Weights & Biases with mode='disabled'; presumably this keeps the
# training run from logging to W&B - confirm against the wandb documentation.
wandb.init(mode='disabled')

In [None]:
# Run fine-tuning with the Trainer configured above in this notebook.
trainer.train()

In [None]:
# Evaluate on the "test" split instead of the Trainer's default eval dataset.
trainer.evaluate(processed_dataset["test"])

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from a checkpoint referenced by name.
classifier = pipeline(task="text-classification", model="thainq107/ntc-scv-distilbert-base-uncased")

In [None]:
# Smoke test on a Vietnamese review (roughly: "this restaurant is really delicious").
# NOTE(review): the text is passed raw, without preprocess_text - confirm this is intended.
classifier("quán ăn này ngon quá luôn nè")