# This notebook fine-tunes a BERT classifier with QLoRA for sentiment analysis of Tiki reviews

#### 1. Load Libraries and Read Data

In [1]:
# Install runtime dependencies (-q: quiet output, -U: upgrade to the newest available version).
# NOTE(review): transformers, peft and accelerate are installed from their GitHub
# repositories without a pinned tag or commit, so a re-run may pick up different code
# than the run that produced the saved outputs - consider pinning versions.
!pip install -qU jsonlines
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [2]:
# Imports: model/tokenizer/training utilities, PEFT (LoRA) helpers, data loading, and Kaggle / W&B setup
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, replace_lora_weights_loftq, prepare_model_for_kbit_training
import torch
import jsonlines
from datasets import Dataset, DatasetDict
import warnings
from kaggle_secrets import UserSecretsClient
import os
from accelerate import Accelerator

In [3]:
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Your device:", device)

# Weights & Biases settings: the API key is read from Kaggle secrets rather
# than being hard-coded in the notebook.
user_secrets = UserSecretsClient()
os.environ.update({
    "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
    "WANDB_PROJECT": "Sentiment-Analysis-Tiki",
})

Your device: cuda


In [4]:
# Read the reviews file (one JSON object per line) into a list of
# {"label": int, "text": str} records.
data = []
with jsonlines.open('/kaggle/input/reviews-tiki/reviews_main.jsonl') as reader:
    for obj in reader:
        content = obj["content"]
        # Skip reviews without usable text. Besides the exact empty string,
        # this also drops missing (None / non-string) and whitespace-only
        # content, which would otherwise become empty or invalid samples.
        if not isinstance(content, str) or not content.strip():
            continue
        data.append({
            # Shift the rating down by one so class ids start at 0
            # (presumably ratings are 1-5, matching num_labels=5 used for
            # the model - verify against the data).
            "label": int(obj["rating"]) - 1,
            "text": content,
        })

#### 2. Split and Transform Data

In [5]:
# Wrap the parsed records in a Dataset and derive the split sizes:
# 90 % of the rows go to training, the remainder to validation.
dataset = Dataset.from_list(data)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

In [6]:
# Shuffle with a fixed seed so the split is reproducible.
shuffled_dataset = dataset.shuffle(writer_batch_size=10_000, seed=42)

# The first `train_size` rows form the training set; `shuffled_dataset` is
# rebound to the rows that come after them.
train_dataset, shuffled_dataset = (
    shuffled_dataset.take(train_size),
    shuffled_dataset.skip(train_size),
)

# The validation set is taken from the remaining rows.
val_dataset = shuffled_dataset.take(val_size)

In [7]:
# Bundle both splits under the names used by the rest of the notebook.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 56448
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 6272
    })
})

In [8]:
# NOTE(review): the sample sentence in the commented-out inference cell at the
# end of the notebook is Vietnamese, while this checkpoint name suggests an
# English, uncased vocabulary - confirm the tokenizer/model choice suits the data.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")


def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        batch: mapping that provides the raw review texts under ``"text"``.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Tokenize both splits. batched=True hands `tokenize` lists of texts instead of
# single rows; per the datasets documentation, batch_size=None passes each
# split to the function as one single batch.
dataset_encoded = dataset_dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/56448 [00:00<?, ? examples/s]

Map:   0%|          | 0/6272 [00:00<?, ? examples/s]

In [10]:
# Keep only model inputs: drop the raw text and expose the target column under
# the name "labels", then make the datasets return torch tensors.
dataset_encoded = (
    dataset_encoded
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
dataset_encoded.set_format("torch")

#### 3. Prepare Model, Metrics and Trainer

In [11]:
from transformers import DataCollatorWithPadding
# Collator built from the tokenizer; return_tensors="pt" requests PyTorch tensors.
# `tokenize` only truncates, so padding is left to this collator when batches
# are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [12]:
# bitsandbytes quantization settings applied when the base model is loaded.
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True, # quantize the model weights to 4-bit when loading
    bnb_4bit_quant_type="nf4", # NF4: 4-bit data type meant for weights initialized from a normal distribution
    bnb_4bit_use_double_quant=True, # nested quantization: quantize the already quantized weights' constants as well
    bnb_4bit_compute_dtype=torch.bfloat16, # run computations in bfloat16
    # Keep the listed head modules out of quantization.
    # NOTE(review): the load warning for this checkpoint only mentions
    # `classifier.*` weights, so "pre_classifier" may not exist in this model - confirm.
    llm_int8_skip_modules=["classifier", "pre_classifier"]
)

In [13]:
# Place the whole model ("" = root module) on the device index of the current
# Accelerate process.
device_index = Accelerator().process_index
device_map = {"": device_index}

# Load BERT with a 5-way classification head, quantized per `config_bnb`.
model_qlora = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    quantization_config=config_bnb,
    device_map=device_map,
    num_labels=5,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Enable gradient checkpointing on the quantized model, then run PEFT's
# k-bit training preparation on it.
# NOTE(review): prepare_model_for_kbit_training may itself enable gradient
# checkpointing by default, which would make the explicit call redundant - confirm.
model_qlora.gradient_checkpointing_enable()
model_qlora = prepare_model_for_kbit_training(model_qlora)

In [15]:
# LoRA adapter settings for the sequence-classification model.
lora_config = LoraConfig(
    target_modules=["query", "key", "value"],  # modules that receive adapters
    r=16,                                      # rank of the low-rank matrices
    lora_alpha=64,                             # scaling factor
    lora_dropout=0.1,                          # dropout rate
    task_type=TaskType.SEQ_CLS,                # sequence classification
)

# Wrap the quantized base model with the LoRA adapters.
peft_model_qlora = get_peft_model(model_qlora, lora_config)

In [16]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: evaluation result exposing ``label_ids`` (the reference labels)
            and ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [17]:
batch_size = 32
# How many times per epoch training loss is logged and validation is run.
evals_per_epoch = 5
# Step interval between logs/evaluations. Clamped to at least 1: the integer
# division yields 0 when the training set has fewer than
# batch_size * evals_per_epoch rows, and a zero interval is not usable with
# step-based logging/evaluation.
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size // evals_per_epoch)
model_name = "sentiment-analysis-finetuned-qlora"
# Training configuration: 2 epochs, step-based logging and evaluation on the
# same interval, metrics reported to Weights & Biases, no push to the Hub.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  disable_tqdm=False,
                                  eval_strategy="steps",
                                  logging_strategy="steps",
                                  logging_steps=logging_steps,
                                  eval_steps=logging_steps,
                                  push_to_hub=False,
                                  report_to="wandb",
                                  log_level="error")


In [18]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Because this notebook installs
# transformers without a version pin, choose whichever keyword the installed
# Trainer actually accepts so the cell works on both sides of that change.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Assemble the trainer from the PEFT-wrapped model, the training configuration,
# the tokenized splits, the padding collator and the metric function.
trainer = Trainer(model=peft_model_qlora,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["validation"],
                  data_collator=data_collator,
                  **{_tokenizer_kwarg: tokenizer})

In [19]:
# Run fine-tuning; with report_to="wandb" the metrics are sent to Weights & Biases.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mngocnguyen14073[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240928_070753-vrqrcitw[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msentiment-analysis-finetuned-qlora[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ngocnguyen14073/Sentiment-Analysis-Tiki[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ngocnguyen14073/Sentiment-Analysis-Tiki/runs/vrqrcitw[0m


Step,Training Loss,Validation Loss,Accuracy,F1
352,0.559,0.444032,0.898119,0.849912
704,0.4412,0.411923,0.898119,0.849912
1056,0.4211,0.385258,0.898119,0.849912
1408,0.3964,0.393183,0.898438,0.851165
1760,0.4126,0.374513,0.898597,0.852686
2112,0.3827,0.36735,0.899554,0.855496
2464,0.3852,0.368198,0.901626,0.862569
2816,0.3697,0.366936,0.901307,0.864003
3168,0.386,0.366165,0.900351,0.864301
3520,0.3886,0.365181,0.900829,0.864118


TrainOutput(global_step=3528, training_loss=0.41437938863458007, metrics={'train_runtime': 2980.1707, 'train_samples_per_second': 37.882, 'train_steps_per_second': 1.184, 'total_flos': 1.1831093922227712e+16, 'train_loss': 0.41437938863458007, 'epoch': 2.0})

In [20]:
# Save the trained model to ./latest_model.
# NOTE(review): the trainer wraps a PEFT model, so this presumably stores the
# adapter weights rather than a merged full model - verify before loading it elsewhere.
trainer.save_model("./latest_model")

In [21]:
# from transformers import pipeline

# classifier = pipeline('text-classification', model='./latest_model')

# result = classifier("Sản phẩm này rất tốt!")
# print(result)