In [None]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import (
    PeftModel,
    PeftConfig, 
    get_peft_model,
    LoraConfig,
    )

import evaluate
import torch
import numpy as np

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [2]:
id2label = {0:"Negative" ,1:"Positive"}
label2id = {"Negative":0 , "Positive":1}

In [4]:
dataset = load_dataset("shawhin/imdb-truncated")

In [None]:
# Load a pretrained model using  AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id,
    label2id = label
).to(device)
model

In [6]:
# Load a pre-trained tokenizer using AutoTokenizer
Tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased",add_prefix_space=True)
Tokenizer

In [7]:
# Define a dictionary to hold special tokens
special_tokens_dict = {
            'pad_token': '[PAD]',
            'cls_token': '[CLS]',
}

In [8]:
if Tokenizer.pad_token is None:
    Tokenizer.additional_special_tokens(special_tokens_dict)
    Tokenizer.resize_token_embeddings(len(Tokenizer))

In [9]:
def Token_func(sample):
    text = sample["text"]
    #configure the behavior of a tokenizer
    Tokenizer.truncation_side = "left"
    #Tokenizer settings
    inputs = Tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding = "max_length",
        return_tensors=None,
        add_special_tokens=True,
        pad_to_multiple_of=None,
)
    return inputs

In [10]:
# Create a data collator that automatically pads inputs to the same length
collator = DataCollatorWithPadding(tokenizer=Tokenizer)
collator

In [11]:
# Apply a tokenization function to the dataset
tokenized_dataset = dataset.map(Token_func,batched=True)
tokenized_dataset

In [12]:
# Load the accuracy metric from the evaluate library
metrics = evaluate.load("accuracy")

In [13]:
#define accuracy metric
def compute_metric(metric):
    predictions , labels = metric
    predictions = np.argmax(predictions,axis=1)
    accuracy_metric = metrics.compute(predictions=predictions , references=labels)
    return {"accuracy" : accuracy_metric}

In [14]:
# Configure LoRA (Low-Rank Adaptation) settings for a model
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r = 6,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"]   
)

In [15]:
model = get_peft_model(model,lora_config)

In [16]:
model.print_trainable_parameters()

trainable params: 1239556 || all params: 67602436 || trainable%: 1.8335966473160819


In [17]:
# Set up training arguments for model training
training_args = TrainingArguments(
    output_dir= "distilbert-base-uncased" + "Model-Text-Classification",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate = 1e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True
)

In [18]:
# Set up the Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=Tokenizer,
    train_dataset= tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
    compute_metrics= compute_metric,
    data_collator=data_collator,
)

In [19]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.288906,{'accuracy': 0.889}
2,No log,0.701047,{'accuracy': 0.855}
3,0.309900,0.432492,{'accuracy': 0.878}
4,0.309900,0.622782,{'accuracy': 0.88}
5,0.309900,0.772798,{'accuracy': 0.886}
6,0.075800,0.853956,{'accuracy': 0.885}
7,0.075800,0.887032,{'accuracy': 0.877}
8,0.075800,0.934234,{'accuracy': 0.878}
9,0.022900,0.910749,{'accuracy': 0.886}
10,0.022900,0.919872,{'accuracy': 0.887}


Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.855}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.878}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.88}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.886}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This in

TrainOutput(global_step=1670, training_loss=0.12265576823742803, metrics={'train_runtime': 1237.3949, 'train_samples_per_second': 8.081, 'train_steps_per_second': 1.35, 'total_flos': 1344562913280000.0, 'train_loss': 0.12265576823742803, 'epoch': 10.0})

In [45]:
text_list = ["i would like to ask for a refund"]
print("Sentiment Analysis LLM results:")
print("--------------------------")
# Iterate over each text in the list
for text in text_list:
    # Tokenize text and move inputs to GPU
    inputs = Tokenizer.encode(text, return_tensors="pt").to("cuda")
    # Compute logits
    logits = model(inputs).logits
    # Convert logits to label
    output = torch.max(logits,1).indices

    print(text + " - " + id2label[output.tolist()[0]])
    print("--------------------------")

Sentiment Analysis LLM results:
--------------------------
i would like to ask for a refund - Negative
--------------------------


In [46]:
#Save the model in your directory
model.save_pretrained("Model")

In [47]:
#Save the Tokenizer in your directory
tokenizer.save_pretrained("Model")