## Data Preparation

### Imports

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
  AutoTokenizer,
  AutoConfig,
  AutoModelForSequenceClassification,
  DataCollatorWithPadding,
  TrainingArguments,
  Trainer
)
from huggingface_hub import from_pretrained_keras

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
import torch
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


### Loading raw model

In [None]:
model_checkpoint = 'distilbert-base-uncased'

# Label maps: stored on the model config so class ids can be rendered as text
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# Load the pretrained checkpoint with a sequence-classification head.
# AutoModelForSequenceClassification is the loader for this PyTorch checkpoint;
# from_pretrained_keras is meant for Keras models saved on the Hub and does not
# produce a model exposing `.logits`, which the later cells rely on.
model = AutoModelForSequenceClassification.from_pretrained(
  model_checkpoint,
  num_labels=len(id2label),
  id2label=id2label,
  label2id=label2id,
)

### Loading dataset

In [None]:
dataset = load_dataset("imdb")

# Keep the experiment small: 100 examples per split.
# Shuffle (with a fixed seed, for reproducibility) before selecting: the labelled
# IMDB splits are ordered by label, so taking the first 100 rows as-is would give
# a subset containing a single class and make training/evaluation meaningless.
subset_size = 100
for split_name in ("train", "test", "unsupervised"):
  dataset[split_name] = (
    dataset[split_name].shuffle(seed=42).select(range(subset_size))
  )

# Display the reduced DatasetDict
dataset

### Preprocessing data

In [None]:
# Tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
  """Tokenize the "text" entry of a batch of examples.

  The tokenizer is switched to left-side truncation, so texts longer than
  512 tokens are cut from the left. No padding is requested here.

  Args:
    examples: mapping with a "text" entry (the batch handed over by
      `dataset.map(..., batched=True)`).

  Returns:
    The tokenizer output for the batch, requested as NumPy data.
  """
  tokenizer.truncation_side = "left"
  return tokenizer(
    examples["text"],
    max_length=512,
    truncation=True,
    return_tensors="np",
  )

# Add a pad token if the tokenizer has none, and resize the model's token
# embedding matrix so the new token id has an embedding row
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
  # The model method is `resize_token_embeddings`; `resize_model_embeddings`
  # does not exist and would raise AttributeError whenever this branch runs
  model.resize_token_embeddings(len(tokenizer))

# Tokenize every split of the dataset, one batch at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# Data collator built from the tokenizer; passed to the Trainer below so the
# examples of each batch are padded to a common length at batch time

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Defining the evaluation metric

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics uses it
accuracy = evaluate.load("accuracy")

In [None]:
# Evaluation function handed to the Trainer later on
def compute_metrics(p):
  """Compute accuracy from a `(predictions, labels)` pair.

  Args:
    p: pair `(predictions, labels)`; `predictions` holds per-class scores
      with the classes along axis 1, `labels` the reference class ids.

  Returns:
    A flat dict `{"accuracy": value}` as produced by the accuracy metric.
  """
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  # accuracy.compute already returns {"accuracy": value}; wrapping it in another
  # {"accuracy": ...} dict would hand the Trainer a nested dict instead of a number
  return accuracy.compute(predictions=predictions, references=labels)

### Applying untrained model to text

In [None]:
# Example reviews to classify
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")

# Inference only: switch off dropout and gradient tracking
model.eval()
with torch.no_grad():
  for text in text_list:

    # tokenize text and place it on the same device as the model
    inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)

    # compute logits (shape: 1 x num_labels, since one text is encoded at a time)
    logits = model(inputs).logits

    # pick the highest-scoring class along the label axis and convert to a plain int
    predictions = torch.argmax(logits, dim=-1).item()

    print(text + " - " + id2label[predictions])

## Model Training

### Configuring LoRA

In [None]:
# LoRA adapter configuration used by get_peft_model in the next cell
peft_config = LoraConfig(
  task_type="SEQ_CLS", # Sequence classification
  r=4, # Intrinsic rank of the trainable low-rank weight matrices
  lora_alpha=32, # Scaling factor applied to the LoRA update (not a learning rate)
  lora_dropout=0.01, # Dropout probability used in the LoRA layers
  target_modules = ['q_lin'] # Names of the modules LoRA is applied to (presumably DistilBERT's attention query projections; confirm)
)

# Display the resulting configuration
peft_config

In [None]:
# Wrap the base model with the adapters described by peft_config.
# NOTE(review): `model` is rebound here, so re-running this cell without
# re-running the model-loading cell would wrap an already wrapped model.
model = get_peft_model(model, peft_config)
# Print how many parameters are trainable
model.print_trainable_parameters()

### Configuring Trainer

In [None]:
# hyperparameters (consumed by TrainingArguments in the next cell)
# learning rate
lr = 1e-3
# per-device batch size, used for both training and evaluation
batch_size = 4
# number of training epochs
num_epochs = 1

In [None]:
# define training arguments
# Evaluation and checkpoint saving both happen once per epoch; the two strategies
# are kept identical because load_best_model_at_end is enabled.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer's `tokenizer` argument to `processing_class`;
# confirm against the installed version.
training_args = TrainingArguments(
  output_dir= model_checkpoint + "-lora-text-classification",
  learning_rate=lr,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  num_train_epochs=num_epochs,
  weight_decay=0.01,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# create trainer object
# Trains on the tokenized "train" split and evaluates on the tokenized "test" split
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset['test'],
  tokenizer=tokenizer,
  data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
  compute_metrics=compute_metrics,
)

### Training model

In [None]:
# train model: runs the training loop configured on the Trainer above
trainer.train()