## Setup

In [None]:
# Install/upgrade the libraries this notebook imports later: bitsandbytes (used for the
# 4-bit quantization config), transformers, peft, accelerate and datasets.
# NOTE(review): none of these versions are pinned, so results may differ between runs
# made at different times — consider pinning once a working combination is known.
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
# !pip install -q -U pandas # you don't need to install either of these last two libs if you're using Colab
# !pip install -q -U torch

In [None]:
!pip install --upgrade transformers

!pip uninstall Jinja2 -y # Uninstall any existing Jinja2
!pip install Jinja2==3.0.3 # Or another compatible version like 3.0.3 or 3.0.x
!pip install Jinja2==3.0.3 # Or another compatible version like 3.0.3 or 3.0.x

## Importing necessary libraries

In [None]:
import random
import torch
import pandas as pd
from datasets import Dataset
import peft
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
def set_seed(seed=42):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random`` module, PyTorch, and NumPy's global generator.
    NumPy is included because scikit-learn's ``train_test_split`` falls back
    to NumPy's global state when no ``random_state`` is given, so leaving it
    unseeded makes the train/test split differ on every run.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Imported locally because the notebook's import cell does not import numpy.
    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


set_seed()

In [None]:
# Hugging Face Hub id of the base checkpoint. `model_name` is the variable the
# later model- and tokenizer-loading cells read, so swap models by reassigning it.
mistral7b = 'mistralai/Mistral-7B-v0.1'
model_name = mistral7b

## EDA

In [None]:
# Load the corpus from a CSV expected in the current working directory.
# Later cells read its 'text' column.
df = pd.read_csv("frankenstein_chunks.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Structural overview of the dataframe.
print("Dataframe Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print("\n")
print("Dataframe Description:")
print(df.describe())
print("\n")
print("Number of unique values in each column:")
print(df.nunique())

# Show one randomly chosen passage as the cell output to eyeball the raw text.
random_index = random.randint(0, len(df) - 1)
df.loc[random_index, 'text']

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Missing-value count per column. This must be the cell's last expression:
# a notebook only renders the value of the final line, so placing it first
# (as before) silently discarded the result.
df.isnull().sum()

In [None]:
# Now we'll quickly convert this to a train/test split
from sklearn.model_selection import train_test_split

# A fixed random_state makes the 80/20 split identical on every run, so the
# base-model and finetuned-model perplexities are measured on the same rows
# across kernel restarts. 42 matches the default seed used by set_seed().
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# STEP 2. Convert the train_df and test_df from Pandas into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


## Model Import and Tokenization

In [None]:
from huggingface_hub import login
import os

# Authenticate against the Hugging Face Hub using the HF_TOKEN environment variable,
# so the token itself never appears in the notebook.
# os.getenv returns None when the variable is unset.
# NOTE(review): confirm how login() behaves when given None in this environment.
login(os.getenv("HF_TOKEN"))  # Read from environment variable


In [None]:
# STEP 3. Passing the appropriate parameters here to 4-bit quantize the model
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",              # use the NF4 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual matmuls
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config)

# Report the device the weights ended up on; the label previously printed
# nothing after the colon.
print(f"\n\nModel is running on: {model.device}\n")

# Display the module tree as the cell output.
model

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# STEP 4. Preparing the model for QLoRA. Configure LoRA for our finetuning run. Then tokenize the data.
# The quantized base model loaded in the previous cell is reused here. Loading the
# 7B checkpoint a second time only cost extra minutes and GPU memory.
# If you need to re-run this cell, re-run the model-loading cell first so that
# `model` is the plain quantized base model again rather than an already wrapped one.
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(examples):
    """Tokenize the 'text' column of a batch, padding to the longest row in that batch."""
    return tokenizer(examples['text'], padding="longest", truncation=True)


# One shared tokenization function keeps the train and test preprocessing identical.
tokenized_train_dataset = train_dataset.map(tokenize_batch, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_batch, batched=True)

## Base Model Evaluation

In [None]:
def generate_text(prompt, max_new_tokens=100):
    """Generate a continuation of ``prompt`` with the notebook's current ``model``.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 100, the value previously hard-coded.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens removed.
    """
    # Follow the model's actual device instead of assuming "cuda", matching
    # how calc_perplexity locates the model.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Trainer.train() leaves the model in training mode, where LoRA dropout is
    # active. Generate in eval mode, then restore whatever mode was set before.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    finally:
        model.train(was_training)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# STEP 5. Informal evaluation: sample a completion before any finetuning so it
# can be compared with the finetuned model's completion later on.
base_generation = generate_text(
    "I'm afraid I've created a "
)
base_generation

In [None]:
def calc_perplexity(model):
    """Average per-row perplexity of ``model`` over the module-level ``test_dataset``.

    Each row's 'text' is tokenized with the module-level ``tokenizer`` and scored
    with the model's own language-modelling loss (labels = input ids). The row's
    perplexity is ``exp(loss)``, and the function returns the mean over all rows.

    Args:
        model: A causal language model whose forward pass returns an object
            with a ``loss`` attribute when ``labels`` is supplied.

    Returns:
        A 0-dim tensor holding the mean of the per-row perplexities.

    Raises:
        ValueError: If ``test_dataset`` contains no rows.
    """
    num_test_rows = len(test_dataset)
    if num_test_rows == 0:
        raise ValueError("test_dataset is empty; cannot compute perplexity")

    # Determine the device the model is on
    device = next(model.parameters()).device

    # Score in eval mode so dropout (including LoRA dropout) is disabled. After
    # Trainer.train() the model is still in training mode, which would make the
    # finetuned perplexity noisy. The previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    total_perplexity = 0
    try:
        for row in test_dataset:
            inputs = tokenizer(row['text'], return_tensors="pt")
            # Move the input tensors to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Calculate the loss without updating the model
            with torch.no_grad():
                outputs = model(**inputs, labels=inputs["input_ids"])

            # STEP 6. Complete the equation for perplexity.
            # The loss is already a tensor, so exponentiate it directly;
            # re-wrapping it in torch.tensor() triggers a copy-construct warning.
            total_perplexity += torch.exp(outputs.loss)
    finally:
        model.train(was_training)

    return total_perplexity / num_test_rows


base_ppl = calc_perplexity(model)
base_ppl

## Training

Make sure you can leave your browser open for a while. This may take around 15-25 minutes on a Colab T4 GPU.

In [None]:
import transformers

# Padding reuses the end-of-sequence token; the KV cache is switched off for training.
tokenizer.pad_token = tokenizer.eos_token
model.config.use_cache = False

# STEP 7. Configure the training arguments.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    warmup_steps=2,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
    save_steps=200,
    report_to="none",
)

# Causal-LM collation (mlm=False): no masked-language-modelling objective.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    data_collator=collator,
)

# STEP 8. Finetuning the model.
trainer.train()

## Evaluating the finetuned model

In [None]:
# STEP 9. Sample from the finetuned model with the same prompt used for the
# base model, then print both completions for comparison.
ft_generation = generate_text("I'm afraid I've created a ")

print(f"Base model generation: {base_generation}\n\n")
print(f"Finetuned generation: {ft_generation}")

A little more like the original text, right?
Experiment with the hyperparameters to see if you can improve performance.

In [None]:
# STEP 10. Score the finetuned model on the test rows and print its perplexity
# next to the base model's.
ft_ppl = calc_perplexity(model)
# print() joins its arguments with a single space and applies str() to each,
# which yields the same text as concatenating the label with str(value).
print("Base model perplexity:", base_ppl)
print("Finetuned model perplexity:", ft_ppl)

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, counts elements with ``numel()``, and
    treats a parameter as trainable when its ``requires_grad`` flag is set.
    Prints one line with the trainable count, the total count, and the
    trainable share as a percentage.
    """
    total = 0
    trainable = 0
    for _name, tensor in model.named_parameters():
        count = tensor.numel()
        total += count
        if tensor.requires_grad:
            trainable += count

    percent = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {percent}")

# Report the trainable-vs-total parameter counts for the current `model`.
print_trainable_parameters(model)