<h2>Installing & Importing Necessary Libraries </h2>

In [None]:
%pip install trl==0.6.0 transformers==4.32.0 accelerate peft==0.5.0 datasets -Uqqq
%pip install datasets==2.13.1 bitsandbytes==0.41.3 einops==0.7.0 wandb==0.15.8 -Uqqq

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
# Silence all Python warnings from this point on. The filter is installed after
# the imports above have run, so warnings emitted while importing are unaffected.
warnings.filterwarnings("ignore")

  torch.utils._pytree._register_pytree_node(


<h2> Flattening The trec_cast Dataset </h2>

In [1]:
import pandas as pd
import json
import os


def flatten_dialogues(dialogues):
    """Flatten nested dialogues into one text record per turn.

    Args:
        dialogues: Iterable of dicts, each holding a 'turn' list. Every turn
            must have an 'utterance' and may have a 'response'.

    Returns:
        A list of ``{"dialogue": "<HUMAN>: ... <ASSISTANT>: ..."}`` dicts in
        input order. Turns without a 'response' get the placeholder
        '[No response available]'.

    Raises:
        KeyError: If a dialogue lacks 'turn' or a turn lacks 'utterance'.
    """
    return [
        {
            "dialogue": (
                f"<HUMAN>: {turn['utterance']} "
                f"<ASSISTANT>: {turn.get('response', '[No response available]')}"
            )
        }
        for dialogue in dialogues
        for turn in dialogue['turn']
    ]


# Load the raw JSON dialogues from a file
with open('trec_cast.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per (utterance, response) pair
flattened_data = flatten_dialogues(data)
df = pd.DataFrame(flattened_data)

# Print the current working directory, since the output path below is relative to it
print("Current working directory:", os.getcwd())

# Output path, relative to the current working directory
full_path = 'formatted_dialogue.parquet'

# Save to a .parquet file. The following cells read this file back, so a failed
# write is reported and then re-raised instead of letting the notebook continue.
try:
    df.to_parquet(full_path)
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print(f"File saved successfully at {full_path}")

# Display the DataFrame
print(df)

Current working directory: /teamspace/studios/this_studio
File saved successfully at formatted_dialogue.parquet
                                              dialogue
0    <HUMAN>: I remember Glasgow hosting COP26 last...
1    <HUMAN>: Interesting. What are the effects of ...
2    <HUMAN>: That’s rather vague. Can you be more ...
3    <HUMAN>: Woah. They’re not all bad, right? <AS...
4    <HUMAN>: I remember Glasgow hosting COP26 last...
..                                                 ...
279  <HUMAN>: How do you get impartial results? <AS...
280  <HUMAN>: Um, what is search neutrality? <ASSIS...
281  <HUMAN>: No, I want to know how to get unbiase...
282  <HUMAN>: Yes, what alternatives are there? <AS...
283  <HUMAN>: I’ve never heard of ecosia. What does...

[284 rows x 1 columns]


<h2>Logging into HuggingFace to access Falcon 7B Sharded Model </h2>

In [2]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget in the notebook.
# NOTE(review): TrainingArguments further down sets push_to_hub=True, so the
# token presumably needs write access — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Read the flattened dialogues back from the Parquet file written earlier.
# The result is indexed by split name; the examples are under "train".
data = load_dataset("parquet", data_files="formatted_dialogue.parquet")

# Sanity check: show the text of the first training example.
print(data["train"][0]['dialogue'])

<HUMAN>: I remember Glasgow hosting COP26 last year, but unfortunately I was out of the loop. What was it about? <ASSISTANT>: The COP26 event is a global united Nations summit about climate change and how countries are planning to tackle it. The term “climate change” is often used as if it means the same thing as the term “global warming”. The National Academy of Sciences says “climate change” is growing in favor of “global warming” because it helps convey that there are other changes in addition to rising temperatures. In fact, “climate change” means major changes in temperature, rainfall, snow, or wind patterns lasting for decades or longer.


<h2>Loading & Quantizing Falcon 7B Sharded Model</h2>

In [4]:
# Sharded Falcon-7B checkpoint to fine-tune.
model_name = "ybelkada/falcon-7b-sharded-bf16"

# bitsandbytes quantization settings:
#   - load the model in 4-bit precision
#   - use the NF4 quantization type
#   - enable double quantization, as described in the QLoRA paper
#   - run computations in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" lets HF Accelerate decide which GPU each layer is placed on;
# trust_remote_code=True is needed to use the falcon-7b model with custom code.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

<h2>Preparing the Model for Training</h2>

In [7]:
# Get the quantized base model ready for k-bit training.
model = prepare_model_for_kbit_training(model)

# LoRA hyperparameters
lora_rank = 32        # dimension of the low-rank matrices
lora_alpha = 32       # scaling factor for the weight matrices
lora_dropout = 0.05   # dropout probability of the LoRA layers

# Names of the falcon-7b modules that LoRA is applied to.
lora_target_modules = [
    "query_key_value",
    "dense",
    "dense_h_to_4h",
    "dense_4h_to_h",
]

peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",  # train only weight params, not biases
    task_type="CAUSAL_LM",
)

# Wrap the prepared base model with the LoRA adapters.
peft_model = get_peft_model(model, peft_config)

In [10]:
# Training hyperparameters, collected in one place so they are easy to tune.
output_dir = "./falcon-7b-sharded-bf16-finetuned-treccast"
per_device_train_batch_size = 16  # reduce batch size by 2x if out-of-memory error
gradient_accumulation_steps = 4   # increase gradient accumulation steps by 2x if batch size is reduced
optim = "paged_adamw_32bit"       # activates the paging for better memory management
save_strategy = "steps"           # checkpoint save strategy to adopt during training
save_steps = 5                    # number of update steps between two checkpoint saves
logging_steps = 5                 # number of update steps between two logs
learning_rate = 2e-4              # learning rate for the AdamW optimizer
max_grad_norm = 0.3               # maximum gradient norm (for gradient clipping)
max_steps = 100                   # total number of training (update) steps
warmup_ratio = 0.03               # fraction of the training steps used for a linear warmup from 0 to learning_rate
lr_scheduler_type = "cosine"      # learning rate scheduler

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # Passed explicitly so the variable defined above is actually honoured.
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
)

In [11]:
# Supervised fine-tuning trainer over the flattened dialogues.
# NOTE(review): `model` is already LoRA-wrapped and `peft_config` is passed as
# well — presumably the trainer does not wrap it a second time; confirm.
trainer = SFTTrainer(
    # model and tokenizer
    model=peft_model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # data: each example's text is in the "dialogue" column
    train_dataset=data['train'],
    dataset_text_field="dialogue",
    max_seq_length=1024,
    # optimisation / checkpointing settings
    args=training_arguments,
)

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

In [12]:
# Cast every sub-module whose name contains "norm" to torch.bfloat16
# (intended to make training more stable).
for module_name, submodule in trainer.model.named_modules():
    if "norm" not in module_name:
        continue
    # nn.Module.to() converts the module in place, so no rebinding is needed.
    submodule.to(torch.bfloat16)

<h2>Training and Logging the Training Loss in Wandb</h2>

In [14]:
import os
import wandb

# Tell wandb which notebook this run belongs to.
os.environ['WANDB_NOTEBOOK_NAME'] = 'Falcon7b_Finetuning.ipynb'

# Turn off the model's generation cache for training.
peft_model.config.use_cache = False

wandb.init(project="falcon_7b", entity="universitat-bonn")
try:
    trainer.train()
except BaseException:
    # Also covers a manual kernel interrupt: close the run as failed so it is
    # not left open in the kernel, then let the error propagate.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
5,2.5191
10,2.1472
15,2.0244
20,1.8265
25,1.6331
30,1.3538
35,1.1817
40,0.9011
45,0.7116
50,0.5775


0,1
train/epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/learning_rate,███▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁▁▁
train/loss,█▇▇▆▅▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,22.22
train/global_step,100.0
train/learning_rate,0.0
train/loss,0.1002
train/total_flos,2.488131525066547e+16
train/train_loss,0.84133
train/train_runtime,2544.7051
train/train_samples_per_second,2.515
train/train_steps_per_second,0.039
