In [1]:
# Load model directly
!pip install transformers



In [58]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

In [3]:
!pip install datasets



In [4]:
from datasets import load_dataset
dataset = load_dataset("databricks/databricks-dolly-15k")

In [5]:
dataset = dataset["train"].train_test_split(test_size=0.3)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 10507
    })
    test: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 4504
    })
})

In [7]:
dataset_train = dataset['train']
dataset_val = dataset['test']

**Processing dataset**

In [12]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question from the options given: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized questions:
   inputs = [prefix + question for question in (examples["instruction"])]

   model_inputs = tokenizer(inputs, max_length=256, truncation=True)

   # The "labels" are the tokenized answers:
   labels = tokenizer(text_target=examples["response"],
                      max_length=512,
                      truncation=True
                      )

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [13]:
tokenized_dataset_train = dataset_train.map(preprocess_function, batched=True)

Map:   0%|          | 0/10507 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset_train

Dataset({
    features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10507
})

In [69]:

print("Question")
print('-----------')
print(tokenizer.decode(tokenized_dataset_train['input_ids'][20]))
print("Answer")
print('------------')
print(tokenizer.decode(tokenized_dataset_train['labels'][20]))


Question
-----------
Please answer this question from the options given: What is a tornado?</s>
Answer
------------
A tornado is a major storm that consists of rotating columns of air that touch the base of a thunderstorm and surface of the earth. They have speeds ranging from 86 - 200 miles per hour and destroy everything in their path. They can last for seconds or minutes at a time. States that have the most tornados are Texas, Kansas, Oklahoma, Alabama, and Mississippi. People that are in the path of a large tornado often say it sounds like an approaching freight train.</s>


**Output before finetuning**

In [70]:
# Without options input
input_text = tokenizer.decode(tokenized_dataset_train['input_ids'][20],skip_special_tokens=True)
print(input_text)
input_ids = tokenizer.encode(input_text, return_tensors="pt")
print('Base Model Output')
outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Please answer this question from the options given: What is a tornado?
Base Model Output




a storm that moves at a high speed


# LoRA(Low-Rank-Decomposition)

In [19]:
!pip install peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [20]:
import torch
import transformers
from peft import LoraConfig, PeftModel, get_peft_model

In [21]:
config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="SEQ_2_SEQ_LM",use_rslora=True,init_lora_weights=True,use_dora=True)

lora_model = get_peft_model(base_model, config)

In [22]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(lora_model))

trainable model parameters: 2506752
all model parameters: 785656832
percentage of trainable model parameters: 0.32%


In [23]:
from transformers import DataCollatorForSeq2Seq,Seq2SeqTrainingArguments,Seq2SeqTrainer

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=lora_model)


In [25]:
!pip install evaluate
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [26]:
import nltk
import numpy as np
import evaluate
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [27]:

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

In [28]:
!pip install accelerate -U
!pip install transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


zsh:1: no matches found: transformers[torch]


In [29]:
%cd /teamspace/studios/this_studio/fine_tuned_general_QnA

/teamspace/studios/this_studio/fine_tuned_general_QnA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [30]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


runs


In [31]:
tokenized_dataset_val=tokenized_dataset_train.train_test_split(test_size=0.1)

In [32]:
tokenized_dataset_val

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9456
    })
    test: Dataset({
        features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1051
    })
})

In [33]:
tokenized_dataset_validation = tokenized_dataset_val['test']

In [34]:
tokenized_dataset_validation

Dataset({
    features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1051
})

In [35]:
data = tokenized_dataset_validation.train_test_split(test_size=0.1)
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 945
    })
    test: Dataset({
        features: ['instruction', 'context', 'response', 'category', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 106
    })
})

In [36]:
train = data['train']
test = data['test']

In [37]:
from transformers import TrainingArguments,Trainer,Seq2SeqTrainingArguments,Seq2SeqTrainer

In [38]:
fsdp_config = {
    "sharding_strategy": "FULL_SHARD",  # Options: "NO_SHARD", "SHARD_GRAD_OP", "FULL_SHARD"
    "min_num_params": 1e8,  # Minimum number of parameters for FSDP to be applied
    "cpu_offload": True,  # Enable CPU offloading to save GPU memory
    "mixed_precision": True  # Enable mixed precision to reduce memory usage
}

In [45]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
#SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

training_args = Seq2SeqTrainingArguments(
   output_dir="/teamspace/studios/this_studio/SEQ2SEQ",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_strategy="epoch",
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   load_best_model_at_end=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=lora_model,
   args=training_args,
   train_dataset=train,
   eval_dataset=test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)




In [43]:
import torch
torch.cuda.empty_cache()

In [42]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [46]:
trainer.train()

Epoch,Training Loss,Validation Loss




TrainOutput(global_step=711, training_loss=2.55534721192093, metrics={'train_runtime': 343.2828, 'train_samples_per_second': 8.258, 'train_steps_per_second': 2.071, 'total_flos': 550624991735808.0, 'train_loss': 2.55534721192093, 'epoch': 3.0})

In [84]:
peft_model_path = "/teamspace/studios/this_studio/save_ft_model"
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('/teamspace/studios/this_studio/save_ft_model/tokenizer_config.json',
 '/teamspace/studios/this_studio/save_ft_model/special_tokens_map.json',
 '/teamspace/studios/this_studio/save_ft_model/tokenizer.json')

In [49]:
from peft import PeftModel, PeftConfig

peft_base_model = base_model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_base_model,
                                       '/teamspace/studios/this_studio/save_ft_model',                                      
                                       is_trainable=False)

# Base Model Output

In [89]:
# Without options input
input_text = tokenizer.decode(train['input_ids'][2],skip_special_tokens=True)
print(input_text)
input_ids = tokenizer.encode(input_text, return_tensors="pt")
print('Base Model Output')
outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Please answer this question from the options given: when was the 2nd world war?
Base Model Output




1939


# Output in Dataset


In [91]:

print("Question")
print('-----------')
print(tokenizer.decode(train['input_ids'][2]))
print("Answer")
print('------------')
print(tokenizer.decode(train['labels'][2]))


Question
-----------
Please answer this question from the options given: when was the 2nd world war?</s>
Answer
------------
It started in 1939 and ended in 1945</s>


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
peft_model.to(device)

#input_tensor = input_tensor.to(device)


# Fine-Tuned model output

In [92]:
input_text = tokenizer.decode(train['input_ids'][2],skip_special_tokens=True)
print(input_text)
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
#input_ids = input_ids.to_de
peft_model_outputs = peft_model.generate(input_ids=input_ids)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
print(peft_model_text_output)

Please answer this question from the options given: when was the 2nd world war?




The 2nd World War was fought between 1939 and 1945.


In [95]:
print("Dataset Output: ",tokenizer.decode(train['labels'][2]))
print("Base Model Output: ",tokenizer.decode(outputs[0],skip_special_tokens=True))
print("Finetuned Model Output: ",peft_model_text_output)

Dataset Output:  It started in 1939 and ended in 1945</s>
Base Model Output:  1939
Finetuned Model Output:  The 2nd World War was fought between 1939 and 1945.


# Another set of question

In [110]:
print("Question")
print('-----------')
print(tokenizer.decode(train['input_ids'][1]))

Question
-----------
Please answer this question from the options given: What was Jiang's previous position before becoming the president of China?</s>


In [111]:
# Base Model
input_text = tokenizer.decode(train['input_ids'][1],skip_special_tokens=True)
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = base_model.generate(input_ids)
base_model_output = tokenizer.decode(outputs[0],skip_special_tokens=True)



In [112]:
# Fine Tuned

input_text = tokenizer.decode(train['input_ids'][1],skip_special_tokens=True)
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
peft_model_outputs = peft_model.generate(input_ids=input_ids)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)




In [113]:
print("Dataset Output: ",tokenizer.decode(train['labels'][1]))
print("Base Model Output: ",base_model_output)
print("Finetuned Model Output: ",peft_model_text_output)

Dataset Output:  He was the mayor of Shanghai and then became Communist Party secretary</s>
Base Model Output:  vice president of china
Finetuned Model Output:  Jiang was the Vice President of China from 2000 to 2003.


# Evaluation

In [152]:
print("Question")
print('-----------')
print(tokenizer.decode(train['input_ids'][40]))

# Base Model
input_text = tokenizer.decode(train['input_ids'][40],skip_special_tokens=True)
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = base_model.generate(input_ids)
base_model_output = tokenizer.decode(outputs[0],skip_special_tokens=True)

# Fine Tuned
input_text = tokenizer.decode(train['input_ids'][40],skip_special_tokens=True)
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
peft_model_outputs = peft_model.generate(input_ids=input_ids)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


print("Dataset Output: ",tokenizer.decode(train['labels'][40]))
print("Base Model Output: ",base_model_output)
print("Finetuned Model Output: ",peft_model_text_output)


Question
-----------
Please answer this question from the options given: What is React?</s>




Dataset Output:  React is a JavaScript library that specializes in helping developers build user interfaces.</s>
Base Model Output:  reacted
Finetuned Model Output:  React is a software tool that allows developers to quickly and easily create and deploy applications.


**ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation. Some key components of ROUGE for question-answering include:

ROUGE-L: Measures the longest common subsequence between the candidate and reference answers. This focuses on recall of the full text.

ROUGE-1, ROUGE-2, ROUGE-SU4: Compare unigram, bigram, 4-gram overlaps between candidate and reference. Focus on recall of key parts/chunks.

Higher ROUGE scores generally indicate better performance for question answering. Scores close to or above 0.70+ are considered strong**

In [128]:
from rouge_score import rouge_scorer

# Function to compute ROUGE scores
def compute_rouge(predictions, references):
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Initialize lists to store scores
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    # Iterate through each prediction and reference
    for pred, ref in zip(predictions, references):
        # Compute ROUGE scores
        scores = scorer.score(pred, ref)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    
    # Calculate average scores
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    
    # Return results as a dictionary
    return {
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL
    }

# Example usage

In [133]:
references = train['instruction']

In [135]:
len(train['instruction'])

945

In [138]:
base_predictions = []

In [139]:
for i in range(len(train['instruction'])):
        
    input_text = tokenizer.decode(train['input_ids'][i],skip_special_tokens=True)
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = base_model.generate(input_ids)
    base_model_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    base_predictions.append(base_model_output)



In [141]:
len(base_predictions)

945

In [144]:
peft_predictions = []
for i in range(len(train['instruction'])):
        
    input_text = tokenizer.decode(train['input_ids'][i],skip_special_tokens=True)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    peft_model_outputs = peft_model.generate(input_ids=input_ids)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_predictions.append(peft_model_text_output)



In [145]:
len(peft_predictions)

945

In [147]:
 # For Base Model
 results = compute_rouge(base_predictions, references)
    
    # Print results
   #  print("ROUGE-1: {:.4f}".format(results['rouge1']))
   #  print("ROUGE-2: {:.4f}".format(results['rouge2']))
   #  print("ROUGE-L: {:.4f}".format(results['rougeL']))


In [148]:
results

{'rouge1': 0.12603983473527258,
 'rouge2': 0.042914325021049464,
 'rougeL': 0.11401885835591632}

In [150]:
 # For Base Model
 results1 = compute_rouge(peft_predictions, references)


In [151]:
results1

{'rouge1': 0.46512743879577495,
 'rouge2': 0.27495258692000146,
 'rougeL': 0.3848212998565702}

In [153]:
test_peft_predictions = []
for i in range(len(test['instruction'])):
        
    input_text = tokenizer.decode(test['input_ids'][i],skip_special_tokens=True)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    peft_model_outputs = peft_model.generate(input_ids=input_ids)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    test_peft_predictions.append(peft_model_text_output)



In [155]:
results_test = compute_rouge(test_peft_predictions, references)
results_test

{'rouge1': 0.06926812208442544,
 'rouge2': 0.0025555572725384046,
 'rougeL': 0.06537295042991191}

In [157]:
test_base_predictions = []
for i in range(len(test['instruction'])):
        
    input_text = tokenizer.decode(test['input_ids'][i],skip_special_tokens=True)
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    base_model_outputs = base_model.generate(input_ids=input_ids)
    base_model_text_output = tokenizer.decode(base_model_outputs[0], skip_special_tokens=True)
    test_base_predictions.append(base_model_text_output)



In [159]:
results_test = compute_rouge(test_base_predictions, references)
results_test

{'rouge1': 0.027325159990254328,
 'rouge2': 0.0004965243296921549,
 'rougeL': 0.02550013752843942}