<a href="https://colab.research.google.com/github/WalaaAShaaban/NvidiaDocumentaionProject-Fine-tune/blob/main/NvidiaDocumentaionProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opendatasets datasets accelerate --quiet
import opendatasets as od
od.download('https://www.kaggle.com/datasets/gondimalladeepesh/nvidia-documentation-question-and-answer-pairs')

Skipping, found downloaded files in "./nvidia-documentation-question-and-answer-pairs" (use force=True to force download)


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM , AutoTokenizer, Trainer, TrainingArguments
import torch
import re

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

In [5]:
df = pd.read_csv('/content/nvidia-documentation-question-and-answer-pairs/NvidiaDocumentationQandApairs.csv')[:500][['question', 'answer']]
df.head()

Unnamed: 0,question,answer
0,What is Hybridizer?,Hybridizer is a compiler from Altimesh that en...
1,How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express p...
2,What are some parallelization patterns mention...,The text mentions using parallelization patter...
3,How can you benefit from accelerators without ...,You can benefit from accelerators' compute hor...
4,What is an example of using Hybridizer?,An example in the text demonstrates using Para...


In [6]:
df.shape

(500, 2)

In [7]:
def clean_text(text):
  text = text.lower()
  text = re.sub('[^A-Za-z0-9]+', " ", text)
  return text

In [8]:
df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)

In [9]:
df.head()

Unnamed: 0,question,answer
0,what is hybridizer,hybridizer is a compiler from altimesh that en...
1,how does hybridizer generate optimized code,hybridizer uses decorated symbols to express p...
2,what are some parallelization patterns mention...,the text mentions using parallelization patter...
3,how can you benefit from accelerators without ...,you can benefit from accelerators compute hors...
4,what is an example of using hybridizer,an example in the text demonstrates using para...


In [10]:
train = df.sample(frac=0.7)
test = df.drop(train.index)

val  = test.sample(frac=0.5)
test = test.drop(val.index)

In [11]:
print("Train Shape" , train.shape)
print("Validation Shape" , val.shape)
print("Test Shape" , test.shape)

Train Shape (350, 2)
Validation Shape (75, 2)
Test Shape (75, 2)


# **Model & Tokenizer**

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base', torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
def tokenize_function(example):
  start_prompt = 'the user ia asking this question \n Q: '
  end_prompt = 'the answer of the question is : \n A: '

  prompt = [start_prompt + question + end_prompt for question in example['question']]
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
  example['labels'] = tokenizer(example['answer'], padding='max_length', truncation=True, return_tensors='pt').input_ids

  return example

"""train_dataset = Dataset.from_pandas(train)
print(train_dataset)

Dataset({
    features: ['question', 'answer', '__index_level_0__'],
    num_rows: 4976
})"""

In [14]:
train_dataset = Dataset.from_pandas(train)
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
train_tokenized_dataset = train_tokenized_dataset.remove_columns(['question', 'answer', '__index_level_0__'])

# Test
test_dataset = Dataset.from_pandas(test)
test_tokenized_dataset = test_dataset.map(tokenize_function, batched=True)
test_tokenized_dataset = test_tokenized_dataset.remove_columns(['question', 'answer', '__index_level_0__'])

# validation
val_dataset = Dataset.from_pandas(val)
val_tokenized_dataset = val_dataset.map(tokenize_function, batched=True)
val_tokenized_dataset = val_tokenized_dataset.remove_columns(['question', 'answer', '__index_level_0__'])

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [15]:
print(train_tokenized_dataset)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 350
})


# **Training**

In [16]:
epochs = 5
lr = 1e-3
batch_size = 2

training_path = './training_nvidia_chatbot'

training_args = TrainingArguments(
    output_dir=training_path,
    save_total_limit=2,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = lr,
    num_train_epochs = epochs,
    evaluation_strategy = 'epoch'

)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_tokenized_dataset,
    eval_dataset = val_tokenized_dataset

)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.246346
2,No log,0.237404
3,0.463400,0.244445
4,0.463400,0.270851
5,0.463400,0.301782


TrainOutput(global_step=875, training_loss=0.2935507572719029, metrics={'train_runtime': 709.2919, 'train_samples_per_second': 2.467, 'train_steps_per_second': 1.234, 'total_flos': 1198325366784000.0, 'train_loss': 0.2935507572719029, 'epoch': 5.0})

In [19]:
model_path = './nvidia_chatbot_final_model'
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./nvidia_chatbot_final_model/tokenizer_config.json',
 './nvidia_chatbot_final_model/special_tokens_map.json',
 './nvidia_chatbot_final_model/spiece.model',
 './nvidia_chatbot_final_model/added_tokens.json',
 './nvidia_chatbot_final_model/tokenizer.json')

# **Evaluation & Testing**

In [20]:
eval_results = trainer.evaluate(eval_dataset= test_tokenized_dataset)
print(eval_results)

{'eval_loss': 0.308639794588089, 'eval_runtime': 9.094, 'eval_samples_per_second': 8.247, 'eval_steps_per_second': 4.179, 'epoch': 5.0}


In [23]:
text = 'what is cuda nsight'
start_prompt = 'the user ia asking this question \n Q: '
end_prompt = 'the answer of the question is : \n A: '

full_prompt = start_prompt + text + end_prompt

In [26]:
testing_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE)
testing_tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_text = testing_tokenizer(full_prompt, return_tensors='pt').input_ids.to(DEVICE)
model_output = testing_model.generate(tokenizer_text)[0]
final_output = testing_tokenizer.decode(model_output)
print(final_output)



<pad> cuda nsightthe is a suite of openacc directives that provide
