In [None]:
import pandas as pd
# Read the sample revenue table and show it (a bare last expression gets the rich DataFrame display).
csv_path = "/content/LLM-Sample-Input-File.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Company Name,Category,Sub Cat,Period,Value - Randomized
0,Potato Inc.,Revenue By Product,Tablets,2022 Q1,57984000000
1,Potato Inc.,Revenue By Product,Tablets,2022 Q2,68814000000
2,Potato Inc.,Revenue By Product,Tablets,2022 Q3,21672000000
3,Potato Inc.,Revenue By Product,Tablets,2022 Q4,35870000000
4,Potato Inc.,Revenue By Product,Tablets,2023 Q2,50176000000
...,...,...,...,...,...
65,Potato Inc.,Revenue By Region,Americas,2022 Q3,337000000000
66,Potato Inc.,Revenue By Region,Americas,2022 Q4,279000000000
67,Potato Inc.,Revenue By Region,Americas,2023 Q1,197000000000
68,Potato Inc.,Revenue By Region,Americas,2023 Q2,189000000000


In [None]:
# Inspect the dtype pandas inferred for each column. In the recorded output every column is
# `object`, including the value column, and that column's name has a leading space.
df.dtypes

Company Name            object
Category                object
Sub Cat                 object
Period                  object
 Value - Randomized     object
dtype: object

In [None]:
!pip install transformers



In [None]:
"""https://huggingface.co/deepset/roberta-base-squad2"""
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Hub identifier of the pretrained extractive-QA checkpoint used throughout the notebook.
model_name = "deepset/roberta-base-squad2"

# a) Get predictions: run one question/context pair through the high-level pipeline API.
nlp = pipeline(task='question-answering', model=model_name, tokenizer=model_name)
QA_input = dict(
    question='How much revenue did Potato Inc. make from selling Smartphones in Q2 2022?',
    context='On questioning questions it should give us the output as 253,000,000,000 from the table from the corresponding input values',
)
res = nlp(QA_input)

# b) Load model & tokenizer as standalone objects; later cells use them directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

## QA Metrics Material
- [Metrics on QA (implementation and explanation)](https://qa.fastforwardlabs.com/methods/background/2020/04/28/Intro-to-QA.html)

In [None]:
res # `score` is the pipeline's confidence in the predicted span, not an F1 score (F1 would need a ground-truth answer to compare against); `start`/`end` are character offsets of the answer inside the context

{'score': 9.215888894686941e-06,
 'start': 57,
 'end': 72,
 'answer': '253,000,000,000'}

## RoBERTa Architecture
if you are curious...

In [None]:
# Bare expression: displays the module tree of the loaded question-answering model.
model

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

# How to fine-tune a QA Model?
- You definitely need a GPU. Otherwise, the fine-tuning phase will be at least 10 times slower.
- Let's leverage an already built training script.
    - [Located here](https://github.com/huggingface/transformers/blob/b90745c5901809faef3136ed09a689e7d733526c/examples/run_squad.py); The execution is in the cell below.
    
What is this script doing?
- Gets whichever pretrained model you want to use (we are using RoBERTa in this case, but you can use a different pretrained model)
- Input dataset is converted into features
    - The featured dataset is saved in cache, so you don't have to necessarily rerun this process once more for this model.
- Ensure that the <b>--do_train</b> flag is enabled; this starts the training.
- When training is done, the outputs of the model are saved in <b>output_dir/checkpoint-step_number</b>.

## Using Popular Libraries

In [None]:
!pip install datasets



In [None]:
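# squad  -- left commented out: `squad` is never loaded in this notebook; the output below is kept only as a reference for the layout of the original SQuAD DatasetDict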

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load your CSV data
df = pd.read_csv('/content/LLM-Sample-Input-File.csv')
# Header names in the raw file carry stray whitespace, so normalise them before column lookup.
df.columns = df.columns.str.strip()


def _to_squad_example(row):
    """Turn one table row into an extractive-QA example.

    Extractive QA can only learn to point at a span that is present in the context,
    so the value is appended to the context and ``answer_start`` is the real
    character offset of that value (previously the context omitted the value and
    ``answer_start`` was hard-coded to 0, which labelled unrelated text as the answer).

    Args:
        row: mapping with the keys 'Company Name', 'Category', 'Sub Cat',
            'Period' and 'Value - Randomized'.

    Returns:
        dict with 'title', 'context', 'question' and 'answers'
        ({'text': str, 'answer_start': int}) entries.
    """
    value = str(row['Value - Randomized']).strip()
    prefix = f"{row['Category']} {row['Sub Cat']} {row['Period']}"
    context = f"{prefix}: {value}"
    return {
        'title': row['Company Name'],
        'context': context,
        'question': f"How much revenue did {row['Company Name']} make from {row['Sub Cat']} in {row['Period']}?",
        # The value starts right after "<prefix>: ".
        'answers': {'text': value, 'answer_start': len(prefix) + 2},
    }


# Prepare the data in SQuAD format (column-oriented, as expected by Dataset.from_dict)
examples = [_to_squad_example(row) for row in df.to_dict('records')]
squad_data = {
    'id': df.index.tolist(),
    **{key: [example[key] for example in examples]
       for key in ('title', 'context', 'question', 'answers')},
}

# Sanity check: every labelled span must reproduce the answer text exactly.
assert all(
    ctx[ans['answer_start']:ans['answer_start'] + len(ans['text'])] == ans['text']
    for ctx, ans in zip(squad_data['context'], squad_data['answers'])
), "answer_start does not point at the answer text"

# Split the data into train and validation sets.
# NOTE(review): rows are split in file order (first 80% train, last 20% validation);
# consider shuffling if the file is grouped by category.
train_size = int(len(df) * 0.8)
train_data = {key: values[:train_size] for key, values in squad_data.items()}
validation_data = {key: values[train_size:] for key, values in squad_data.items()}

# Create DatasetDict
samitData = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'validation': Dataset.from_dict(validation_data)
})

# Print information about the created dataset
print(samitData)


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 56
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 14
    })
})


In [None]:
# More specific information about the squad dataset can be found here: https://huggingface.co/datasets/squad#data-instances
samitData["train"][0]
# id -> Identifier of the example (in this notebook: the DataFrame row index)
# title -> Document where the context resides (in this notebook: the company name)
# context -> Text in which the answer is expected to be found
# question -> What question are you trying to find the answer to?
# answers -> Dict with the answer `text` and `answer_start`, the character offset in the context where the answer begins (span)

{'id': 0,
 'title': 'Potato Inc.',
 'context': 'Revenue By Product Tablets 2022 Q1',
 'question': 'How much revenue did Potato Inc. make from Tablets in 2022 Q1?',
 'answers': {'answer_start': 0, 'text': '57,984,000,000'}}

In [None]:
# Preprocess the data to a BERT format
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer token spans.

    Adapted from https://huggingface.co/docs/transformers/tasks/question_answering

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: batch mapping with ``"question"`` and ``"context"`` lists of
            strings and an ``"answers"`` list of dicts; each dict holds the answer
            ``"text"`` and, optionally, ``"answer_start"`` (character offset of the
            answer inside the context, defaulting to 0).

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``: the token indices of the first
        and last answer token. Examples whose answer is not fully contained in the
        (possibly truncated) context are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",  # never cut the question, only the context
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to locate the labels; the model must not receive them.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = int(answer.get("answer_start", 0))
        end_char = start_char + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Token indices that belong to the context (sequence id 1).
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            # No context tokens at all: nothing to point at.
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start, context_end = context_tokens[0], context_tokens[-1]

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered; checking only for "fully outside" would let a
        # partially truncated answer through and put the end label on a special token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # the last context token starting at or before start_char ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and the first context token ending at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize both splits batch-wise; the raw columns are dropped so only the fields
# returned by preprocess_function remain.
raw_columns = samitData["train"].column_names
tokenized_squad = samitData.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [None]:
from transformers import DefaultDataCollator

# Collator handed to the Trainer below. preprocess_function already pads every example
# to max_length, so no dynamic padding is required at batch time.
data_collator = DefaultDataCollator()

### CPU Training Result... Let's think about GPUs, okay?


In [None]:
pip install accelerate==0.20.3

Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
# Confirm that the accelerate version actually imported by the kernel is the pinned one.
import accelerate
print(accelerate.__version__)

0.20.3


In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Re-instantiate the model from `model_name` so the Trainer below starts from that
# checkpoint's weights rather than from whatever `model` currently holds.
model = AutoModelForQuestionAnswering.from_pretrained(model_name) # remember that model_name is deepset/roberta-base-squad2

In [None]:
# Let's start training!
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # Fine-tuning a pretrained transformer needs a small step size. With the previous
    # value of 0.01 the validation loss sat at ~6.238 = ln(512) for every epoch, i.e.
    # the model had collapsed to a uniform guess over the 512 token positions.
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=11,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="epoch",  # Report the training loss each epoch (it showed "No log" before)
    save_strategy="epoch",  # Save the model at the end of each epoch
    save_total_limit=1  # Keep only the last 1 checkpoint
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,6.238325
2,No log,6.238322
3,No log,6.238324


TrainOutput(global_step=168, training_loss=6.602627708798363, metrics={'train_runtime': 51.9891, 'train_samples_per_second': 3.231, 'train_steps_per_second': 3.231, 'total_flos': 43897855131648.0, 'train_loss': 6.602627708798363, 'epoch': 3.0})

### Question Answering

In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint

# Load the fine-tuned model and tokenizer.
# The newest checkpoint under the Trainer's output_dir is looked up instead of
# hard-coding "checkpoint-168": the step number depends on dataset size, batch size
# and number of epochs.
finetuned_checkpoint = get_last_checkpoint("./results")
if finetuned_checkpoint is None:
    raise FileNotFoundError("No checkpoint found under ./results - run the training cell first.")

# Separate names keep `model_name`, `model` and `tokenizer` pointing at the base
# checkpoint, so re-running earlier cells does not silently pick up fine-tuned weights.
finetuned_model = AutoModelForQuestionAnswering.from_pretrained(finetuned_checkpoint)
finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_checkpoint)

# Use the pipeline for question-answering
qa_pipeline = pipeline('question-answering', model=finetuned_model, tokenizer=finetuned_tokenizer)

# Fall back to the sample question when nothing is typed, so the cell never submits
# an empty question.
default_question = "How much revenue did Potato Inc. make from selling Smartphones in Q2 2022?"
question = input("Ask Question: ").strip() or default_question
# Example context (you can replace this with your own context).
# NOTE(review): the answer is extracted from this string only; to answer from the table,
# the context would have to be built from the relevant rows of `df`.
context = "253,000,000,000 should be the output from the table"

# Get the answer
answer = qa_pipeline(question=question, context=context)

# Print the answer
print("Answer:", answer['answer'])


Ask Question: How much revenue did Potato Inc. make from selling Smartphones in Q2 2022?
Answer: 253,000


    By Samit Dhawal