In [19]:
import torch
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

In [20]:
# Step 1: Load and Preprocess a Sample of the Dataset
def load_and_preprocess_sample_data(
    sample_size=1000,
    tokenizer=None,
    question_column="question.text",
    answer_column="annotations.long_answer[0].plaintext",
    max_length=512,
):
    """Load a small slice of Natural Questions and tokenize it for seq2seq training.

    Args:
        sample_size: Maximum number of examples kept from the loaded split.
        tokenizer: Tokenizer used to encode the questions. When it exposes a
            ``generator`` attribute, that sub-tokenizer encodes the targets so
            the label ids come from the generator's vocabulary. If omitted,
            the notebook-level global ``tokenizer`` is used.
        question_column: Name of the batch column holding question strings.
        answer_column: Name of the batch column holding answer strings.
        max_length: Truncation length applied to both inputs and targets.

    Returns:
        The result of ``dataset.map`` with the tokenized inputs plus a
        ``labels`` entry holding the target ``input_ids``.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    # Resolve the tokenizer up front so a missing one fails before the
    # dataset download starts rather than in the middle of ``map``.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        raise NameError(
            "name 'tokenizer' is not defined; pass tokenizer=... or run "
            "load_model_and_tokenizer() before preprocessing"
        )

    # Targets must be encoded with the generator's vocabulary; plain
    # tokenizers without a ``generator`` attribute encode both sides.
    label_tokenizer = getattr(tokenizer, "generator", tokenizer)

    # Use 1% of the train split, then cap it at ``sample_size`` rows.
    dataset = load_dataset("natural_questions", split="train[:1%]")
    dataset = dataset.select(range(min(sample_size, len(dataset))))

    def preprocess_function(examples):
        # NOTE(review): the default column names are kept from the original
        # notebook; they presumably have to match the loaded dataset's schema
        # -- confirm against the dataset's actual features.
        inputs = [q.strip() for q in examples[question_column]]
        targets = [a.strip() for a in examples[answer_column]]
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
        labels = label_tokenizer(targets, max_length=max_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Tokenize the dataset in batches.
    return dataset.map(preprocess_function, batched=True)

In [21]:
# Step 2: Load Model and Tokenizer
def load_model_and_tokenizer(model_name="facebook/rag-token-base"):
    """Load the RAG tokenizer, retriever and sequence-generation model.

    Args:
        model_name: Checkpoint identifier or path handed to every
            ``from_pretrained`` call, so all three components are loaded
            from the same source.

    Returns:
        Tuple ``(tokenizer, retriever, model)``; the model is constructed
        with the returned retriever attached.
    """
    tokenizer = RagTokenizer.from_pretrained(model_name)
    # The retriever is requested with the "exact" index and the dummy dataset.
    retriever = RagRetriever.from_pretrained(model_name, index_name="exact", use_dummy_dataset=True)
    # NOTE(review): the default checkpoint is named "rag-token" but is loaded
    # into RagSequenceForGeneration -- confirm this pairing is intended.
    model = RagSequenceForGeneration.from_pretrained(model_name, retriever=retriever)
    return tokenizer, retriever, model

In [22]:
# Step 3: Define Training Arguments
def setup_training_args():
    """Build the ``TrainingArguments`` used for the fine-tuning run.

    Returns:
        A ``TrainingArguments`` instance writing to ``./results``, with
        step-based evaluation, checkpoints every 500 steps (at most two
        kept), and the best checkpoint selected by the ``"bleu"`` metric.
    """
    return TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        logging_steps=100,
        save_steps=500,
        evaluation_strategy="steps",
        # Mixed precision is only requested when CUDA is present, so the
        # notebook still runs on a CPU-only machine.
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        load_best_model_at_end=True,
        # Must match the key returned by compute_metrics.
        metric_for_best_model="bleu",
    )

In [23]:
# Step 4: Define Evaluation Metrics
def compute_metrics(pred):
    """Compute a sacreBLEU score for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (an array of logits, or a tuple
            whose first element is that array) and ``label_ids``.

    Returns:
        Dict with a single ``"bleu"`` entry holding the metric's ``score``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    metric = load_metric("sacrebleu")

    # Some models return several outputs; the logits are taken to be first.
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    # NOTE(review): this presumes one prediction row per label row -- confirm
    # for RAG, whose outputs may be expanded per retrieved document.

    # Label positions marked with the ignore index (-100) are not valid token
    # ids, so swap them for the pad id before decoding.
    labels = np.asarray(pred.label_ids)
    pad_id = getattr(getattr(tokenizer, "generator", tokenizer), "pad_token_id", None)
    if pad_id is not None:
        labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # predictions: one string per example; references: one list of strings
    # per example (here a single reference each).
    result = metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": result["score"]}

In [24]:
# Step 5: Train the Model
def train_model(model, tokenized_datasets, training_args):
    """Fine-tune ``model`` on ``tokenized_datasets`` and return the trainer.

    Args:
        model: Model instance handed to ``Trainer``.
        tokenized_datasets: Dataset used for both training and evaluation.
        training_args: Arguments object handed to ``Trainer`` as ``args``.

    Returns:
        The ``Trainer`` instance after ``train()`` has been called on it.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": tokenized_datasets,
        # Small-scale project: the training data doubles as validation data.
        "eval_dataset": tokenized_datasets,
        # ``tokenizer`` and ``compute_metrics`` are notebook-level names.
        "tokenizer": tokenizer,
        "compute_metrics": compute_metrics,
    }
    fitted = Trainer(**trainer_kwargs)
    fitted.train()
    return fitted

In [25]:
# Step 6: Save the Model
def save_model(trainer):
    """Persist the trained model and the notebook-level tokenizer.

    Both are written to ``./rag_model``: the model through
    ``trainer.save_model`` and the tokenizer through ``save_pretrained``.

    Args:
        trainer: Object whose ``save_model`` method writes the model.
    """
    target_dir = "./rag_model"
    trainer.save_model(target_dir)
    tokenizer.save_pretrained(target_dir)

In [26]:
# Step 7: Perform Inference
def perform_inference(question):
    """Generate an answer for ``question`` with the model saved in ``./rag_model``.

    The model and tokenizer are reloaded from ``./rag_model`` on every call;
    the model is attached to the notebook-level ``retriever``.

    Args:
        question: Question text passed to the tokenizer.

    Returns:
        The decoded answer string when ``question`` is a ``str``; otherwise
        the full list of decoded strings.
    """
    model = RagSequenceForGeneration.from_pretrained("./rag_model", retriever=retriever)
    tokenizer = RagTokenizer.from_pretrained("./rag_model")

    inputs = tokenizer(question, return_tensors="pt")
    generated_ids = model.generate(inputs["input_ids"])
    answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # batch_decode yields one string per generated sequence; unwrap it for
    # the single-question case so callers can print the answer directly.
    if isinstance(question, str) and answers:
        return answers[0]
    return answers

In [None]:
# Main Execution
if __name__ == "__main__":
    # Check GPU availability
    print("GPU Available:", torch.cuda.is_available())

    # Step 1: Load model and tokenizer.
    # This has to happen before preprocessing: the preprocessing, metric,
    # training, saving and inference helpers all read the notebook-level
    # `tokenizer` / `retriever` names assigned here.
    tokenizer, retriever, model = load_model_and_tokenizer()

    # Step 2: Load and preprocess a sample of the data
    sample_size = 1000  # Use 1000 examples for training
    tokenized_datasets = load_and_preprocess_sample_data(sample_size)

    # Step 3: Setup training arguments
    training_args = setup_training_args()

    # Step 4: Train the model
    trainer = train_model(model, tokenized_datasets, training_args)

    # Step 5: Save the trained model
    save_model(trainer)

    # Step 6: Perform inference
    question = "What is the capital of France?"
    answer = perform_inference(question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

GPU Available: True


Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/287 [00:00<?, ?files/s]

Downloading data:   0%|          | 0.00/193M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/189M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/307373 [00:00<?, ? examples/s]