#### Load and Tokenize the Dataset

In [None]:
from datasets import Dataset
from transformers import BertTokenizer

# Example data with scores
# data = [
#     {"context": "Context 1", "question": "Question 1", "answer": "Correct Answer 1", "score": 1.0},
#     {"context": "Context 1", "question": "Question 1", "answer": "Partially correct Answer 1", "score": 0.5},
#     {"context": "Context 1", "question": "Question 1", "answer": "Incorrect Answer 1", "score": 0.0},
#     # Add more examples...
# ]

# Build a Hugging Face `Dataset` from `data`, a list of example dicts.
# NOTE(review): `data` is not defined in the visible code (the sample above is
# commented out) -- it must be defined before this line runs; confirm where.
dataset = Dataset.from_list(data)

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs and attach regression labels.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping of column name -> list of values. Must contain
            "question" and "answer"; "score" is optional.

    Returns:
        The tokenizer output for the (question, answer) pairs, truncated and
        padded with ``padding="max_length"``, plus a "labels" entry holding
        the scores as floats when the batch has a "score" column.
    """
    # No return_tensors here: `Dataset.map` stores plain lists, and tensors
    # are built later when batches are collated.
    encoded = tokenizer(
        examples["question"],
        examples["answer"],
        truncation=True,
        padding="max_length",
    )
    # `Trainer` needs a "labels" entry to get a loss back from the model; the
    # raw "score" column is not a model input, so expose it under that name.
    if "score" in examples:
        encoded["labels"] = [float(score) for score in examples["score"]]
    return encoded

# Run `tokenize_function` over the dataset in batches (batched=True passes
# lists of column values to the function instead of single examples).
tokenized_dataset = dataset.map(tokenize_function, batched=True)


#### Reward Model Training

In [None]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Define the reward model
class RewardModel(BertForSequenceClassification):
    """BERT sequence classifier set up as a single-output regressor.

    The model emits one logit per input, used as the predicted reward score.
    """

    def __init__(self, config):
        """Build the model with a one-unit regression head.

        Args:
            config: BERT configuration. It is updated in place so that it
                describes a single-label regression model.
        """
        # Record the regression setup on the config before the base class is
        # built: the base class sizes its classification head and selects its
        # loss from the config, and the config is what is saved alongside the
        # weights. Setting these only on the instance afterwards would leave
        # the config describing a 2-label classifier.
        config.num_labels = 1
        config.problem_type = "regression"
        super().__init__(config)

# Load the pretrained BERT weights as a single-output regressor. Passing
# num_labels / problem_type to from_pretrained writes them into the model
# config, so the setup is saved with checkpoints instead of existing only as
# attributes patched onto this instance.
reward_model = RewardModel.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
    problem_type="regression",
)

# Training arguments: hyperparameters and output locations for the Trainer.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./reward_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# NOTE(review): the same `tokenized_dataset` is passed as both the training
# and the evaluation set, so evaluation metrics are measured on data the model
# was trained on -- confirm this is intended, or hold out a split.
trainer = Trainer(
    model=reward_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Fine-tune the reward model using the arguments above.
trainer.train()


#### Evaluate the Reward Model

In [None]:
def evaluate_reward_model(model, eval_dataset):
    """Score every example with the reward model and compare to the references.

    Args:
        model: Model called with the tokenizer output as keyword arguments;
            its output must expose ``logits`` holding a single value.
        eval_dataset: Iterable of examples, each with "question", "answer"
            and "score" entries.

    Returns:
        The result of ``compute_metrics(predictions, references)``.
    """
    model.eval()
    # The model may have been moved to an accelerator during training, while
    # the tokenizer always returns CPU tensors; run inference on the model's
    # own device to avoid a device-mismatch error.
    device = next(model.parameters()).device
    predictions = []
    references = []

    for example in eval_dataset:
        inputs = tokenizer(
            example["question"],
            example["answer"],
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # One example -> one logit; collapse it to a Python float.
        predictions.append(logits.squeeze().item())
        references.append(example["score"])

    return compute_metrics(predictions, references)

# Compute metrics
from sklearn.metrics import mean_squared_error, r2_score

def compute_metrics(predictions, references):
    """Return regression metrics for predicted scores against references.

    Args:
        predictions: Predicted scores.
        references: Ground-truth scores, in the same order as `predictions`.

    Returns:
        Dict with "mse" (mean squared error) and "r2" (R^2 score).
    """
    scorers = {"mse": mean_squared_error, "r2": r2_score}
    return {
        metric_name: scorer(references, predictions)
        for metric_name, scorer in scorers.items()
    }

# Evaluate the trained reward model and report MSE / R2.
# NOTE(review): `tokenized_dataset` is the same data the model was trained on,
# so these numbers do not measure generalization -- confirm this is intended.
metrics = evaluate_reward_model(reward_model, tokenized_dataset)
print("Evaluation Metrics:", metrics)


In [2]:
# Data Preparation: Ensure your dataset includes context, question, answer, and scores.
# Load and Tokenize Dataset: Use transformers and datasets libraries to prepare the dataset.
# Define Reward Model: Use a BERT model for sequence regression to predict the scores.
# Train the Model: Use Trainer from transformers to fine-tune the reward model.
# Evaluate the Model: Compute metrics like MSE and R2 to evaluate the model's performance.
# Using the same dataset for both the reward model and the baby LLM is feasible given the constraints. 
# It ensures consistency and makes the most of the limited data available. 
# Adjusting hyperparameters and possibly augmenting the dataset with additional examples, 
# if possible, can help improve the model's performance.