## Meta Llama rationale steps

In [1]:
from groq import Groq
from dotenv import dotenv_values

In [6]:
class MetaLlamaRationalSteps:
    """Ask a Groq-hosted chat model a question and split its reply into
    reasoning text and final answer.

    The reply is expected to look like
    ``<think> reasoning </think> <answer> answer </answer>``. Replies that do
    not contain a closing ``</think>`` tag are reported with the
    ``FORMAT_ERROR`` sentinel in both returned fields.
    """

    # Value returned for both fields when the reply cannot be parsed.
    FORMAT_ERROR = "format error"

    def __init__(self, max_tokens=1200):
        """Create the Groq client from the settings in ``config/.env``.

        Args:
            max_tokens: Value forwarded as ``max_tokens`` on every completion
                request.

        The file must provide ``GROQ_API_KEY`` and ``MODEL_NAME``; a missing
        key presumably surfaces as ``KeyError`` from the lookups below.
        """
        CONFIG = dotenv_values("config/.env")

        self.client = Groq(api_key=CONFIG["GROQ_API_KEY"])
        self.model_name = CONFIG["MODEL_NAME"]
        self.max_tokens = max_tokens

    @classmethod
    def _split_reasoning_and_answer(cls, answer):
        """Split a raw reply into ``(rational_step, final_answer)``.

        Returns ``(FORMAT_ERROR, FORMAT_ERROR)`` when ``answer`` is not a
        string or has no ``</think>`` tag.
        """
        # The reply content may be missing (e.g. None); treat it as malformed.
        if not isinstance(answer, str):
            return cls.FORMAT_ERROR, cls.FORMAT_ERROR

        parts = answer.split("</think>")
        if len(parts) < 2:
            return cls.FORMAT_ERROR, cls.FORMAT_ERROR

        # Reasoning: everything before the first </think>, without the opening tag.
        rational_step = parts[0].replace("<think>", "")
        # Answer: the segment right after the first </think>, with the answer
        # tags and all newlines removed (other whitespace is kept).
        final_answer = (
            parts[1]
            .replace("<answer>", "")
            .replace("</answer>", "")
            .replace("\n", "")
        )
        return rational_step, final_answer

    def question_steps_answer(self, question, system_prompt="Answer the following question"):
        """Send ``question`` to the model and return its reasoning and answer.

        Args:
            question: Text sent as the user message (converted with ``str``).
            system_prompt: Text sent as the system message (converted with
                ``str``).

        Returns:
            A ``(rational_step, final_answer)`` tuple of strings. Both are
            ``"format error"`` when the reply does not follow the expected
            tag layout.

        Errors raised by the API call itself are not caught here.
        """
        response = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": str(system_prompt)},
                {"role": "user", "content": str(question)},
            ],
            model=self.model_name,
            max_tokens=self.max_tokens,
        )
        answer = response.choices[0].message.content

        return self._split_reasoning_and_answer(answer)

In [3]:
# System prompt asking the model to put its reasoning inside <think> tags and
# its final answer inside <answer> tags. Adjacent literals are concatenated
# by Python into one string.
system_prompt = (
    "A conversation between User and Assistant. "
    "The user asks a question, and the Assistant solves it. "
    "The assistant first thinks about the reasoning process in the mind "
    "and then provides the user with the answer. "
    "The reasoning process and answer are enclosed within "
    "<think> </think> and <answer> </answer> tags, respectively, "
    "i.e., <think> reasoning process here </think> "
    "<answer> answer here </answer>"
)

In [7]:
# Shared LLM client used by the cells below. Built with the default
# max_tokens (1200); the constructor reads the API key and model name from
# config/.env.
model_rational_steps = MetaLlamaRationalSteps()

In [46]:
question = "Find a word that relates the word 'novel' and the word 'hotel'"
# question_steps_answer returns a (reasoning, answer) pair; unpack it so that
# `rational_step` holds only the reasoning text instead of the whole tuple.
rational_step, final_answer = model_rational_steps.question_steps_answer(question=question, system_prompt=system_prompt)

## Reward prediction model

In [26]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import numpy
import time

In [9]:
class ModelRewardPrediction(nn.Module):
    """Predict a score in (0, 1) from several embedding vectors.

    Each input embedding goes through its own stack of Linear/ReLU layers
    (no ReLU after the last Linear). The per-input outputs are concatenated
    along ``dim=1`` and fed to a shared Linear/ReLU stack whose last Linear is
    followed by a Sigmoid.

    Args:
        layer_config_embedding: Layer widths of every per-input stack, either
            a whitespace-separated string such as ``"512 256"`` or an
            iterable of ints.
        layer_config_general: Layer widths of the shared stack, in the same
            format. The last width is the output size.
        n_embeddings: Number of input embeddings (and per-input stacks).
        embedding_dim: Width of every input embedding. Defaults to 768.

    Raises:
        ValueError: If ``n_embeddings`` is smaller than 1, a layer
            configuration is empty, or a width is not an integer.
    """

    def __init__(self, layer_config_embedding, layer_config_general, n_embeddings=2, embedding_dim=768):
        super().__init__()

        if n_embeddings < 1:
            raise ValueError(f"n_embeddings must be at least 1, got {n_embeddings}")

        # Parse both configurations once, up front, so that a bad value fails
        # before any layer is allocated.
        embedding_dims = self._parse_layer_config(layer_config_embedding, "layer_config_embedding")
        general_dims = self._parse_layer_config(layer_config_general, "layer_config_general")

        # One independent stack per input embedding. Layers are created in the
        # same order as before (all per-input stacks, then the shared stack)
        # so that seeded initialisation is unchanged.
        self.embedding_layers = nn.ModuleList(
            [
                nn.Sequential(*self._linear_relu_stack(embedding_dim, embedding_dims))
                for _ in range(n_embeddings)
            ]
        )

        # Shared stack: its input is the concatenation of all per-input outputs.
        general = self._linear_relu_stack(n_embeddings * embedding_dims[-1], general_dims)
        general.append(nn.Sigmoid())  # squash the final output into (0, 1)
        self.general_layers = nn.Sequential(*general)

    @staticmethod
    def _parse_layer_config(config, name):
        """Turn a layer configuration into a non-empty list of int widths."""
        tokens = config.split() if isinstance(config, str) else list(config)
        dims = [int(token) for token in tokens]
        if not dims:
            raise ValueError(f"{name} must contain at least one layer width")
        return dims

    @staticmethod
    def _linear_relu_stack(input_dim, dims):
        """Build Linear layers of the given widths with a ReLU between them."""
        layers = []
        last = len(dims) - 1
        for position, dim in enumerate(dims):
            layers.append(nn.Linear(input_dim, dim))
            if position != last:  # no activation after the final Linear
                layers.append(nn.ReLU())
            input_dim = dim
        return layers

    def forward(self, x_list):
        """Score a batch.

        Args:
            x_list: Iterable with one tensor per embedding stack, each of
                shape ``(batch, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch, last_general_width)`` with values in
            (0, 1).

        Raises:
            ValueError: If the number of inputs differs from the number of
                embedding stacks.
        """
        inputs = list(x_list)
        if len(inputs) != len(self.embedding_layers):
            raise ValueError(
                f"expected {len(self.embedding_layers)} input tensors, got {len(inputs)}"
            )

        # Apply each stack to its own input, then concatenate feature-wise.
        encoded = [layer(x) for layer, x in zip(self.embedding_layers, inputs)]
        return self.general_layers(torch.cat(encoded, dim=1))


In [12]:
# Smoke test of ModelRewardPrediction on random data.
# Layer widths are given as whitespace-separated strings.
layer_config_embedding = "512 256"
layer_config_general = "256 128 1"
model = ModelRewardPrediction(layer_config_embedding, layer_config_general)
# Two random inputs of batch size 2 and width 768, one per embedding stack
# (n_embeddings defaults to 2).
x1 = torch.randn(2, 768)
x2 = torch.randn(2, 768)
output = model([x1, x2])
# One sigmoid output per batch row is expected here.
print(output)

tensor([[0.4909],
        [0.4907]], grad_fn=<SigmoidBackward0>)


In [11]:
x1.shape

torch.Size([1, 768])

## Reinforcement learning training loop

In [13]:
# Sentence encoder used by get_embedding below.
# NOTE(review): its output width is expected to match the 768-wide input of
# ModelRewardPrediction — confirm if the encoder is ever swapped.
model_embeddings = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
def get_embedding(sentence):
    """Encode ``sentence`` with the module-level ``model_embeddings`` encoder
    and return whatever ``encode`` produces."""
    return model_embeddings.encode(sentence)

In [14]:
# Load the AIME problems and keep only the two columns used for training.
AIME_Dataset = pd.read_csv("data/AIME_Dataset_1983_2024.csv")
AIME_Dataset = AIME_Dataset[['Question', 'Answer']]
# Suffix appended to the system prompt so the model's answer can be compared
# directly with the 'Answer' column. The leading space separates it from the
# text it is appended to. (Typo "nummber" fixed: this text is sent to the LLM.)
AIME_Dataset_clarification_prompt = " The Answer must only contain a number, not an explanation."

In [15]:
# Reward model trained in the loop below. It takes two embeddings per sample:
# the question and the model's reasoning text.
layer_config_embedding = "512 256"
layer_config_general = "256 128 1"
model_reward_accuracy = ModelRewardPrediction(layer_config_embedding, layer_config_general, n_embeddings=2)

In [16]:
# Binary cross-entropy: the reward model ends in a Sigmoid and the training
# targets are 0/1 correctness scores.
criterion = nn.BCELoss()
# Adam over all reward-model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model_reward_accuracy.parameters(), lr=0.001)

In [37]:
# Per-question training log: one independent, initially empty list per column.
# The training loop appends to these lists.
training_dict = {
    column: []
    for column in ('Question_number', 'Question', 'Score', 'Loss')
}

In [38]:
# Settings of this small training run (previously inline magic numbers).
N_QUESTIONS = 4                # dataset rows to train on
N_SAMPLES_PER_QUESTION = 3     # LLM completions sampled per question
PAUSE_BETWEEN_QUESTIONS = 60   # seconds; presumably for API rate limits — confirm

# The prompt is the same for every request, so build it once.
aime_system_prompt = system_prompt + AIME_Dataset_clarification_prompt

# Never index past the end of the dataset.
n_questions = min(N_QUESTIONS, len(AIME_Dataset))

for i in range(n_questions):
    row = AIME_Dataset.iloc[i]
    question = row['Question']
    # Compare answers as text so the check does not depend on the dtype pandas
    # inferred for the 'Answer' column.
    answer = str(row['Answer']).strip()

    # The question is identical for every sample: embed it once and repeat
    # the row to get a (N_SAMPLES_PER_QUESTION, dim) batch.
    question_embedding = (
        torch.tensor(get_embedding(question))
        .unsqueeze(0)
        .repeat(N_SAMPLES_PER_QUESTION, 1)
    )

    rational_step_embedding_list = []
    score_list = []
    for j in range(N_SAMPLES_PER_QUESTION):
        rational_step, model_answer = model_rational_steps.question_steps_answer(question=question, system_prompt=aime_system_prompt)
        model_answer = model_answer.replace(" ", "")

        # Target is 1 when the model's answer matches the reference, else 0.
        # A "format error" reply never matches and therefore scores 0.
        score_list.append(1.0 if model_answer == answer else 0.0)
        rational_step_embedding_list.append(torch.tensor(get_embedding(rational_step)))

    # Shapes: score (N, 1); rational_step_embedding (N, dim).
    score = torch.tensor(score_list).view(-1, 1)
    rational_step_embedding = torch.stack(rational_step_embedding_list, dim=0)

    # One optimisation step per question on its batch of samples.
    optimizer.zero_grad()
    outputs = model_reward_accuracy([question_embedding, rational_step_embedding])
    loss = criterion(outputs, score)
    loss.backward()
    optimizer.step()

    training_dict['Question_number'].append(i)
    training_dict['Question'].append(question)
    training_dict['Score'].append(score.mean().item())
    training_dict['Loss'].append(loss.item())

    # Pause before the next question, but not after the last one.
    if i < n_questions - 1:
        time.sleep(PAUSE_BETWEEN_QUESTIONS)
    

In [39]:
# Tabulate the training log: one row per trained question.
training_df = pd.DataFrame(training_dict)

In [40]:
training_df

Unnamed: 0,Question_number,Question,Score,Loss
0,0,"Let $x$ , $y$ and $z$ all exceed $1$ and let $...",0.666667,0.682087
1,1,"Let $f(x)=|x-p|+|x-15|+|x-p-15|$ , where $0 < ...",0.666667,0.682019
2,2,What is the product of the real roots of the e...,0.0,0.728643
3,3,A machine-shop cutting tool has the shape of a...,1.0,0.659012


In [47]:
int(time.time())

1738352235

**TODO**

* Obtain a DataFrame with the results of every step
* Move all of this into the actual code
* Comment the code