In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np


In [2]:

# --- 0. Setup: A Toy Model and Helper Functions ---

# A very simple language model for demonstration purposes
class ToyLLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.vocab_size = vocab_size

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # We only care about the output of the last token for next-word prediction
        output = self.fc(lstm_out[:, -1, :])
        return output


In [3]:

# Helper function to generate text from the model
def generate_text(model, tokenizer, seed_text, max_len=10):
    model.eval()
    tokens = tokenizer.encode(seed_text)
    input_tensor = torch.tensor([tokens]).to(next(model.parameters()).device)
    
    for _ in range(max_len):
        with torch.no_grad():
            output = model(input_tensor)
            # Get the predicted next token (the one with the highest probability)
            next_token = output.argmax(1).item()
            tokens.append(next_token)
            input_tensor = torch.tensor([tokens]).to(next(model.parameters()).device)
            
            if tokenizer.decode([next_token]) == '<end>':
                break
                
    return tokenizer.decode(tokens)


In [4]:

# Simple tokenizer for our toy vocabulary
class SimpleTokenizer:
    def __init__(self, corpus):
        # Flatten the corpus and find unique words
        words = sorted(list(set(word for sentence in corpus for word in sentence.split())))
        self.word_to_idx = {word: i for i, word in enumerate(words)}
        self.idx_to_word = {i: word for word, i in self.word_to_idx.items()}
        
    def encode(self, text):
        return [self.word_to_idx[word] for word in text.split()]

    def decode(self, tokens):
        return ' '.join([self.idx_to_word[token] for token in tokens])



In [6]:
# --- 1. Phase 1: Foundational Pre-training ---
def simulate_pretraining(model, tokenizer, corpus, epochs=50):
    print("\n--- Starting Phase 1: Foundational Pre-training ---")
    print("Goal: Teach the model basic language structure by predicting the next word.")
    
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()
    
    # Create input/output pairs for next-word prediction
    inputs, targets = [], []
    for sentence in corpus:
        tokens = tokenizer.encode(sentence)
        for i in range(1, len(tokens)):
            inputs.append(tokens[:i])
            targets.append(tokens[i])
            
    for epoch in range(epochs):
        total_loss = 0
        for i in range(len(inputs)):
            input_seq = torch.tensor([inputs[i]])
            target_val = torch.tensor([targets[i]])
            
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output, target_val)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        if (epoch + 1) % 10 == 0:
            print(f"  Pre-training Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(inputs):.4f}")

    print("--- Pre-training Complete ---")
    return model


In [7]:

# --- 2. Phase 2: Supervised Fine-Tuning (SFT) ---
def simulate_sft(model, tokenizer, sft_data, epochs=30):
    print("\n--- Starting Phase 2: Supervised Fine-Tuning (SFT) ---")
    print("Goal: Teach the model to follow instructions in a question-answer format.")
    
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        total_loss = 0
        for prompt, ideal_response in sft_data.items():
            # The input is the prompt, and the target is the ideal response
            input_tokens = tokenizer.encode(prompt)
            target_tokens = tokenizer.encode(ideal_response)
            
            # We'll train the model to generate the response one word at a time
            for i in range(len(target_tokens)):
                current_input = torch.tensor([input_tokens + target_tokens[:i]])
                current_target = torch.tensor([target_tokens[i]])
                
                optimizer.zero_grad()
                output = model(current_input)
                loss = criterion(output, current_target)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

        if (epoch + 1) % 10 == 0:
            print(f"  SFT Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(sft_data):.4f}")
            
    print("--- SFT Complete ---")
    return model



In [8]:
# --- 3. Phase 3: Reinforcement Learning with Human Feedback (RLHF) ---

# Step 3a: Simulate a Reward Model
def get_reward_score(response):
    """
    A simple, rule-based reward model. In reality, this would be a separate,
    trained neural network.
    """
    score = 0
    if "i can help" in response: score += 1.5 # Prefers helpfulness
    if "of course" in response: score += 1.0
    if "sun is hot" in response: score += 1.0 # Prefers factual correctness
    if "sky is blue" in response: score += 1.0
    if "i am a bot" in response: score -= 2.0 # Penalizes unhelpful or robotic answers
    if len(response.split()) < 5: score -= 1.0 # Penalizes short, uninformative answers
    return score


In [9]:

# Step 3b: Fine-tune with Reinforcement Learning
def simulate_rlhf(model, tokenizer, prompts, iterations=50):
    print("\n--- Starting Phase 3: Reinforcement Learning with Human Feedback (RLHF) ---")
    print("Goal: Refine the model based on preferences (what makes a 'good' answer).")
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for i in range(iterations):
        # Pick a random prompt and generate a response
        prompt = random.choice(prompts)
        response_str = generate_text(model, tokenizer, prompt, max_len=15)
        
        # Get a score from our simulated reward model
        reward = get_reward_score(response_str)
        
        # This is a simplified version of a policy gradient update (like PPO).
        # We calculate a "loss" that is inversely proportional to the reward.
        # A high reward means a low loss, and a low reward means a high loss.
        # This encourages the model to generate responses that get high rewards.
        loss = -reward 
        
        # We need to backpropagate this "loss" through the generation process.
        # This is complex, so we'll simulate it by treating the reward as a loss
        # on the log probabilities of the generated tokens.
        
        # A simplified pseudo-loss calculation
        log_probs = []
        input_tokens = tokenizer.encode(prompt)
        response_tokens = tokenizer.encode(response_str.replace(prompt, '').strip())
        
        for token in response_tokens:
            input_tensor = torch.tensor([input_tokens])
            output = model(input_tensor)
            log_prob = torch.log_softmax(output, dim=1)[0, token]
            log_probs.append(log_prob)
            input_tokens.append(token)
            
        if log_probs:
            policy_loss = -torch.stack(log_probs).mean() * reward
            
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()
        
        if (i + 1) % 10 == 0:
            print(f"  RLHF Iteration {i+1}/{iterations}, Prompt: '{prompt}', Reward: {reward:.2f}")

    print("--- RLHF Complete ---")
    return model



In [10]:

# --- Main Execution ---
#if __name__ == "__main__":
    # --- Data and Vocabulary Setup ---
pretrain_corpus = [
        "the sun is hot <end>",
        "the sky is blue <end>",
        "i like to code <end>",
        "what is your name <end>",
        "my name is bot <end>",
        "how can i help you <end>",
    ]
    
    # Add a special <end> token to our vocabulary
full_corpus = pretrain_corpus + ["<end>", "of course i can help", "i am a bot"]
tokenizer = SimpleTokenizer(full_corpus)
vocab_size = len(tokenizer.word_to_idx)
    
sft_dataset = {
        "what is the sun": "the sun is hot <end>",
        "what color is the sky": "the sky is blue <end>",
        "can you help me": "of course i can help <end>"
    }
    
rlhf_prompts = ["can you help me", "what is your name", "what is the sun"]

    # --- Initialize the Model ---
toy_model = ToyLLM(vocab_size=vocab_size, embed_dim=10, hidden_dim=20)

    # --- Run the Pipeline ---
print("--- Initial Untrained Model ---")
print(f"Prompt: 'the sun is' -> Response: '{generate_text(toy_model, tokenizer, 'the sun is')}'")
    
    # Phase 1
base_model = simulate_pretraining(toy_model, tokenizer, pretrain_corpus)
print("\n--- Model after Pre-training ---")
print(f"Prompt: 'the sun is' -> Response: '{generate_text(base_model, tokenizer, 'the sun is')}'")
print(f"Prompt: 'can you help me' -> Response: '{generate_text(base_model, tokenizer, 'can you help me')}'")

    # Phase 2
sft_model = simulate_sft(base_model, tokenizer, sft_dataset)
print("\n--- Model after SFT ---")
print(f"Prompt: 'what is the sun' -> Response: '{generate_text(sft_model, tokenizer, 'what is the sun')}'")
print(f"Prompt: 'can you help me' -> Response: '{generate_text(sft_model, tokenizer, 'can you help me')}'")
print(f"Prompt: 'what is your name' -> Response: '{generate_text(sft_model, tokenizer, 'what is your name')}'") # May still give a poor answer

    # Phase 3
rlhf_model = simulate_rlhf(sft_model, tokenizer, rlhf_prompts)
print("\n--- Final Model after RLHF ---")
print("Note how the model now prefers the more 'helpful' sounding response.")
print(f"Prompt: 'what is the sun' -> Response: '{generate_text(rlhf_model, tokenizer, 'what is the sun')}'")
print(f"Prompt: 'can you help me' -> Response: '{generate_text(rlhf_model, tokenizer, 'can you help me')}'")
print(f"Prompt: 'what is your name' -> Response: '{generate_text(rlhf_model, tokenizer, 'what is your name')}'") #

--- Initial Untrained Model ---
Prompt: 'the sun is' -> Response: 'the sun is sun of of you sun of you sun of you'

--- Starting Phase 1: Foundational Pre-training ---
Goal: Teach the model basic language structure by predicting the next word.
  Pre-training Epoch 10/50, Loss: 0.2506
  Pre-training Epoch 20/50, Loss: 0.0949
  Pre-training Epoch 30/50, Loss: 0.0771
  Pre-training Epoch 40/50, Loss: 0.0709
  Pre-training Epoch 50/50, Loss: 0.0677
--- Pre-training Complete ---

--- Model after Pre-training ---
Prompt: 'the sun is' -> Response: 'the sun is hot <end>'


KeyError: 'me'