1. Installation and Setup with veRL

In [8]:
# First, let's try to install veRL correctly
# !pip uninstall -y vel-rl vel  # Clean any existing installations
!pip install git+https://github.com/facebookresearch/vel.git # Commented out due to git clone error

# If the above doesn't work, let's use a more reliable approach
!pip install stable-baselines3
!pip install gym
!pip install shimmy>=0.2.1 # Install shimmy for Gym compatibility with Stable-Baselines3

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import random
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback



  return datetime.utcnow().replace(tzinfo=utc)




2. Create Education Dataset

In [4]:
def create_education_dataset():
    data = [
        {
            "question": "What is photosynthesis?",
            "expected_response": "Photosynthesis is the process where plants use sunlight, water and carbon dioxide to create oxygen and energy in the form of sugar.",
            "difficulty": "beginner"
        },
        {
            "question": "Solve 2x + 5 = 15",
            "expected_response": "First, subtract 5 from both sides: 2x = 10. Then divide both sides by 2: x = 5",
            "difficulty": "intermediate"
        },
        {
            "question": "What is Newton's first law of motion?",
            "expected_response": "Newton's first law states that an object at rest stays at rest and an object in motion stays in motion with the same speed and in the same direction unless acted upon by an unbalanced force.",
            "difficulty": "intermediate"
        },
        {
            "question": "Explain the water cycle",
            "expected_response": "The water cycle describes how water evaporates from the Earth's surface, rises into the atmosphere, cools and condenses into rain or snow in clouds, and falls again to the surface as precipitation.",
            "difficulty": "beginner"
        },
        {
            "question": "What is the Pythagorean theorem?",
            "expected_response": "The Pythagorean theorem states that in a right triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides: a¬≤ + b¬≤ = c¬≤",
            "difficulty": "intermediate"
        },
        {
            "question": "What causes seasons on Earth?",
            "expected_response": "Seasons are caused by the tilt of Earth's axis as it orbits the Sun, not by changes in distance from the Sun.",
            "difficulty": "intermediate"
        },
        {
            "question": "How do plants reproduce?",
            "expected_response": "Plants can reproduce sexually through flowers and seeds, or asexually through methods like runners, bulbs, or cuttings.",
            "difficulty": "beginner"
        }
    ]
    return Dataset.from_list(data)

education_dataset = create_education_dataset()
print("Education dataset created with", len(education_dataset), "samples")
print("\nSample data:")
for i in range(2):
    print(f"Q: {education_dataset[i]['question']}")
    print(f"A: {education_dataset[i]['expected_response']}\n")

Education dataset created with 7 samples

Sample data:
Q: What is photosynthesis?
A: Photosynthesis is the process where plants use sunlight, water and carbon dioxide to create oxygen and energy in the form of sugar.

Q: Solve 2x + 5 = 15
A: First, subtract 5 from both sides: 2x = 10. Then divide both sides by 2: x = 5



3. Initialize Language Model

In [9]:
class EducationLanguageModel:
    def __init__(self):
        self.model_name = "microsoft/DialoGPT-small"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
        self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        print(f"Model loaded on: {self.device}")

    def generate_response(self, question, max_length=100):
        try:
            inputs = self.tokenizer.encode(question + self.tokenizer.eos_token,
                                         return_tensors='pt').to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_length=len(inputs[0]) + max_length,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    attention_mask=torch.ones_like(inputs)
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = response.replace(question, "").strip()
            return response
        except Exception as e:
            print(f"Error in generation: {e}")
            return "I'm not sure how to answer that yet."

    def calculate_similarity(self, text1, text2):
        try:
            embeddings = self.similarity_model.encode([text1, text2])
            similarity = util.cos_sim(embeddings[0], embeddings[1])
            return similarity.item()
        except:
            return 0.0

# Initialize the language model
lm_model = EducationLanguageModel()

Model loaded on: cpu


4. Create Custom Gym Environment

In [10]:
class EducationEnv(gym.Env):
    """Custom Gym environment for educational text generation"""

    def __init__(self, language_model, dataset):
        super(EducationEnv, self).__init__()

        self.language_model = language_model
        self.dataset = dataset
        self.current_idx = 0
        self.step_count = 0
        self.max_steps = 5

        # Define action and observation space
        # Action space: token indices (simplified for demo)
        self.action_space = gym.spaces.Box(
            low=0, high=language_model.tokenizer.vocab_size-1,
            shape=(50,), dtype=np.int32
        )

        # Observation space: question embedding
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(384,), dtype=np.float32
        )

        self.reset()

    def reset(self):
        self.current_idx = (self.current_idx + 1) % len(self.dataset)
        self.current_data = self.dataset[self.current_idx]
        self.step_count = 0

        # Get question embedding as observation
        question_embedding = self.language_model.similarity_model.encode(
            self.current_data['question']
        )

        return question_embedding.astype(np.float32)

    def step(self, action):
        self.step_count += 1

        # Convert action to text (simplified - in practice you'd use the actual generation)
        try:
            # For demo purposes, we'll use the language model to generate response
            # but in real RL, the action would directly control the generation
            generated_response = self.language_model.generate_response(
                self.current_data['question']
            )
        except:
            generated_response = ""

        # Calculate reward
        reward = self._calculate_reward(generated_response)

        # Check if episode is done
        done = self.step_count >= self.max_steps

        # Get next observation
        next_obs = self.reset() if done else self._get_observation()

        info = {
            'question': self.current_data['question'],
            'generated_response': generated_response,
            'expected_response': self.current_data['expected_response'],
            'similarity': self.reward_components['similarity']
        }

        return next_obs, reward, done, info

    def _calculate_reward(self, generated_response):
        expected_response = self.current_data['expected_response']

        # 1. Semantic similarity (main component)
        similarity = self.language_model.calculate_similarity(
            generated_response, expected_response
        )

        # 2. Length appropriateness
        response_length = len(generated_response.split())
        length_score = max(0, 1 - abs(response_length - 40) / 80)

        # 3. Keyword presence
        question_words = set(self.current_data['question'].lower().split()[:3])
        response_words = set(generated_response.lower().split())
        keyword_score = len(question_words.intersection(response_words)) / len(question_words) if question_words else 0.3

        # Combined reward
        total_reward = 0.7 * similarity + 0.15 * length_score + 0.15 * keyword_score

        self.reward_components = {
            'similarity': similarity,
            'length': length_score,
            'keywords': keyword_score
        }

        return total_reward

    def _get_observation(self):
        question_embedding = self.language_model.similarity_model.encode(
            self.current_data['question']
        )
        return question_embedding.astype(np.float32)

    def render(self, mode='human'):
        print(f"Question: {self.current_data['question']}")
        print(f"Expected: {self.current_data['expected_response']}")

# Create environment
env = DummyVecEnv([lambda: EducationEnv(lm_model, education_dataset)])



5. Custom PPO Training with Stable-Baselines3

In [11]:
class TrainingCallback(BaseCallback):
    def __init__(self, check_freq=100, verbose=1):
        super(TrainingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.rewards = []

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            if len(self.model.ep_info_buffer) > 0:
                mean_reward = np.mean([ep_info['r'] for ep_info in self.model.ep_info_buffer])
                self.rewards.append(mean_reward)
                print(f"Step {self.n_calls}, Mean Reward: {mean_reward:.3f}")

                # Test the model
                if hasattr(self, 'test_model'):
                    self.test_model()

        return True

    def test_model(self):
        print("\n--- Testing Current Model ---")
        test_env = EducationEnv(lm_model, education_dataset)
        obs = test_env.reset()

        for i in range(2):  # Test 2 questions
            action = np.random.randint(0, lm_model.tokenizer.vocab_size-1, (50,))
            obs, reward, done, info = test_env.step(action)

            print(f"Test {i+1}:")
            print(f"Q: {info['question']}")
            print(f"A: {info['generated_response'][:80]}...")
            print(f"Similarity: {info['similarity']:.3f}")
            print(f"Reward: {reward:.3f}\n")

            if done:
                obs = test_env.reset()

def train_with_ppo():
    """Train using Stable-Baselines3 PPO"""

    print("Starting PPO Training with Stable-Baselines3...")
    print("=" * 60)

    # PPO configuration
    model = PPO(
        "MlpPolicy",
        env,
        learning_rate=1e-4,
        n_steps=256,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        verbose=1,
        tensorboard_log="./ppo_education_tensorboard/"
    )

    # Create callback
    callback = TrainingCallback(check_freq=50)

    # Train the model
    total_timesteps = 2000  # Reduced for demo purposes

    print(f"Training for {total_timesteps} timesteps...")
    model.learn(
        total_timesteps=total_timesteps,
        callback=callback,
        log_interval=50
    )

    print("Training completed!")
    return model, callback.rewards

# Start training
trained_model, rewards_history = train_with_ppo()

Starting PPO Training with Stable-Baselines3...
Using cpu device
Training for 2000 timesteps...
Logging to ./ppo_education_tensorboard/PPO_1
Training completed!


6. Simplified PPO Implementation (If SB3 has issues)

In [12]:
class SimplePPOTrainer:
    """Simplified PPO implementation for education task"""

    def __init__(self, language_model, learning_rate=1e-5):
        self.language_model = language_model
        self.optimizer = torch.optim.AdamW(language_model.model.parameters(), lr=learning_rate)

    def compute_advantages(self, rewards, values, gamma=0.99, gae_lambda=0.95):
        """Compute advantages using Generalized Advantage Estimation"""
        advantages = []
        advantage = 0

        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * values[t + 1] - values[t] if t < len(rewards) - 1 else 0
            advantage = delta + gamma * gae_lambda * advantage
            advantages.insert(0, advantage)

        return torch.tensor(advantages)

    def train_step(self, questions, generated_responses, expected_responses, rewards, epsilon=0.2):
        """Single PPO training step"""

        losses = []

        for i, question in enumerate(questions):
            # Get the probability of generating the response
            full_text = question + " " + generated_responses[i]
            inputs = self.language_model.tokenizer.encode(full_text, return_tensors='pt')
            inputs = inputs.to(self.language_model.device)

            # Forward pass with gradients
            outputs = self.language_model.model(inputs, labels=inputs)
            current_loss = outputs.loss

            # Simple PPO update (simplified)
            advantage = torch.tensor(rewards[i], device=self.language_model.device)

            # Policy gradient loss
            policy_loss = -current_loss * advantage

            losses.append(policy_loss)

        # Backward pass
        if losses:
            total_loss = torch.stack(losses).mean()
            self.optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.language_model.model.parameters(), max_norm=1.0)
            self.optimizer.step()

        return total_loss.item() if losses else 0.0

def simple_training_loop():
    """Simple training loop without complex RL dependencies"""

    print("Starting Simplified Training Loop...")
    print("=" * 60)

    trainer = SimplePPOTrainer(lm_model)

    num_epochs = 5
    episodes_per_epoch = 10

    for epoch in range(num_epochs):
        epoch_rewards = []
        epoch_losses = []

        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print("-" * 40)

        for episode in range(episodes_per_epoch):
            # Sample random question
            idx = random.randint(0, len(education_dataset) - 1)
            data = education_dataset[idx]

            question = data['question']
            expected = data['expected_response']

            # Generate response
            generated = lm_model.generate_response(question)

            # Calculate reward
            similarity = lm_model.calculate_similarity(generated, expected)
            reward = similarity  # Simple reward based on similarity

            # Training step (simplified)
            loss = trainer.train_step([question], [generated], [expected], [reward])

            epoch_rewards.append(reward)
            epoch_losses.append(loss)

            if episode % 2 == 0:
                print(f"Episode {episode + 1}:")
                print(f"Q: {question}")
                print(f"A: {generated[:60]}...")
                print(f"Similarity: {similarity:.3f}, Loss: {loss:.4f}")
                print("-" * 30)

        # Epoch summary
        avg_reward = np.mean(epoch_rewards)
        avg_loss = np.mean(epoch_losses)

        print(f"\nEpoch {epoch + 1} Summary:")
        print(f"Average Reward: {avg_reward:.3f}")
        print(f"Average Loss: {avg_loss:.4f}")
        print("=" * 50)

# Run simplified training
simple_training_loop()

Starting Simplified Training Loop...

Epoch 1/5
----------------------------------------


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Episode 1:
Q: What is the Pythagorean theorem?
A: It's not even the third quarter yet....
Similarity: 0.032, Loss: -0.1610
------------------------------
Episode 3:
Q: Solve 2x + 5 = 15
A: what is the formula?...
Similarity: 0.226, Loss: -2.7597
------------------------------
Episode 5:
Q: How do plants reproduce?
A: This is what happens when you don't let plants reproduce....
Similarity: 0.555, Loss: -2.8958
------------------------------
Episode 7:
Q: Explain the water cycle
A: What??? I thought water was from our stomachs?...
Similarity: 0.246, Loss: -2.5012
------------------------------
Episode 9:
Q: Solve 2x + 5 = 15
A: 3 : 5 1 :5 2 : 5 3 : 5...
Similarity: 0.327, Loss: -3.1901
------------------------------

Epoch 1 Summary:
Average Reward: 0.321
Average Loss: -2.2585

Epoch 2/5
----------------------------------------
Episode 1:
Q: What is photosynthesis?
A: It's a form of photosynthesis....
Similarity: 0.755, Loss: -4.8794
------------------------------
Episode 3:
Q: What is p

7. Evaluation and Testing

In [13]:
def evaluate_model():
    """Comprehensive model evaluation"""

    print("üß™ MODEL EVALUATION")
    print("=" * 60)

    test_questions = [
        "What is photosynthesis?",
        "How do plants make food?",
        "Explain gravity simply",
        "What is 8 √ó 7?",
        "Why do we have seasons?"
    ]

    similarities = []

    for i, question in enumerate(test_questions, 1):
        response = lm_model.generate_response(question)

        # For evaluation, we'll calculate similarity with expected patterns
        if "photosynthesis" in question.lower():
            expected_keywords = ["sunlight", "plants", "carbon", "oxygen", "energy"]
        elif "food" in question.lower():
            expected_keywords = ["photosynthesis", "sunlight", "energy", "plants"]
        elif "gravity" in question.lower():
            expected_keywords = ["force", "pull", "earth", "mass"]
        elif "8 √ó 7" in question or "8*7" in question:
            expected_keywords = ["56"]
        elif "seasons" in question.lower():
            expected_keywords = ["tilt", "axis", "earth", "sun", "orbit"]
        else:
            expected_keywords = []

        # Calculate keyword-based score
        keyword_score = sum(1 for kw in expected_keywords if kw in response.lower()) / len(expected_keywords) if expected_keywords else 0.5

        # Calculate length score
        length_score = min(1.0, len(response.split()) / 30)

        overall_score = 0.7 * keyword_score + 0.3 * length_score
        similarities.append(overall_score)

        print(f"Test {i}:")
        print(f"Q: {question}")
        print(f"A: {response}")
        print(f"Score: {overall_score:.3f}")
        print(f"Length: {len(response.split())} words")
        print("-" * 50)

    # Final evaluation
    avg_score = np.mean(similarities)
    print(f"\nüìä FINAL EVALUATION:")
    print(f"Average Score: {avg_score:.3f}")

    if avg_score > 0.7:
        print("‚úÖ EXCELLENT - Model is performing well!")
    elif avg_score > 0.5:
        print("‚úÖ GOOD - Model is learning!")
    elif avg_score > 0.3:
        print("‚ö†Ô∏è FAIR - Needs more training")
    else:
        print("‚ùå POOR - Significant improvement needed")

# Run evaluation
evaluate_model()

üß™ MODEL EVALUATION
Test 1:
Q: What is photosynthesis?
A: It's so simple.
Score: 0.030
Length: 3 words
--------------------------------------------------
Test 2:
Q: How do plants make food?
A: Why did you go into your computer and open a new tab??
Score: 0.120
Length: 12 words
--------------------------------------------------
Test 3:
Q: Explain gravity simply
A: They're so cool.
Score: 0.030
Length: 3 words
--------------------------------------------------
Test 4:
Q: What is 8 √ó 7?
A: If you don't have any interest in 8, why would you want to read it?
Score: 0.150
Length: 15 words
--------------------------------------------------
Test 5:
Q: Why do we have seasons?
A: No. Please no.
Score: 0.030
Length: 3 words
--------------------------------------------------

üìä FINAL EVALUATION:
Average Score: 0.072
‚ùå POOR - Significant improvement needed


8. Interactive Demo

In [None]:
def interactive_demo():
    """Interactive chat with the educated model"""

    print("ü§ñ EDUCATION TUTOR DEMO")
    print("Type 'quit' to exit the demo")
    print("=" * 50)

    while True:
        question = input("\nüßë‚Äçüéì Student: ").strip()

        if question.lower() in ['quit', 'exit', 'bye']:
            print("ü§ñ Tutor: Goodbye! Keep learning!")
            break

        if not question:
            continue

        response = lm_model.generate_response(question)
        print(f"ü§ñ Tutor: {response}")

# Start interactive demo
print("\n" + "="*60)
print("üöÄ STARTING INTERACTIVE EDUCATION TUTOR")
print("="*60)
interactive_demo()