In [None]:
# 🔧 Setup: run this cell before any other.
# Reports the available compute device and library versions, then seeds
# every random number generator used by the notebook.

import random
import sys

import numpy as np
import torch

SEED = 42

# Query CUDA once; the answer drives both device selection and GPU seeding.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')

if has_gpu:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed Python, NumPy and PyTorch (CPU, plus every GPU when present)
# so repeated runs draw the same random numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if has_gpu:
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# NovaMind AI: Adaptive Content Recommendation with Policy Gradient Methods -- Implementation Notebook

## Environment Setup

In [None]:
!pip install -q gymnasium torch numpy matplotlib seaborn scipy

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

torch.manual_seed(42)
np.random.seed(42)

print("Setup complete!")

## 1. Simulated Learner Environment

We simulate an educational platform where learners interact with content modules. The environment models learner knowledge, engagement, and progression.

In [None]:
class LearnerEnvironment:
    """
    Simulated environment for adaptive content recommendation.
    Models learner behavior as an MDP.

    Each module has a difficulty, a topic and a quality score, all drawn from
    NumPy's global random generator at construction time. A session (episode)
    tracks per-topic learner knowledge, an engagement level that decays on
    failed modules, and the set of modules already shown.

    State vector layout (length ``state_dim``, float32, zero padded):
        [0 : num_topics]    per-topic knowledge
        [num_topics]        initial learner level
        [num_topics + 1]    session progress (step_count / max_steps)
        [num_topics + 2]    session engagement
        [num_topics + 3]    fraction of distinct modules seen
    """

    # Number of scalar features stored after the per-topic knowledge slots.
    _NUM_SCALAR_FEATURES = 4

    def __init__(self, num_modules=100, state_dim=32, num_topics=10, max_steps=15,
                 embed_dim=16):
        """
        Args:
            num_modules: number of content modules (size of the action space).
            state_dim: length of the state vector returned by reset()/step().
            num_topics: number of distinct topics modules are assigned to.
            max_steps: maximum number of recommendations per session.
            embed_dim: width of the random module embeddings (default 16, the
                value that was previously hard-coded).

        Raises:
            ValueError: if ``state_dim`` cannot hold the knowledge vector plus
                the scalar features.
        """
        # Fail fast: otherwise the mismatch only surfaces later, inside
        # _get_state(), as an indexing/broadcast error.
        min_state_dim = num_topics + self._NUM_SCALAR_FEATURES
        if state_dim < min_state_dim:
            raise ValueError(
                f"state_dim={state_dim} is too small: need at least "
                f"num_topics + {self._NUM_SCALAR_FEATURES} = {min_state_dim}"
            )

        self.num_modules = num_modules
        self.state_dim = state_dim
        self.num_topics = num_topics
        self.max_steps = max_steps

        # Module properties (draw order is kept stable so seeded runs reproduce)
        self.module_difficulty = np.random.uniform(0, 1, num_modules)
        self.module_topic = np.random.randint(0, num_topics, num_modules)
        self.module_quality = np.random.uniform(0.5, 1.0, num_modules)

        # Module embeddings for scoring
        self.module_embeddings = np.random.randn(num_modules, embed_dim).astype(np.float32)

    def reset(self, learner_level=None):
        """
        Reset for a new learner session.

        Args:
            learner_level: initial knowledge level applied to every topic.
                When None, a level is drawn uniformly from [0.1, 0.9).

        Returns:
            The initial state vector (see class docstring for the layout).
        """
        if learner_level is None:
            learner_level = np.random.uniform(0.1, 0.9)

        self.learner_level = learner_level
        self.knowledge = np.full(self.num_topics, learner_level)
        self.step_count = 0
        self.modules_seen = set()
        self.session_engagement = 1.0  # Decays if bad recommendations

        return self._get_state()

    def _get_state(self):
        """Construct the state vector from the current session variables."""
        state = np.zeros(self.state_dim, dtype=np.float32)
        state[:self.num_topics] = self.knowledge
        state[self.num_topics] = self.learner_level
        state[self.num_topics + 1] = self.step_count / self.max_steps
        state[self.num_topics + 2] = self.session_engagement
        state[self.num_topics + 3] = len(self.modules_seen) / self.num_modules
        return state

    def step(self, action):
        """
        Learner interacts with the recommended module.

        Args:
            action: index of the module to recommend, in [0, num_modules).

        Returns:
            (next_state, reward, done, info) where ``info`` has the keys
            "completed", "difficulty_gap", "knowledge_gain" and "topic".
        """
        assert 0 <= action < self.num_modules

        difficulty = self.module_difficulty[action]
        topic = self.module_topic[action]
        quality = self.module_quality[action]

        # Completion probability depends on difficulty-level match
        difficulty_gap = abs(difficulty - self.knowledge[topic])
        completion_prob = max(0.1, 1.0 - 2.0 * difficulty_gap) * quality

        completed = np.random.random() < completion_prob

        # Compute reward
        reward = 0.0
        if completed:
            reward += 1.0
            # Knowledge gain shrinks as the learner approaches mastery (1.0)
            gain = 0.05 * (1.0 - self.knowledge[topic])
            self.knowledge[topic] = min(1.0, self.knowledge[topic] + gain)
            # Quiz bonus (simulated): noisy read-out of the updated knowledge
            quiz_score = min(1.0, self.knowledge[topic] + np.random.normal(0, 0.1))
            reward += 0.5 * max(0, quiz_score)
        else:
            reward -= 0.3
            self.session_engagement *= 0.9

        # Engagement bonus for appropriate difficulty
        if difficulty_gap < 0.15:
            reward += 0.3

        # Novelty penalty for repeated modules
        if action in self.modules_seen:
            reward -= 0.5

        self.modules_seen.add(action)
        self.step_count += 1

        # Session ends at the step limit or once engagement has decayed too far
        done = (self.step_count >= self.max_steps) or (self.session_engagement < 0.3)
        info = {
            "completed": completed,
            "difficulty_gap": difficulty_gap,
            # `gain` only exists on the completed branch, hence the conditional
            "knowledge_gain": gain if completed else 0,
            "topic": topic
        }

        return self._get_state(), reward, done, info

    def get_candidates(self, k=20):
        """
        Return the top-k candidate modules based on topic relevance.

        A module's score is ``1 - |knowledge[topic] - difficulty|`` plus a 0.2
        novelty bonus when it has not been shown in this session.

        Args:
            k: number of candidates wanted. Values larger than ``num_modules``
                return every module; ``k <= 0`` returns an empty array.

        Returns:
            Array of module indices ordered by ascending score.
        """
        # Guard: slicing with [-0:] would return *all* modules, not none.
        if k <= 0:
            return np.empty(0, dtype=np.intp)

        # Prefer modules whose difficulty matches current topic knowledge
        # (zone of proximal development); computed for all modules at once.
        topic_scores = 1.0 - np.abs(self.knowledge[self.module_topic] - self.module_difficulty)

        # Novelty bonus for modules not yet seen this session
        unseen = np.ones(self.num_modules, dtype=bool)
        if self.modules_seen:
            unseen[list(self.modules_seen)] = False
        topic_scores[unseen] += 0.2

        return np.argsort(topic_scores)[-k:]

# Build the shared environment, start one session, and report its basic facts.
env = LearnerEnvironment()
state = env.reset()
print(
    f"State dimension: {state.shape}",
    f"Number of modules: {env.num_modules}",
    f"Initial learner level: {env.learner_level:.3f}",
    sep="\n",
)

## 2. Exploratory Data Analysis

In [None]:
# Collect episodes with random policy
def collect_random_episodes(env, num_episodes=500):
    """
    Roll out sessions in which each action is drawn uniformly at random
    from the environment's top-20 candidate list.

    Args:
        env: environment exposing reset(), get_candidates(k) and step(action).
        num_episodes: number of sessions to simulate.

    Returns:
        Three parallel lists with one entry per session:
        (total rewards, session lengths, completion rates).
    """
    episode_rewards, episode_lengths, completion_rates = [], [], []

    for _ in range(num_episodes):
        env.reset()
        session_return = 0
        num_completed = 0
        num_steps = 0
        finished = False

        while not finished:
            pool = env.get_candidates(k=20)
            picked = np.random.choice(pool)
            _, step_reward, finished, step_info = env.step(picked)

            session_return += step_reward
            num_completed += 1 if step_info["completed"] else 0
            num_steps += 1

        episode_rewards.append(session_return)
        episode_lengths.append(num_steps)
        # max(..., 1) protects against a zero-length session
        completion_rates.append(num_completed / max(num_steps, 1))

    return episode_rewards, episode_lengths, completion_rates

# Roll out the random policy and summarise it with three histograms.
rewards, lengths, completions = collect_random_episodes(env)

mean_reward = np.mean(rewards)
mean_length = np.mean(lengths)
mean_completion = np.mean(completions)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
reward_ax, length_ax, completion_ax = axes

# Styling shared by all histograms / mean markers
hist_style = dict(edgecolor='white', alpha=0.8)
mean_line_style = dict(color='red', linestyle='--')

reward_ax.hist(rewards, bins=30, color='steelblue', **hist_style)
reward_ax.set(xlabel='Episode Reward', title='Reward Distribution (Random Policy)')
reward_ax.axvline(mean_reward, label=f'Mean: {mean_reward:.2f}', **mean_line_style)
reward_ax.legend()

length_ax.hist(lengths, bins=15, color='seagreen', **hist_style)
length_ax.set(xlabel='Episode Length', title='Session Length Distribution')

completion_ax.hist(completions, bins=20, color='coral', **hist_style)
completion_ax.set(xlabel='Completion Rate', title='Module Completion Rate')
completion_ax.axvline(mean_completion, label=f'Mean: {mean_completion:.2%}', **mean_line_style)
completion_ax.legend()

plt.tight_layout()
plt.show()

print("Random policy baseline:")
print(f"  Mean reward: {mean_reward:.2f}")
print(f"  Mean session length: {mean_length:.1f}")
print(f"  Mean completion rate: {mean_completion:.2%}")

## 3. Baseline: Rule-Based Recommender

In [None]:
class RuleBasedRecommender:
    """Baseline: pick the candidate whose difficulty is nearest the learner's topic knowledge."""

    def recommend(self, state, candidates, env):
        """
        Choose one module from ``candidates``.

        Args:
            state: state vector; its first ``env.num_topics`` entries are read
                as the learner's per-topic knowledge.
            candidates: non-empty sequence of module indices.
            env: object providing ``num_topics``, ``module_topic`` and
                ``module_difficulty``.

        Returns:
            The candidate with the smallest |difficulty - knowledge[topic]|;
            the earliest candidate wins ties.
        """
        knowledge = state[:env.num_topics]

        # Start from the first candidate so a result always exists.
        chosen, smallest_gap = candidates[0], float('inf')

        for module_id in candidates:
            topic_knowledge = knowledge[env.module_topic[module_id]]
            mismatch = abs(env.module_difficulty[module_id] - topic_knowledge)
            # Strict '<' keeps the earliest candidate on ties.
            if mismatch < smallest_gap:
                chosen, smallest_gap = module_id, mismatch

        return chosen

# Evaluate the rule-based baseline over 500 simulated sessions
baseline = RuleBasedRecommender()
baseline_rewards, baseline_completions = [], []

for _ in range(500):
    state = env.reset()
    total_reward, completions, steps = 0, 0, 0
    done = False

    while not done:
        candidates = env.get_candidates(k=20)
        state, reward, done, info = env.step(baseline.recommend(state, candidates, env))
        total_reward += reward
        completions += 1 if info["completed"] else 0
        steps += 1

    baseline_rewards.append(total_reward)
    # max(..., 1) protects against a zero-length session
    baseline_completions.append(completions / max(steps, 1))

print("Rule-based baseline:")
print(f"  Mean reward: {np.mean(baseline_rewards):.2f}")
print(f"  Mean completion rate: {np.mean(baseline_completions):.2%}")

## 4. Actor-Critic Content Recommender

In [None]:
class ContentActor(nn.Module):
    """
    Policy network for content recommendation.

    Encodes the learner state once, pairs that encoding with every candidate
    module embedding, scores each pair, and turns the scores into a
    probability distribution over the candidates.
    """
    def __init__(self, state_dim, module_embed_dim=16, hidden_dim=128):
        super().__init__()
        encoding_dim = 64  # width of the encoded state handed to the scorer
        self.state_encoder = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
        )
        self.scorer = nn.Sequential(
            nn.Linear(encoding_dim + module_embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, state, candidate_embeddings):
        """
        Args:
            state: tensor of shape (state_dim,)
            candidate_embeddings: tensor of shape (K, embed_dim)

        Returns:
            Tensor of shape (K,) holding softmax probabilities over the
            K candidates.
        """
        state_code = self.state_encoder(state)                       # (64,)
        num_candidates = candidate_embeddings.shape[0]
        tiled_code = state_code.unsqueeze(0).expand(num_candidates, -1)  # (K, 64)

        pair_features = torch.cat((tiled_code, candidate_embeddings), dim=-1)  # (K, 64+embed_dim)
        logits = self.scorer(pair_features).squeeze(-1)              # (K,)
        return torch.softmax(logits, dim=-1)

    def sample_action(self, state, candidate_embeddings, candidates):
        """
        Sample one candidate from the policy.

        Returns:
            (chosen entry of ``candidates``, log-probability of the sampled
            index, entropy of the candidate distribution).
        """
        policy = torch.distributions.Categorical(self.forward(state, candidate_embeddings))
        choice = policy.sample()
        return candidates[choice.item()], policy.log_prob(choice), policy.entropy()


class ContentCritic(nn.Module):
    """
    Value network: estimates expected session return.

    A three-layer MLP mapping a state vector to a single scalar value.
    """
    def __init__(self, state_dim, hidden_dim=128):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the value estimate with the trailing size-1 dimension removed."""
        value = self.net(state)
        return value.squeeze(-1)

# Build both networks (actor first, then critic) and report their sizes.
actor = ContentActor(state_dim=32)
critic = ContentCritic(state_dim=32)

# Count parameters
actor_params, critic_params = (
    sum(p.numel() for p in net.parameters()) for net in (actor, critic)
)
print(f"Actor parameters: {actor_params:,}")
print(f"Critic parameters: {critic_params:,}")
print(f"Total parameters: {actor_params + critic_params:,}")

## 5. Training Loop

In [None]:
# --- Hyperparameters ---
GAMMA = 0.95           # discount applied when accumulating returns
ENTROPY_COEFF = 0.01   # weight of the entropy term in the total loss
CRITIC_COEFF = 0.5     # weight of the critic's MSE loss in the total loss
NUM_EPISODES = 2000
LR_ACTOR = 3e-4
LR_CRITIC = 1e-3

# Fresh networks and one Adam optimizer per network
actor = ContentActor(state_dim=32)
critic = ContentCritic(state_dim=32)
actor_opt = torch.optim.Adam(actor.parameters(), lr=LR_ACTOR)
critic_opt = torch.optim.Adam(critic.parameters(), lr=LR_CRITIC)

# Per-episode training statistics (consumed by the plotting cell)
reward_history, completion_history, entropy_history = [], [], []

for episode in range(NUM_EPISODES):
    state = env.reset()
    log_probs, entropies, values, rewards_ep = [], [], [], []
    completions_ep = 0
    done = False

    # ---- Roll out one full session with the current policy ----
    while not done:
        state_t = torch.tensor(state, dtype=torch.float32)
        candidates = env.get_candidates(k=20)
        cand_embeds = torch.tensor(env.module_embeddings[candidates], dtype=torch.float32)

        # Actor samples a module; critic scores the state it was sampled in
        action, log_prob, entropy = actor.sample_action(state_t, cand_embeds, candidates)
        value = critic(state_t)

        # Advance the environment
        state, reward, done, info = env.step(action)
        completions_ep += 1 if info["completed"] else 0

        log_probs.append(log_prob)
        entropies.append(entropy)
        values.append(value)
        rewards_ep.append(reward)

    # ---- Discounted returns, accumulated from the last step backwards ----
    returns = []
    running_return = 0
    for r in rewards_ep[::-1]:
        running_return = r + GAMMA * running_return
        returns.append(running_return)
    returns.reverse()
    returns_t = torch.tensor(returns, dtype=torch.float32)

    # Stack the per-step tensors into (T,) tensors
    log_probs_t = torch.stack(log_probs)
    values_t = torch.stack(values)
    entropies_t = torch.stack(entropies)

    # Advantages: return minus (detached) value estimate, normalised per
    # episode whenever there is more than one step to normalise over
    advantages = returns_t - values_t.detach()
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # ---- Losses ----
    actor_loss = -torch.mean(log_probs_t * advantages)
    critic_loss = F.mse_loss(values_t, returns_t)
    mean_entropy = entropies_t.mean()
    entropy_loss = -mean_entropy   # minimising this maximises entropy

    total_loss = actor_loss + CRITIC_COEFF * critic_loss + ENTROPY_COEFF * entropy_loss

    # ---- Single backward pass, then step both optimizers ----
    actor_opt.zero_grad()
    critic_opt.zero_grad()
    total_loss.backward()
    actor_opt.step()
    critic_opt.step()

    # ---- Bookkeeping ----
    ep_reward = sum(rewards_ep)
    ep_completion = completions_ep / max(len(rewards_ep), 1)
    reward_history.append(ep_reward)
    completion_history.append(ep_completion)
    entropy_history.append(mean_entropy.item())

    # Progress report: trailing 200-episode averages
    if (episode + 1) % 200 == 0:
        avg_r, avg_c, avg_e = (
            np.mean(history[-200:])
            for history in (reward_history, completion_history, entropy_history)
        )
        print(f"Ep {episode+1:5d} | Reward: {avg_r:.2f} | Completion: {avg_c:.2%} | Entropy: {avg_e:.2f}")

print("\nTraining complete!")

## 6. Evaluation

In [None]:
# Evaluate trained model vs baseline
def evaluate(env, model_fn, num_episodes=500):
    """
    Run ``model_fn`` for ``num_episodes`` sessions and aggregate the outcomes.

    Args:
        env: environment exposing reset() and step(action).
        model_fn: callable ``(state, env) -> action``.
        num_episodes: number of sessions to simulate.

    Returns:
        dict with the mean session "reward", mean "completion" rate and mean
        session "length", plus the raw per-session "rewards_list" and
        "completions_list".
    """
    rewards_list, completions_list, lengths = [], [], []

    for _ in range(num_episodes):
        state = env.reset()
        episode_return = 0
        num_completed = 0
        num_steps = 0
        finished = False

        while not finished:
            chosen = model_fn(state, env)
            state, reward, finished, info = env.step(chosen)
            episode_return += reward
            num_completed += 1 if info["completed"] else 0
            num_steps += 1

        rewards_list.append(episode_return)
        # max(..., 1) protects against a zero-length session
        completions_list.append(num_completed / max(num_steps, 1))
        lengths.append(num_steps)

    return {
        "reward": np.mean(rewards_list),
        "completion": np.mean(completions_list),
        "length": np.mean(lengths),
        "rewards_list": rewards_list,
        "completions_list": completions_list,
    }

# Model inference functions
def pg_recommend(state, env):
    """
    Greedy recommendation from the module-level ``actor`` network.

    Scores the environment's top-20 candidates and returns the one the
    policy assigns the highest probability (no sampling, no gradients).
    """
    candidate_ids = env.get_candidates(k=20)
    state_t = torch.tensor(state, dtype=torch.float32)
    embeds_t = torch.tensor(env.module_embeddings[candidate_ids], dtype=torch.float32)

    with torch.no_grad():
        best_idx = actor(state_t, embeds_t).argmax().item()

    return candidate_ids[best_idx]

def baseline_recommend(state, env):
    """Adapter giving the module-level ``baseline`` recommender the ``(state, env)`` signature."""
    return baseline.recommend(state, env.get_candidates(k=20), env)

# Evaluate the trained policy first, then the baseline (order fixes RNG usage)
pg_results = evaluate(env, pg_recommend)
bl_results = evaluate(env, baseline_recommend)

# Side-by-side table with relative lift of the policy over the baseline
print(f"{'Metric':<25s} {'Baseline':>10s} {'Policy Gradient':>15s} {'Lift':>10s}")
print("-" * 62)
for metric in ("reward", "completion", "length"):
    bl_val, pg_val = bl_results[metric], pg_results[metric]
    if bl_val != 0:
        lift = (pg_val - bl_val) / abs(bl_val) * 100
    else:
        lift = 0  # relative lift is undefined against a zero baseline
    fmt = ".2%" if metric == "completion" else ".2f"
    print(f"{metric:<25s} {bl_val:>10{fmt}} {pg_val:>15{fmt}} {lift:>9.1f}%")

## 7. Results Visualization

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
reward_ax, completion_ax = axes[0, 0], axes[0, 1]
entropy_ax, compare_ax = axes[1, 0], axes[1, 1]

# Moving-average kernel shared by the three training curves
window = 50
kernel = np.ones(window) / window

# Training reward curve
smoothed_r = np.convolve(reward_history, kernel, mode='valid')
reward_ax.plot(range(window-1, len(reward_history)), smoothed_r, color='steelblue', linewidth=2)
reward_ax.axhline(bl_results["reward"], color='red', linestyle='--', label='Baseline')
reward_ax.set(xlabel='Episode', ylabel='Session Reward', title='Training Reward Curve')
reward_ax.legend()
reward_ax.grid(True, alpha=0.3)

# Completion rate
smoothed_c = np.convolve(completion_history, kernel, mode='valid')
completion_ax.plot(range(window-1, len(completion_history)), smoothed_c, color='seagreen', linewidth=2)
completion_ax.axhline(bl_results["completion"], color='red', linestyle='--', label='Baseline')
completion_ax.set(xlabel='Episode', ylabel='Completion Rate', title='Training Completion Rate')
completion_ax.legend()
completion_ax.grid(True, alpha=0.3)

# Entropy
smoothed_e = np.convolve(entropy_history, kernel, mode='valid')
entropy_ax.plot(range(window-1, len(entropy_history)), smoothed_e, color='coral', linewidth=2)
entropy_ax.set(xlabel='Episode', ylabel='Policy Entropy', title='Policy Entropy Over Training')
entropy_ax.grid(True, alpha=0.3)

# Comparison bar chart: one pair of bars per summary metric
metrics = ['Reward', 'Completion\nRate', 'Session\nLength']
result_keys = ('reward', 'completion', 'length')
bl_vals = [bl_results[key] for key in result_keys]
pg_vals = [pg_results[key] for key in result_keys]

x = np.arange(len(metrics))
w = 0.35
compare_ax.bar(x - w/2, bl_vals, w, label='Rule-based Baseline', color='lightcoral')
compare_ax.bar(x + w/2, pg_vals, w, label='Policy Gradient', color='steelblue')
compare_ax.set_xticks(x)
compare_ax.set_xticklabels(metrics)
compare_ax.set_title('Final Evaluation: Baseline vs Policy Gradient')
compare_ax.legend()
compare_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 8. A/B Test Simulation

In [None]:
# Simulate A/B test
np.random.seed(42)

n_users = 500


def _run_session(policy_fn, learner_level=None):
    """Play one full session on the shared `env` under `policy_fn`; return its total reward.

    `learner_level=None` lets the environment draw a random level;
    passing a value starts the session at exactly that level.
    """
    state = env.reset(learner_level=learner_level)
    total_r = 0
    done = False
    while not done:
        state, r, done, _ = env.step(policy_fn(state, env))
        total_r += r
    return total_r


bl_session_rewards = []
pg_session_rewards = []

for _ in range(n_users):
    # Baseline user: the environment draws this user's learner level.
    bl_session_rewards.append(_run_session(baseline_recommend))

    # PG user (same initial conditions): replay the learner level that the
    # baseline session just drew, so both arms start from the same point.
    pg_session_rewards.append(_run_session(pg_recommend, learner_level=env.learner_level))

# Statistical test
# NOTE(review): the two arms are matched on learner level, so a paired test
# (stats.ttest_rel) may be more appropriate than the independent-samples
# test used here — confirm the intended analysis before changing it.
t_stat, p_value = stats.ttest_ind(pg_session_rewards, bl_session_rewards)

bl_mean = np.mean(bl_session_rewards)
pg_mean = np.mean(pg_session_rewards)
# Guard the relative lift against a zero baseline, matching the guard used
# in the evaluation table earlier in the notebook.
lift = (pg_mean - bl_mean) / abs(bl_mean) * 100 if bl_mean != 0 else 0

print(f"A/B Test Results (n={n_users} per group):")
print(f"  Baseline mean reward:       {bl_mean:.2f}")
print(f"  Policy Gradient mean reward: {pg_mean:.2f}")
print(f"  Lift: {lift:.1f}%")
print(f"  t-statistic: {t_stat:.3f}")
print(f"  p-value: {p_value:.6f}")
print(f"  Significant at alpha=0.05: {'Yes' if p_value < 0.05 else 'No'}")