<a href="https://colab.research.google.com/github/SheikhMudassarHanif/GenAi/blob/main/GRPO_BASICS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
#simple vocab
vocab=['apple','cherry','date','elderberry','banana']

In [None]:
np.random.seed(42)
logits=np.random.randn(len(vocab))

In [None]:
logits

array([ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986, -0.23415337])

In [None]:
def softmax(x):
  exp_x=np.exp(x-np.max(x))
  return exp_x/np.sum(exp_x)

In [None]:
def sample_word(logits):
  probs=softmax(logits)
  word_idx=np.random.choice(len(vocab),p=probs)
  return word_idx,probs

In [None]:
def reward(word_idx):
  """ for demo if our word starts with vowel it returns 1 else 0
  """
  word=vocab[word_idx]
  return 1.0 if word[0].lower() in 'aeiou' else 0

In [None]:
def group_label(word_idx):
  word=vocab[word_idx]
  return "vowel" if word[0].lower() in 'aeiou' else "consonant"

In [None]:
#hyper params
num_iterations=50
num_episodes=20 #num of how many samples per iterations
learning_rate=0.1

In [None]:
for iteration in range(1,num_iterations+1):
  trajectories=[]
  for _ in range(num_episodes):
    word_idx,probs=sample_word(logits)
    r=reward(word_idx)
    grp=group_label(word_idx)
    trajectories.append((word_idx,r,grp,probs))


  group_rewards={}
  group_counts={}
  for word_idx,r,grp,probs in trajectories:
    group_rewards.setdefault(grp,0)
    group_counts.setdefault(grp,0)
    group_rewards[grp]+=r
    group_counts[grp]+=1

  group_avg={grp:group_rewards[grp]/group_counts[grp] for grp in group_rewards if group_counts[grp]>0}
  grad=np.zeros_like(logits)
  total_advantage=0.0
  for word_idx,r,grp,probs in trajectories:
    adv=r-group_avg[grp]
    total_advantage+=adv
    grad_sample=-probs.copy()
    grad_sample[word_idx]+=1.0
    grad+=grad_sample*adv

  logits+=learning_rate*grad

 #reporting for this iteration
  avg_advantage=total_advantage/num_episodes
  print(f"iteration:{iteration:02d}")
  print("updated Probablities")
  for word,p in zip(vocab,probs):
    print(f"{word:12s}:{p:.3f}")
  print(f", Avg Advantage :{avg_advantage:.3f}\n")





iteration:01
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:02
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:03
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:04
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:05
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:06
updated Probablities
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
, Avg Advantage :0.000

iteration:07
updated Probablities
apple       :0.168
cherry      :0.089
date      

In [None]:
print("FInal Vocab Probablites (after GRPO)")
for word,p in zip(vocab,softmax(logits)):
  print(f"{word:12s}:{p:.3f}")

FInal Vocab Probablites (after GRPO)
apple       :0.168
cherry      :0.089
date        :0.195
elderberry  :0.468
banana      :0.081
