# CSE 546: Reinforcement Learning (Final Project)
## NegotiableAI - Adaptive Reinforcement Learning for Real-Time Buyer–Seller Negotiations

## Submitted by:
Sada Kakarla - 50605634 - sadakaka
<br> Shivansh Gupta - 50604127 - sgupta67
<br> Aditi Sinha - 50593917 - asinha25

### Importing Libraries

In [97]:
import gymnasium as gym
from gymnasium import spaces 
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt

import plotly.express as px
import pandas as pd

### Defining the Negotiation Agent Environmnent 

In [None]:
# Defining the Negotiation Environment between the buyer and seller

class Negotiation_Agent(gym.Env):
    def __init__(self, seller_min_amt, buyer_reservation_price, max_rounds=20,
                 initial_selling_price=None, gamma_seller=1.0, gamma_buyer=1.0,
                 shaping_lambda=1.0, time_penalty=0.5):
        super().__init__()
        self.max_rounds              = max_rounds
        self.seller_min_amt          = seller_min_amt
        self.buyer_reservation_price = buyer_reservation_price
        self.gamma_seller            = gamma_seller
        self.gamma_buyer             = gamma_buyer
        self.shaping_lambda          = shaping_lambda
        self.time_penalty            = time_penalty

        if initial_selling_price is None:
            initial_selling_price = np.random.randint(
                int(buyer_reservation_price), int(buyer_reservation_price * 1.5)
            )
        self.initial_selling_price = initial_selling_price

        low  = np.array([seller_min_amt, 0, 0, 0], dtype=np.float32)
        high = np.array([self.initial_selling_price, max_rounds, 1, 1], dtype=np.float32)
        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
        self.action_space      = spaces.Discrete(3)

    def reset(self):
        self.round             = 0
        self.current_offer     = self.initial_selling_price
        self.last_seller_offer = self.initial_selling_price
        self.last_buyer_offer  = None
        self.turn              = 0  # 0=Buyer, 1=Seller
        self.deal_status       = False
        return self._obs()

    def _obs(self):
        return np.array([
            self.current_offer,
            self.round,
            self.turn,
            float(self.deal_status)
        ], dtype=np.float32)

    def _buyer_potential(self, price):
        return (self.initial_selling_price - price) / max(1, self.initial_selling_price - self.seller_min_amt)

    def _seller_potential(self, price):
        return (price - self.seller_min_amt) / max(1, self.initial_selling_price - self.seller_min_amt)

    def step(self, action):
        done   = False
        reward = {'buyer': -self.time_penalty, 'seller': -self.time_penalty}

        margin_seller = max(1, self.initial_selling_price - self.seller_min_amt)
        margin_buyer  = max(1, self.buyer_reservation_price  - self.seller_min_amt)

        # max rounds timeout
        if self.round >= self.max_rounds:
            done = True
            reward['buyer'] = reward['seller'] = -10.0
            return self._obs(), reward, done, {}

        # ACCEPT
        if action == 1:
            if self.round < 1:
                return self._obs(), {'buyer': -1.0,'seller':-1.0}, False, {}
            done            = True
            self.deal_status = True
            final = self.current_offer
            seller_pot = (final - self.seller_min_amt) / margin_seller
            buyer_pot  = (self.buyer_reservation_price - final) / margin_buyer
            reward['seller'] += self.gamma_seller * seller_pot
            reward['buyer']  += self.gamma_buyer  * buyer_pot

        # REJECT
        elif action == 2:
            done = True
            reward['buyer'] = reward['seller'] = -5.0

        # COUNTEROFFER
        elif isinstance(action, tuple) and action[0] == "Counteroffer":
            new_p = action[1]
            # BUYER
            if self.turn == 0:
                if new_p >= self.current_offer or new_p < self.seller_min_amt or new_p > self.buyer_reservation_price:
                    reward['buyer'] = -3.0
                else:
                    old_pot = (self.buyer_reservation_price - self.current_offer) / margin_buyer
                    new_pot = (self.buyer_reservation_price - new_p) / margin_buyer
                    reward['buyer'] += self.shaping_lambda * (new_pot - old_pot)
                    self.current_offer    = new_p
                    self.last_buyer_offer = new_p
                    self.turn             = 1
                    self.round           += 1
            # SELLER
            else:
                if new_p <= self.current_offer or new_p > self.last_seller_offer:
                    reward['seller'] = -3.0
                else:
                    old_pot = (self.current_offer - self.seller_min_amt) / margin_seller
                    new_pot = (new_p - self.seller_min_amt) / margin_seller
                    reward['seller'] += self.shaping_lambda * (new_pot - old_pot)
                    self.current_offer      = new_p
                    self.last_seller_offer  = new_p
                    self.turn               = 0
                    self.round             += 1

        # INVALID OFFER
        else:
            reward['buyer'] = reward['seller'] = -2.0

        # ran out of rounds 
        if self.round >= self.max_rounds and not done:
            done = True
            reward['buyer']  = -10.0
            reward['seller'] = -20.0

        return self._obs(), reward, done, {}

    def render(self, mode='human'):
        turn_str = "Buyer" if self.turn==0 else "Seller"
        print(f"Round {self.round}, Turn: {turn_str}, Offer: {self.current_offer}, Deal: {self.deal_status}")

### Proximal Policy Optimization (PPO) Algorithm

In [56]:
# ─── 2) PPO Policy ─────────────────────────────────────────────────────────────
class PPOPolicy(nn.Module):
    def __init__(self, obs_dim, hidden_dim=64):
        super().__init__()
        self.backbone   = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        )
        self.act_head   = nn.Linear(hidden_dim, 3)
        self.price_head = nn.Linear(hidden_dim, 1)
        self.val_head   = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        h = self.backbone(x)
        return self.act_head(h), self.price_head(h), self.val_head(h)

    def get_action(self, obs, env):
        logits, price_raw, value = self.forward(obs)
        dist  = Categorical(logits=logits)

        #(1) sample the discrete branch
        a_idx = dist.sample()           # a 0-dim LongTensor in {0,1,2}
        logp  = dist.log_prob(a_idx)    # log‐prob of that choice
        ent   = dist.entropy()          # entropy of the discrete policy

        # (2) build the env‐callable action
        if a_idx.item() == 0:
            # Counteroffer branch
            cur = int(env.current_offer)
            if env.turn == 0:
                lo = env.seller_min_amt if env.last_buyer_offer is None else env.last_buyer_offer
                hi = cur - 1
            else:
                lo = cur + 1
                hi = env.last_seller_offer

            if hi <= lo:
                env_action = 1   # fallback to “Accept”
            else:
                frac  = torch.sigmoid(price_raw).item()
                price = lo + int((hi - lo) * frac)
                env_action = ("Counteroffer", price)
        else:
            # Accept (1) or Reject (2)
            env_action = a_idx.item()

        # (3) return both for training and for env.step
        return env_action, a_idx, logp, value, ent


In [57]:
# GAE
def compute_gae(rews, vals, gamma=0.99, lam=0.95):
    gae  = 0.0
    out  = []
    vals = vals + [0.0]
    for t in reversed(range(len(rews))):
        δ   = rews[t] + gamma * vals[t+1] - vals[t]
        gae = δ + gamma * lam * gae
        out.insert(0, gae)
    return out


### Training the agent with PPO updates

In [None]:
def trial_train(env, policy, opt,
          batch_size=32,      # collect more episodes per update
          epochs=3000,
          gamma=0.99,
          lam=0.95,
          clip=0.2,
          ent_coef=0.01,
          value_coef=0.5,
          K_epochs=4,         # do multiple passes over each batch
          minibatch_size=64):
    device = next(policy.parameters()).device

    pol_losses, val_losses, avg_rews, success_rates = [], [], [], []

    for ep in range(1, epochs+1):
        
        #  COLLECT ROLLOUT 
        obs_buf, act_buf, old_logp_buf = [], [], []
        val_buf, rew_buf = [], []
        ep_rewards = []
        batch_successes = []

        for _ in range(batch_size):
            o, done = env.reset(), False
            while not done:
                low, high = env.observation_space.low, env.observation_space.high
                o_n = (o - low) / (high - low + 1e-8)

                t = torch.tensor(o_n, dtype=torch.float32, device=device)
                action, a_idx, logp, value, ent = policy.get_action(t, env)
                if isinstance(action, tuple):
                    a_idx = torch.tensor(0, device=device)

                obs_buf.append(t)
                act_buf.append(a_idx)
                old_logp_buf.append(logp)
                val_buf.append(value.squeeze())

                o2, rd, done, _ = env.step(action)
                rew_buf.append(rd['buyer'])
                o = o2

            ep_rewards.append(sum(rew_buf[-env.max_rounds:]))
            batch_successes.append(1 if env.deal_status else 0)

        # batch‐tensors
        obs_batch      = torch.stack(obs_buf)
        old_logp_batch = torch.stack(old_logp_buf)
        vals_batch     = torch.stack(val_buf)
        act_batch      = torch.stack(act_buf)
        rews           = rew_buf

        # Compute Advantages & GAE
        advantages = []
        gae = 0.0
        vs = vals_batch.tolist() + [0.0]
        for i in reversed(range(len(rews))):
            delta = rews[i] + gamma * vs[i+1] - vs[i]
            gae = delta + gamma * lam * gae
            advantages.insert(0, gae)
        adv_batch = torch.tensor(advantages, dtype=torch.float32, device=device)
        adv_batch = (adv_batch - adv_batch.mean()) / (adv_batch.std() + 1e-8)
        ret_batch = adv_batch + vals_batch

        success_rate = sum(batch_successes) / batch_size
        success_rates.append(success_rate)
        
        # PPO Updates
        total_ploss = 0.0
        total_vloss = 0.0
        total_ent   = 0.0

        for _ in range(K_epochs):
            perm = torch.randperm(obs_batch.size(0), device=device)
            for start in range(0, obs_batch.size(0), minibatch_size):
                idx        = perm[start:start+minibatch_size]
                ob_mb      = obs_batch[idx]
                old_lp_mb  = old_logp_batch[idx]
                adv_mb     = adv_batch[idx]
                ret_mb     = ret_batch[idx]
                act_mb     = act_batch[idx]

                logits, price_raw, val_pred = policy(ob_mb)
                dist = Categorical(logits=logits)

                new_logp = dist.log_prob(act_mb)
                entropy  = dist.entropy().mean()

                ratio = (new_logp - old_lp_mb).exp()
                pg1   = ratio * adv_mb
                pg2   = torch.clamp(ratio, 1-clip, 1+clip) * adv_mb
                ploss = -torch.min(pg1, pg2).mean()

                vpred        = val_pred.squeeze()
                vals_mb      = vals_batch[idx]
                vpred_clipped= vals_mb + (vpred - vals_mb).clamp(-clip, clip)
                vloss1       = (ret_mb - vpred).pow(2)
                vloss2       = (ret_mb - vpred_clipped).pow(2)
                vloss        = 0.5 * torch.max(vloss1, vloss2).mean()

                total_ploss += ploss
                total_vloss += vloss
                total_ent   += entropy

        # single backward per epoch
        loss = total_ploss + value_coef * total_vloss - ent_coef * total_ent
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(policy.parameters(), 0.5)
        opt.step()

        pol_losses.append(total_ploss.item())
        val_losses.append(total_vloss.item())
        # entropies.append(total_ent.item())
        avg_rews.append(sum(ep_rewards) / batch_size)

        if ep % 100 == 0:
            print(f"Ep {ep:04d} | P-loss {pol_losses[-1]:.3f}  "
                  f"V-loss {val_losses[-1]:.3f}  Success_Rate {success_rate} "
                  f"Avg_Reward {avg_rews[-1]:.3f} "
                  f"Deal Status {env.deal_status}")

    return pol_losses, val_losses, avg_rews, success_rates

In [119]:
seller_min_amt_new = 700
buyer_reservation_price_new = 1200
trial_env    = Negotiation_Agent(seller_min_amt_new, buyer_reservation_price_new, max_rounds=10)
policy = PPOPolicy(trial_env.observation_space.shape[0])
opt    = torch.optim.Adam(policy.parameters(), lr=3e-5)

p_loss, v_loss, avg_r, success_r = trial_train(trial_env, policy, opt,
                                    batch_size=32, epochs=3000)

chkpt = {
    "state.dict": policy.state_dict(),
    "success rates": success_r
}

torch.save(chkpt, "ppo_negotiation_weights_2.pth")
print("✅ Done")


Ep 0100 | P-loss -1.369  V-loss 5.792  Success_Rate 0.21875 Avg_Reward -19.726 Deal Status False
Ep 0200 | P-loss -0.327  V-loss 3.686  Success_Rate 0.25 Avg_Reward -19.351 Deal Status False
Ep 0300 | P-loss 1.298  V-loss 3.501  Success_Rate 0.3125 Avg_Reward -19.708 Deal Status True
Ep 0400 | P-loss -3.088  V-loss 7.708  Success_Rate 0.21875 Avg_Reward -20.242 Deal Status False
Ep 0500 | P-loss -0.041  V-loss 3.378  Success_Rate 0.28125 Avg_Reward -17.586 Deal Status False
Ep 0600 | P-loss 0.042  V-loss 3.928  Success_Rate 0.15625 Avg_Reward -20.218 Deal Status False
Ep 0700 | P-loss -0.195  V-loss 4.309  Success_Rate 0.21875 Avg_Reward -20.822 Deal Status False
Ep 0800 | P-loss 0.000  V-loss 1.968  Success_Rate 0.34375 Avg_Reward -17.493 Deal Status False
Ep 0900 | P-loss 0.551  V-loss 3.865  Success_Rate 0.125 Avg_Reward -24.729 Deal Status False
Ep 1000 | P-loss -0.039  V-loss 3.781  Success_Rate 0.25 Avg_Reward -19.899 Deal Status True
Ep 1100 | P-loss -0.686  V-loss 4.155  Succes

#### Visualizing Policy Loss & Value Loss for every 10th episode 

In [122]:
p_loss_new = p_loss[::10]
v_loss_new = v_loss[::10]

In [None]:
# Plotting Policy Loss and Value Loss

df_plot = pd.DataFrame({
    'Episodes': range(len(p_loss_new)),
    'Value Loss': v_loss_new,
    'Policy Loss': p_loss_new
})

fig = px.line(
    df_plot,
    x='Episodes',
    y=['Value Loss','Policy Loss'],
    title='Policy Loss & Value Loss',
    labels={'value':'Value','variable':'Series'},
    color_discrete_sequence=['blue','red']
)

fig.show()

In [None]:
# Plotting Success Rate 

df_plot_sucess = pd.DataFrame({
    'Episodes': range(len(success_r[::5])),
    "Success_rate": success_r[::5]
})

fig_success = px.line(
    df_plot_sucess,
    x='Episodes',
    y="Success_rate",
    title='Success Rate Over Training Episodes',
    labels={'value':'Value','variable':'Series'},
    # color_discrete_sequence=['blue','red']
)

fig_success.show()

### Evaluation using the above Trained Model

#### (i) Counteroffer over Multiple Negotiation Rounds

In [None]:
# Load checkpoint
ckpt = torch.load("ppo_negotiation.pth")
policy_test = PPOPolicy(trial_env.observation_space.shape[0])

#  Rebuild policy & load weights
policy_test.load_state_dict(ckpt)
policy_test.eval()

# Recreate environment
env_test = Negotiation_Agent(
    seller_min_amt=700,
    buyer_reservation_price=1200,
    max_rounds=10
)

obs, done = env_test.reset(), False
print(f"\nInitial Selling Price: {env_test.initial_selling_price}\n")

with torch.no_grad():
    while not done:
        actor     = "Buyer" if env_test.turn == 0 else "Seller"
        old_offer = env_test.current_offer

        # normalize obs if your training did that, or pass raw obs if not
        t = torch.tensor(obs, dtype=torch.float32)
        action, _, _, _, _ = policy.get_action(t, env_test)

        if isinstance(action, tuple):
            print(f"{actor} proposes ${action[1]}")
        elif action == 2:
            print(f"{actor} rejects at ${old_offer}")

        obs, reward, done, _ = env_test.step(action)

        if action == 1 and done:
            print(f"{actor} accepts at ${old_offer}")

        print(f" -> New state: Offer={obs[0]:.0f}, Round={int(obs[1])}\n")

print(f"Final reward → buyer: {reward['buyer']:.3f}   seller: {reward['seller']:.3f}")
print("Negotiation ended.\n")


Initial Selling Price: 1261

Buyer proposes $948
 -> New state: Offer=948, Round=1

Seller proposes $1087
 -> New state: Offer=1087, Round=2

Buyer proposes $1009
 -> New state: Offer=1009, Round=3

Seller proposes $1044
 -> New state: Offer=1044, Round=4

Buyer accepts at $1044
 -> New state: Offer=1044, Round=4

Final reward → buyer: -0.188   seller: 0.113
Negotiation ended.



#### (ii) Buyer accepts the offer

In [None]:
#  Load checkpoint
ckpt = torch.load("ppo_negotiation.pth")
policy_test = PPOPolicy(trial_env.observation_space.shape[0])

#  Rebuild policy & load weights
policy_test.load_state_dict(ckpt)
policy_test.eval()

#  Recreate environment
env_test = Negotiation_Agent(
    seller_min_amt=700,
    buyer_reservation_price=1200,
    max_rounds=10
)

obs, done = env_test.reset(), False
print(f"\nInitial Selling Price: {env_test.initial_selling_price}\n")

with torch.no_grad():
    while not done:
        actor     = "Buyer" if env_test.turn == 0 else "Seller"
        old_offer = env_test.current_offer

        # normalize obs if your training did that, or pass raw obs if not
        t = torch.tensor(obs, dtype=torch.float32)
        action, _, _, _, _ = policy.get_action(t, env_test)

        if isinstance(action, tuple):
            print(f"{actor} proposes ${action[1]}")
        elif action == 2:
            print(f"{actor} rejects at ${old_offer}")

        obs, reward, done, _ = env_test.step(action)

        if action == 1 and done:
            print(f"{actor} accepts at ${old_offer}")

        print(f" -> New state: Offer={obs[0]:.0f}, Round={int(obs[1])}\n")

print(f"Final reward → buyer: {reward['buyer']:.3f}   seller: {reward['seller']:.3f}")
print("Negotiation ended.\n")


Initial Selling Price: 1384

Buyer proposes $975
 -> New state: Offer=975, Round=1

Seller proposes $1140
 -> New state: Offer=1140, Round=2

Buyer accepts at $1140
 -> New state: Offer=1140, Round=2

Final reward → buyer: -0.380   seller: 0.143
Negotiation ended.



#### (iii) Buyer rejects the offer

In [None]:
#  Load checkpoint
ckpt = torch.load("ppo_negotiation_weights.pth")
policy_test = PPOPolicy(trial_env.observation_space.shape[0])

#  Rebuild policy & load weights
policy_test.load_state_dict(ckpt['state.dict'])
policy_test.eval()

#  Recreate environment
env_test = Negotiation_Agent(
    seller_min_amt=700,
    buyer_reservation_price=1200,
    max_rounds=10
)

obs, done = env_test.reset(), False
print(f"\nInitial Selling Price: {env_test.initial_selling_price}\n")

with torch.no_grad():
    while not done:
        actor     = "Buyer" if env_test.turn == 0 else "Seller"
        old_offer = env_test.current_offer

        # normalize obs if your training did that, or pass raw obs if not
        t = torch.tensor(obs, dtype=torch.float32)
        action, _, _, _, _ = policy.get_action(t, env_test)

        if isinstance(action, tuple):
            print(f"{actor} proposes ${action[1]}")
        elif action == 2:
            print(f"{actor} rejects at ${old_offer}")

        obs, reward, done, _ = env_test.step(action)

        if action == 1 and done:
            print(f"{actor} accepts at ${old_offer}")

        print(f" -> New state: Offer={obs[0]:.0f}, Round={int(obs[1])}\n")

print(f"Final reward → buyer: {reward['buyer']:.3f}   seller: {reward['seller']:.3f}")
print("Negotiation ended.\n")


Initial Selling Price: 1708

Buyer rejects at $1708
 -> New state: Offer=1708, Round=0

Final reward → buyer: -5.000   seller: -5.000
Negotiation ended.

