In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import grad
from torch.utils.data import DataLoader
from torch.autograd import grad
from torch.optim import Adam

import numpy as np
import torch
from scipy.stats import norm


In [3]:
from IPython.core.display import HTML
HTML("""
<style>
body { font-family: "Helvetica Neue", sans-serif; font-size: 15px; }
h1, h2, h3 { color: #34495e; }
p { line-height: 1.6; }
</style>
""")

In [None]:
class OPNN(nn.Module):

    def __init__(self, input_dim = 3, hidden_dim = 3, num_hidden_layers = 2):

        super(OPNN, self).__init__()

        # input layers
        layers = [
            
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Softmax()
            
        ]

        # hidden layers 
        for _ in range(num_hidden_layers - 1):

            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.Tanh())

        # output layer
        layers.append(nn.Linear(hidden_dim, 1))

        self.model = nn.Sequential(*layers) # unbounded/continuous

    def forward(self, x):

        return self.model(x)

        

note: tanh is smoother than ReLU and works better for PDE tasks (that's why no GELU)
final layer doesn't have activation, allows output range to be unconstrained
requires_grad for auto grad on the PDE residuals

S: Spot Price
K: Strike Price
T: Time to Maturity

Deeper Networks will overfit/struggle with PDE loss

In [5]:
# default 3, 3, 2
model = OPNN()

# sample batch S = 100, K = 100, T = 0.5
# output.shape = [1, 1]
sample_input = torch.tensor([[100.0, 100.0, 0.5]], requires_grad=True) # shape [1, 3]
normalized_input = sample_input / torch.tensor([100.0, 100.0, 1.0]) # normalization

output = model(normalized_input)
print(output)

tensor([[0.0379]], grad_fn=<AddmmBackward0>)


In [6]:
output = model(normalized_input)

# https://docs.pytorch.org/docs/stable/generated/torch.ones_like.html

# calc first derivatives
grad_output = torch.ones_like(output)
dC_dinput = grad(output, sample_input, create_graph=True)[0]
print(dC_dinput)

# calc second derivatives
dC_dS = dC_dinput[:, 0]
d2C_dS2 = grad(dC_dS, sample_input, grad_outputs=torch.ones_like(dC_dS), create_graph=True)[0][:, 0]
print(d2C_dS2)

tensor([[ 0.0004, -0.0008, -0.0911]], grad_fn=<DivBackward0>)
tensor([2.4495e-08], grad_fn=<SelectBackward0>)


In [7]:
# weight test
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 28


In [8]:
# checking weights

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: mean={param.data.mean():.4f}, std={param.data.std():.4f}")

model.0.weight: mean=-0.0771, std=0.3553
model.0.bias: mean=0.2223, std=0.2306
model.2.weight: mean=0.0153, std=0.3365
model.2.bias: mean=0.4348, std=0.1111
model.4.weight: mean=-0.0871, std=0.2986
model.4.bias: mean=0.0611, std=nan


  print(f"{name}: mean={param.data.mean():.4f}, std={param.data.std():.4f}")


In [9]:
# symmetry/invariance check

# call prices should INCREASE with S, and convex



S_values = torch.tensor([[80.0, 100.0, 0.5],
                         [90.0, 100.0, 0.5],
                         [100.0, 100.0, 0.5],
                         [110.0, 100.0, 0.5]], requires_grad=True)

S_values_norm = S_values / torch.tensor([100.0, 100.0, 1.0])
prices = model(S_values_norm)
print("Monotonicity test:", prices.squeeze())


Monotonicity test: tensor([0.0299, 0.0339, 0.0379, 0.0419], grad_fn=<SqueezeBackward0>)


The output DECREASES as S increases which is wrong

In [10]:
def black_scholes_call_price(S, K, T, r, sigma):
    """Vectorized Black-Scholes formula for call options"""
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)

def generate_black_scholes_dataset(n_samples=10000, r=0.05, sigma=0.2, seed=42):
    np.random.seed(seed)

    # Sample inputs: S, K, T
    S = np.random.uniform(50, 150, size=n_samples)
    K = np.random.uniform(50, 150, size=n_samples)
    T = np.random.uniform(0.01, 1.0, size=n_samples)

    # Compute prices using BS formula
    C = black_scholes_call_price(S, K, T, r, sigma)

    # Stack inputs and outputs
    X = np.stack([S, K, T], axis=1)
    y = C.reshape(-1, 1)

    # Convert to torch tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    return X_tensor, y_tensor


In [11]:
X_train, y_train = generate_black_scholes_dataset(n_samples=10000)

print("Sample input (S, K, T):", X_train[:3])
print("Sample target prices (C):", y_train[:3])

Sample input (S, K, T): tensor([[ 87.4540,  87.3641,   0.7327],
        [145.0714,  83.2912,   0.1927],
        [123.1994,  67.6154,   0.3532]])
Sample target prices (C): tensor([[ 7.6124],
        [62.5787],
        [56.7675]])


Looks like the simulator did well, within market range \

Sample input (S, K, T): \
tensor([[ 87.4540,  87.3641,  0.7327],  # near-the-money, moderate T \
        [145.0714,  83.2912,  0.1927],  # deep in-the-money \
        [123.1994,  67.6154,  0.3532]]) # deep in-the-money, mid-maturity \

Sample target prices (C): \
tensor([[ 7.6124],   # makes sense for near-the-money \
        [62.5787],  # very high because S >> K \ 
        [56.7675]]) # similar case: S >> K \

## implementing loss

In [None]:
# squared pde residual (physics informed)

def bs_pde_loss(model, inputs, r=0.05, sigma=0.2):
    """
    Computes the Black-Scholes PDE residual loss.

    inputs: tensor of shape [B, 3] = [S, K, T]
    """
    inputs = inputs.clone().detach().requires_grad_(True)
    S = inputs[:, 0:1]
    K = inputs[:, 1:2]
    T = inputs[:, 2:3]

    
    norm_inputs = inputs / torch.tensor([100.0, 100.0, 1.0])

    C = model(norm_inputs)

    # calculating the first derivative loss

    dC = grad(C, inputs, grad_outputs=torch.ones_like(C), create_graph=True)[0]
    dC_dS = dC[:, 0:1]
    dC_dT = dC[:, 2:3]


    # second derivative loss
    d2C_dS2 = grad(dC_dS, inputs, grad_outputs=torch.ones_like(dC_dS), create_graph=True)[0][:, 0:1]

    # linear transform of the 
    pde_residual = dC_dT + 0.5 * sigma**2 * S**2 * d2C_dS2 + r * S * dC_dS - r * C

    # mse of the residual
    loss = torch.mean(pde_residual**2)
    return loss


In [14]:
# sample input batch, untrained models should give non-zero pde residuals

inputs, _ = generate_black_scholes_dataset(n_samples=32)
pde_loss = bs_pde_loss(model, inputs)
print("pde loss:", pde_loss.item())


pde loss: 0.008035659790039062


In [15]:
# supervised model loss, standard

# MSE between predicted and BS price

def supervised_loss(model, inputs, targets):

    norm_inputs = inputs / torch.tensor([100.0, 100.0, 1.0])

    preds = model(norm_inputs)

    return torch.nn.functional.mse_loss(preds, targets)



In [16]:
def total_loss(model, inputs, targets, lambda_sup=1.0, lambda_pde=1.0, r=0.05, sigma=0.2):
    sup_loss = supervised_loss(model, inputs, targets)
    pde_loss = bs_pde_loss(model, inputs, r=r, sigma=sigma)
    return lambda_sup * sup_loss + lambda_pde * pde_loss

In [None]:
optimizer = Adam(model.parameters(), lr=0.001)
num_epochs = 500 # for now, should go to 

# in theory, pde constraint often slows convergence

for epoch in range(num_epochs):

    optimizer.zero_grad()
    loss = total_loss(model, X_train_batch, y_train_batch,
                      lambda_sup=1.0, lambda_pde=1.0)
    loss.backward()
    optimizer.step()


NameError: name 'num_epochs' is not defined