https://pytorch.org/get-started/locally/

In [None]:
!pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

$\frac{\partial L}{\partial w}$ &nbsp;&nbsp;&nbsp;:&nbsp;&nbsp;&nbsp; loss.backward()

$\frac{\partial L}{\partial w}$ &nbsp;&nbsp;&nbsp;:&nbsp;&nbsp;&nbsp;&nbsp; W.grad

W = W - (learning_rate * W.grad) &nbsp;&nbsp;&nbsp; :&nbsp;&nbsp;&nbsp; optimizer.step()

Now the more idiomatic PyTorch way

In [None]:
class OneParameterModal(nn.Module):
    """Minimal module with one learnable weight: computes ``my_weight @ x``."""

    def __init__(self):
        super().__init__()
        # Wrapping the tensor in nn.Parameter registers it with the module,
        # so it is returned by parameters() / named_parameters().
        initial_value = torch.tensor([0.2])
        self.my_weight = nn.Parameter(initial_value)

    def forward(self, x):
        """Return the matrix product of the weight with ``x``."""
        # Same operation as `self.my_weight @ x`, spelled as a function call.
        return torch.matmul(self.my_weight, x)

# Build the single-parameter model
model = OneParameterModal()

# List every parameter the module has registered, with its shape and grad flag
for name, param in model.named_parameters():
    line = f"Parameter name: {name}, Shape: {param.shape}, Requires grad: {param.requires_grad}"
    print(line)

Parameter name: my_weight, Shape: torch.Size([1]), Requires grad: True


In [None]:
# --- 1. Setup ---
# Optimizer: plain stochastic gradient descent with a learning rate of 0.1.
# It only updates the tensors handed to it, i.e. those in model.parameters().
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

# Loss function: mean squared error, a common choice for regression.
criterion = nn.MSELoss()

In [None]:
# Number of training iterations to run
num_of_epochs = 5

# One training pair: feed in x = 1.0 and ask the model to output 1.0
x = torch.tensor([1.0])
target_y = torch.tensor([1.0])

In [None]:
# --- 2. The Training Loop (e.g., 5 epochs) ---
# Each iteration: clear old gradients, run the forward pass, measure the loss,
# back-propagate, and let the optimizer apply the update.
print("Starting Training...")
print(f"Initial Weight: {model.my_weight.item():.4f}")
print("-" * 30)

for epoch in range(num_of_epochs):
    # Zero the gradients from the previous step (equivalent to W.grad.zero_());
    # otherwise backward() would accumulate into the old values.
    optimizer.zero_grad()

    # 1. Forward Pass: get the prediction (y_pred)
    y_pred = model(x)

    # 2. Calculate Loss (MSE).
    #    The model's `my_weight @ x` on two 1-D tensors is a dot product, so
    #    y_pred is 0-dimensional while target_y has shape [1]. Give both the
    #    same shape so MSELoss does not have to broadcast (and warn about it).
    loss = criterion(y_pred.reshape(target_y.shape), target_y)

    # 3. Backward Pass: compute d(loss)/d(weight) and store it in .grad
    loss.backward()

    # 4. Optimization Step: the optimizer applies
    #    W = W - learning_rate * W.grad for us (no manual update needed)
    optimizer.step()

    # Print progress (the loss was measured before this epoch's update)
    print(f"Epoch {epoch+1}: Loss = {loss.item():.6f}, New Weight = {model.my_weight.item():.4f}")

print("-" * 30)
print(f"Final Predicted y: {model(x).item():.4f}")
print(f"Final Target y:    {target_y.item():.4f}")

Starting Training...
Initial Weight: 0.2000
------------------------------
Epoch 1: Loss = 0.640000, New Weight = 0.3600
Epoch 2: Loss = 0.409600, New Weight = 0.4880
Epoch 3: Loss = 0.262144, New Weight = 0.5904
Epoch 4: Loss = 0.167772, New Weight = 0.6723
Epoch 5: Loss = 0.107374, New Weight = 0.7379
------------------------------
Final Predicted y: 0.7379
Final Target y:    1.0000


  return F.mse_loss(input, target, reduction=self.reduction)


Using Bias

In [None]:
# With Bias
# Manual gradient descent on y = W @ x + b for a single training pair.
# --- 1. Define Data and Hyperparameters ---
x_data = torch.tensor([[1.0]])
y_true = torch.tensor([[1.0]])
# --- Model Parameters (W and b) ---
# requires_grad=True asks autograd to record operations on these tensors
W = torch.tensor([[0.2]], requires_grad=True)
b = torch.tensor([[0.2]], requires_grad=True)
# --- Hyperparameters ---
learning_rate = 0.1
num_epochs = 200
loss_threshold = 0.0001  # stop early once the loss drops below this value


for epoch in range(num_epochs):

    ### STEP 1: Forward Pass (Calculate Prediction) ###
    y_pred = (W @ x_data) + b

    ### STEP 2: Calculate Loss (squared error) ###
    loss = (y_true - y_pred).pow(2)

    ### STEP 3: Backward Pass (Calculate Gradients) ###
    loss.backward()

    ### STEP 4: Update Parameters (The Manual Gradient Descent Step) ###
    # no_grad() keeps the in-place updates out of the autograd graph
    with torch.no_grad():
        W -= (learning_rate * W.grad)
        b -= (learning_rate * b.grad)

    ### STEP 5: Zero the gradients so the next backward() starts fresh ###
    W.grad.zero_()
    b.grad.zero_()

    # --- Print progress ---
    # The bias column reports b (it previously printed W a second time).
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.3f}, Weight: {W.item():.4f}, bias: {b.item():.4f}, y: {y_pred.item():.4f}")

    # --- Early stop ---
    # The message is built from loss_threshold so it always matches the test.
    if loss.item() < loss_threshold:
        print(f"\nBreak condition met: Loss ({loss.item():.6f}) < {loss_threshold}")
        break

print("---------------------------------------")
print(f"Final optimized weight (W): {W.item():.4f}, bias (b): {b.item():.4f}")

Epoch [1/200], Loss: 0.360, Weight: 0.3200, bias: 0.3200, y: tensor([[0.4000]], grad_fn=<AddBackward0>)
Epoch [2/200], Loss: 0.130, Weight: 0.3920, bias: 0.3920, y: tensor([[0.6400]], grad_fn=<AddBackward0>)
Epoch [3/200], Loss: 0.047, Weight: 0.4352, bias: 0.4352, y: tensor([[0.7840]], grad_fn=<AddBackward0>)
Epoch [4/200], Loss: 0.017, Weight: 0.4611, bias: 0.4611, y: tensor([[0.8704]], grad_fn=<AddBackward0>)
Epoch [5/200], Loss: 0.006, Weight: 0.4767, bias: 0.4767, y: tensor([[0.9222]], grad_fn=<AddBackward0>)
Epoch [6/200], Loss: 0.002, Weight: 0.4860, bias: 0.4860, y: tensor([[0.9533]], grad_fn=<AddBackward0>)
Epoch [7/200], Loss: 0.001, Weight: 0.4916, bias: 0.4916, y: tensor([[0.9720]], grad_fn=<AddBackward0>)
Epoch [8/200], Loss: 0.000, Weight: 0.4950, bias: 0.4950, y: tensor([[0.9832]], grad_fn=<AddBackward0>)
Epoch [9/200], Loss: 0.000, Weight: 0.4970, bias: 0.4970, y: tensor([[0.9899]], grad_fn=<AddBackward0>)
Epoch [10/200], Loss: 0.000, Weight: 0.4982, bias: 0.4982, y: te

Now the PyTorch way

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# --- 1. Define Data and Hyperparameters ---
# Hyperparameters
learning_rate = 0.1
num_epochs = 5

# A single training pair, stored as 1x1 matrices
x_data = torch.tensor([[1.0]])
y_true = torch.tensor([[1.0]])

# ------------------------------------------------------------------
# --- 2. Define the Model Class (Replaces W and b tensors) ---
# ------------------------------------------------------------------
class SimpleLinearModel(nn.Module):
    """One-input, one-output linear model (y = weight @ x + bias).

    Replaces the hand-made W and b tensors: nn.Linear creates both as
    parameters with requires_grad=True. They are then set to 0.2 so the
    starting point matches the manual example.
    """

    def __init__(self):
        super().__init__()
        # nn.Linear(in_features, out_features) creates the weight and bias
        self.linear = nn.Linear(in_features=1, out_features=1)

        # Set the starting values in place. nn.init.constant_ fills the
        # existing parameter tensors instead of swapping out `.data`, which
        # would bypass autograd's bookkeeping.
        nn.init.constant_(self.linear.weight, 0.2)
        nn.init.constant_(self.linear.bias, 0.2)

        # Inspect the layer's parameters (automatically created)
        print(f"\nWeight shape: {self.linear.weight.shape}")
        print(f"Bias shape: {self.linear.bias.shape}")

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        # y = weight @ x + bias
        return self.linear(x)

# Instantiate the model
model = SimpleLinearModel()

# ------------------------------------------------------------------
# --- 3. Define Loss Function and Optimizer ---
# ------------------------------------------------------------------
# Standard Squared Loss (MSE)
criterion = nn.MSELoss()

# Standard Stochastic Gradient Descent (SGD) optimizer
# It takes the model's parameters and the learning rate
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# ------------------------------------------------------------------
# --- 4. Training Loop (Simplified) ---
# ------------------------------------------------------------------
for epoch in range(num_epochs):

    ### STEP 1: Reset Gradients (Equivalent to W.grad.zero_()) ###
    optimizer.zero_grad()

    ### STEP 2: Forward Pass (Calculate Prediction) ###
    y_pred = model(x_data)

    ### STEP 3: Calculate Loss (Now using MSE) ###
    loss = criterion(y_pred, y_true)

    ### STEP 4: Backward Pass (Calculate Gradients) ###
    loss.backward()

    ### STEP 5: Update Parameters ###
    # The optimizer updates the model's weight and bias from their gradients
    # and the learning rate; no manual update is needed.
    optimizer.step()

    # Report this model's own parameters. The weight and bias live on
    # model.linear, and .item() turns each one-element tensor into a Python
    # float so the ':.4f' format spec works (a Tensor cannot be formatted
    # that way directly).
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.6f}, W: {model.linear.weight.item():.4f}, b: {model.linear.bias.item():.4f}, y_pred: {y_pred.item():.4f}")


print("---------------------------------------")
print(f"Final optimized weight (W): {model.linear.weight.item():.4f}, bias (b): {model.linear.bias.item():.4f}")
# Run the trained model once more so the reported prediction uses the final weights
print(f"Final Prediction (W@x + b): {model(x_data).item():.4f}")


Weight shape: torch.Size([1, 1])
Bias shape: torch.Size([1])


TypeError: unsupported format string passed to Tensor.__format__

nn.Linear

In [None]:
# Example: a layer that takes 4 input features and produces 2 output features
linear_layer = nn.Linear(4, 2)

# The layer creates its weight and bias on construction; show their shapes
weight_shape = linear_layer.weight.shape
bias_shape = linear_layer.bias.shape
print(f"\nWeight shape: {weight_shape}")
print(f"Bias shape: {bias_shape}")

Simple Model with Activation

In [None]:
import torch
import torch.nn as nn

# 1. Define the Model Class
class SimpleReluModel(nn.Module):
    """A linear layer (2 inputs -> 1 output) followed by a ReLU activation."""

    def __init__(self):
        super().__init__()
        # Linear transformation: 2 input features, 1 output feature
        self.linear = nn.Linear(in_features=2, out_features=1)
        # Activation applied to the linear layer's output
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return relu(linear(x)), i.e. max(0, W*x + b)."""
        return self.relu(self.linear(x))

# 2. Instantiate the Model and show its layers
model = SimpleReluModel()
print("Model Architecture:", model, sep="\n")

# 3. Create Example Input Data
# One random sample with two features: shape (batch_size, num_features) = (1, 2)
input_data = torch.randn(1, 2)
print("\nExample Input Data:", input_data, sep="\n")

# 4. Pass the Input through the Model
output = model(input_data)

# 5. Print the Output
print("\nModel Output (Shape: [1, 1]):", output, sep="\n")

Does the network start with the hidden layer or the input layer?

Universal Approximation Theorem

https://alexlenail.me/NN-SVG/

In [None]:
# X = torch.linspace(0, 1, 100).unsqueeze(1)
# X = torch.linspace(0, 1, 100)
# X

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Define the Function to Approximate (f(x) = sin(2*pi*x)) ---
def target_function(x):
    """Return sin(2*pi*x), evaluated element-wise on the tensor ``x``."""
    phase = 2 * torch.pi * x
    return phase.sin()

# --- 2. Create Training Data ---
# Number of evenly spaced sample points taken from the domain [0, 1]
N_SAMPLES = 100
# Inputs as a column: shape (100, 1), i.e. one feature per sample
X = torch.linspace(0, 1, steps=N_SAMPLES).reshape(-1, 1)
# Targets: the function evaluated at every input, shape (100, 1)
Y = target_function(X)

# --- 3. Define the Universal Approximator Network ---
class UniversalApproximator(nn.Module):
    """Network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features.
        hidden_size: number of units in the hidden layer.
        output_size: number of output values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # A single hidden layer is sufficient for the UAT
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return output(relu(hidden(x)))."""
        # ReLU is the non-polynomial activation on the hidden layer
        activated = self.hidden(x).relu()
        # The output layer stays linear because this is a regression
        return self.output(activated)

# --- 4. Instantiate Model, Loss, and Optimizer ---
input_dim = 1      # one input feature (x)
hidden_dim = 50    # width of the hidden layer; an arbitrary choice
output_dim = 1     # one output value (f(x))

model = UniversalApproximator(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

# --- 5. Training Loop ---
NUM_EPOCHS = 2000
for epoch in range(NUM_EPOCHS):
    # Start each step from clean gradients
    optimizer.zero_grad()

    # Forward pass over the whole data set, then measure the error
    Y_pred = model(X)
    loss = criterion(Y_pred, Y)

    # Back-propagate and apply the update
    loss.backward()
    optimizer.step()

    # Report the loss every 500 epochs
    if not (epoch + 1) % 500:
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.6f}')

In [None]:
# --- 6. Plotting the Approximation ---
# Evaluate the trained model without tracking gradients, then convert
# everything to flat NumPy arrays for matplotlib.
with torch.no_grad():
    Y_approx = model(X).squeeze().numpy()

X_np = X.squeeze().numpy()
Y_np = Y.squeeze().numpy()

plt.figure(figsize=(10, 6))
# The legend label uses matplotlib mathtext so it stays pure ASCII in the
# source; the literal "π" had been corrupted into mojibake.
plt.plot(X_np, Y_np, label=r'Target Function ($\sin(2\pi x)$)', color='blue', linewidth=2)
plt.plot(X_np, Y_approx, label='NN Approximation', color='red', linestyle='--', linewidth=2)
plt.title('Universal Approximation Theorem Demonstration (1D)')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True)
plt.show()

nn.Sequential

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Data: 400 evenly spaced inputs on [-3.14, 3.14] as a column, with sin(x) as the target
x = torch.linspace(-3.14, 3.14, steps=400).reshape(-1, 1)
y = x.sin()

# Model: 1 input -> 64 hidden units -> ReLU -> 1 output, chained by nn.Sequential
model = nn.Sequential(
    nn.Linear(in_features=1, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=1),
)

# Train: 3000 full-batch Adam steps minimizing the mean squared error
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

for epoch in range(3000):
    # Forward pass and loss on the whole data set
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Clear old gradients, back-propagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Plot results
# Re-evaluate the trained model for plotting. The y_pred left over from the
# training loop was computed before the final optimizer.step(), so it lags
# the final weights by one update. no_grad() also means no detach is needed.
with torch.no_grad():
    y_fit = model(x)

plt.plot(x.detach(), y, label='True sin(x)')
plt.plot(x.detach(), y_fit, label='Neural net approximation')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.title('Universal Approximation with ReLU (PyTorch)')
plt.show()
