We first import all the necessary packages and dataset

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


Let's show a quick example about how pytorch handle the gradient calculation, assuming the model and loss function together is $y=x^2+1$

In [2]:
x = torch.tensor([3.0], requires_grad=True)

# Define a function y = x^2
y = x ** 2 + 1

# Compute the gradient
y.backward()
print(x.grad)

tensor([6.])


We can further show an example with loss function.  Assuming we want to learn $y = 2x+1$. Therefore, $2$ and $1$ is our target learnable
paraemter. Right now let's assume the model is $y=wx+b$

In [3]:
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=False)
y = torch.tensor([3.0, 5.0, 7.0], requires_grad=False)  # We give three observations. i.e., when x = 1, y = 3; x =2, y = 5

# Initialize weights with requires_grad=True so we can compute gradients
w = torch.tensor([1.0], requires_grad=True)  # initial guess for weight
b = torch.tensor([0.0], requires_grad=True)  # initial guess for bias
print(f'Before optimization, w is {w}')
print(f'Before optimization, b is {b}')


optimizer = torch.optim.Adam([w,b], lr = 0.1)
y_pred = w*x+b
optimizer.zero_grad() # we reset the optimizer here
loss = ((y_pred - y) ** 2).mean() # calculate the loss value 
loss.backward() # as shown before, this step calculates the gradient of both w and b
optimizer.step()   # it performs the change of w and b

# Print gradients
print("Gradient of w:", w.grad)  # ∂loss/∂w
print("Gradient of b:", b.grad)  # ∂loss/∂b

print(f'After optimization, w is {w}')
print(f'After optimization, b is {b}')


Before optimization, w is tensor([1.], requires_grad=True)
Before optimization, b is tensor([0.], requires_grad=True)
Gradient of w: tensor([-13.3333])
Gradient of b: tensor([-6.])
After optimization, w is tensor([1.1000], requires_grad=True)
After optimization, b is tensor([0.1000], requires_grad=True)


If we wrap the optimizer into a loop, it will eventually find the best w and b.

In [6]:
for i in range(20):
    optimizer = torch.optim.Adam([w,b], lr = 0.1)
    y_pred = w*x+b
    optimizer.zero_grad() # we reset the optimizer here
    loss = ((y_pred - y) ** 2).mean() # calculate the loss value 
    print(f"##### this is the {i} times try")
    print(f"current loss is {loss}")
    loss.backward() # as shown before, this step calculates the gradient of both w and b
    optimizer.step()   # it performs the change of w and b
    print(f'After optimization, w is {w}')
    print(f'After optimization, b is {b}')


##### this is the 0 times try
current loss is 7.829999923706055
After optimization, w is tensor([1.2000], requires_grad=True)
After optimization, b is tensor([0.2000], requires_grad=True)
##### this is the 1 times try
current loss is 6.186666011810303
After optimization, w is tensor([1.3000], requires_grad=True)
After optimization, b is tensor([0.3000], requires_grad=True)
##### this is the 2 times try
current loss is 4.736665725708008
After optimization, w is tensor([1.4000], requires_grad=True)
After optimization, b is tensor([0.4000], requires_grad=True)
##### this is the 3 times try
current loss is 3.479998826980591
After optimization, w is tensor([1.5000], requires_grad=True)
After optimization, b is tensor([0.5000], requires_grad=True)
##### this is the 4 times try
current loss is 2.41666579246521
After optimization, w is tensor([1.6000], requires_grad=True)
After optimization, b is tensor([0.6000], requires_grad=True)
##### this is the 5 times try
current loss is 1.5466665029525

Let's see an example of neural networks. 

We first process some dataset. The utilized dataset is iris dataset, a famous dataset for machine learning. https://en.wikipedia.org/wiki/Iris_flower_data_set

In [6]:
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
print(X[:5, :])
print(y[:5])
print(type(X))

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]
<class 'numpy.ndarray'>


Normalization is a common process in machine learning, which normalize all the input data into [0-1] or [-1, 1]. 

It has several main benefits:
1. Faster Convergence during Training
2. Enhanced Stability of the Optimization Process
3. Improved Model Accuracy

package sklearn provides a useful function for different types of normalization. https://scikit-learn.org/stable/modules/preprocessing.html

In [22]:

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


All the variables before are stored using Numpy package. As just discussed, they are not designed for Neural Network computing

In [23]:

# Convert to PyTorch tensors
X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train)
X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)


We can test different data slicing here. 

In [24]:
print(X_train[:, 0])
print(X_train[0, :])
print(X_train[0,0])

tensor([-0.4160,  0.5533,  0.6745,  0.9168,  1.6438, -0.1737,  2.1285, -0.2948,
        -0.9007,  2.2497, -0.0525, -0.7795, -1.0218, -0.9007, -1.0218,  0.5533,
        -1.2642, -1.0218, -0.9007, -0.2948, -0.9007, -0.1737,  2.2497, -1.5065,
         0.4322, -0.1737, -0.4160,  0.1898, -0.0525,  0.1898, -0.5372,  0.4322,
        -0.4160, -0.5372, -1.0218,  0.6745, -1.0218, -1.0218, -0.4160,  1.0380,
        -1.1430, -0.0525, -1.0218, -1.0218,  0.0687, -0.9007,  1.2803,  0.1898,
         0.3110,  2.2497, -0.4160, -1.7489, -1.8700,  0.1898,  1.6438, -1.5065,
        -0.9007, -1.7489,  0.5533,  0.5533, -1.5065,  1.1592,  0.5533, -1.3854,
         0.3110,  0.7957,  0.4322,  1.4015,  0.6745, -0.9007,  1.2803,  0.0687,
         0.7957, -0.1737, -0.7795,  0.3110, -1.6277,  0.9168, -0.4160, -0.6583,
        -0.2948,  1.7650,  1.0380, -0.9007, -1.1430,  1.0380,  1.6438, -1.1430,
         1.0380, -1.1430,  1.2803,  1.8862,  0.5533, -0.1737,  0.7957,  0.5533,
         0.6745, -0.2948,  0.0687, -0.53

The variable can also be put into either GPU or CPU, by the following commands. Remember, a variable in CPU cannot compute
with a variable in GPU!

In [None]:
device = torch.device('cpu') # if you want to put it to gpu, use 'cuda'
x_GPU = X_train.to(device)

 Define the Model 

In [None]:
class IrisNet(nn.Module):
    def __init__(self):
        super(IrisNet, self).__init__()
        self.fc1 = nn.Linear(4, 10) # recal the w and b in our simple example, the nn.Linear itself contains all the learnable parameters of the model.
        self.fc2 = nn.Linear(10, 3)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = IrisNet()

 Define Loss and Optimizer

In [None]:
criterion = nn.CrossEntropyLoss() #because it is a classification problem, we cannot use mean square error as the loss function. instead, we use the cross entropy. Be assure it is just a type of equation like MSE.
optimizer = optim.Adam(model.parameters(), lr=0.01) # we have seen this optimizer before.

Training Loop

In [None]:
epochs = 100
loss_values = []

for epoch in range(epochs):
    optimizer.zero_grad() #remember this training loop, it is the most standard way to train the model, i.e. adjusting the paraemters until loss value is minimal. 
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    loss_values.append(loss.item())
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
        
# we actually can let the training process stop if the loss value is lower than a threshold.

Epoch [10/100], Loss: 0.8783
Epoch [20/100], Loss: 0.6024
Epoch [30/100], Loss: 0.4291
Epoch [40/100], Loss: 0.3350
Epoch [50/100], Loss: 0.2822
Epoch [60/100], Loss: 0.2415
Epoch [70/100], Loss: 0.2064
Epoch [80/100], Loss: 0.1759
Epoch [90/100], Loss: 0.1509
Epoch [100/100], Loss: 0.1309


Test Accuracy

In [11]:
with torch.no_grad():
    test_outputs = model(X_test)
    predicted = torch.argmax(test_outputs, dim=1)
    accuracy = (predicted == y_test).float().mean()
    print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 1.00
