### GPU

In [1]:
import os
# Select which GPU this notebook may use.
#   CUDA_DEVICE_ORDER="PCI_BUS_ID"  -> CUDA devices are ordered by PCI bus ID.
#   CUDA_VISIBLE_DEVICES="1"        -> only the device at index 1 is visible
#                                      (indices start at 0, so the second GPU).
# Both variables are set here, before `torch` is imported in the following cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

### IMPORTS

In [2]:
import torch
#提供常用的数据集，模型和图像转换功能
import torchvision
#Python Imaging Library，用于图像处理和操作                              
import PIL
#用于在循环中显示进度条的库                                    
from tqdm import tqdm
#用于计算均方误差的函数                           
from sklearn.metrics import mean_squared_error
# 用于在Jupyter Notebook中显示内容
from IPython.display import display
#用于绘制图表和可视化数据的库
import matplotlib
import matplotlib.pyplot as plt
#用于进行数值计算和数组操作的库
import numpy as np

#包含用于加载和处理自动驾驶数据集的代码
from autopilot_dataset import AutopilotDataset
# 包含自动驾驶模型的定义和训练代码
from autopilot_model import AutopilotModel
# 包含用于预处理图像和其他实用函数的代码。
from autopilot_utils import preprocess_image

<br>

### HYPERPARAMETERS

In [3]:
# --- Training schedule -------------------------------------------------------
# Samples per batch for the training and validation loaders.
BATCH_SIZE = 128
# Upper bound on the number of training epochs.
MAX_EPOCHS = 50
# Epochs without a validation-loss improvement before training stops early.
EARLY_STOPPING_PATIENCE = 10

# --- Learning rate -----------------------------------------------------------
# Learning rate the optimizer starts with.
INITIAL_LR = 0.0005
# Epochs without improvement before the scheduler lowers the learning rate.
LR_REDUCER_PATIENCE = 2
# Multiplier applied to the learning rate each time it is lowered.
LR_REDUCER_FACTOR = 0.9

# --- Evaluation --------------------------------------------------------------
# A test sample "passes" when its loss is below this threshold.
ACCEPTABLE_TESTING_LOSS = 0.1

# --- Input -------------------------------------------------------------------
# Edge length of the input frames (frames are assumed to be square).
FRAME_SIZE = 224

# --- Locations ---------------------------------------------------------------
# Directory for saved model files.
# TODO: decide whether these machine-specific paths need to be changed.
MODELS_DIR = "/home/greg/models/jetson/"
# Root directory of the datasets.
DATASETS_DIR = "/home/greg/datasets/jetson/"

# Version identifier used in the model file name.
VERSION = "2_16"

# File the model is saved to / loaded from.
MODEL_PATH = f"{MODELS_DIR}{VERSION}_resnet18.pth"
# Training, validation and testing splits live in sub-directories of DATASETS_DIR.
TRAINING_DATASET = f"{DATASETS_DIR}training/"
VALIDATION_DATASET = f"{DATASETS_DIR}validation/"
TESTING_DATASET = f"{DATASETS_DIR}testing/"

<br>

### DATA

In [None]:
# Training dataset: frames of size FRAME_SIZE with every augmentation flag
# enabled (horizontal flip, noise, blur, color jitter).
# `keep_images_in_ram=True` presumably caches the decoded images in memory --
# see autopilot_dataset to confirm.
training_dataset = AutopilotDataset(TRAINING_DATASET,
                                    FRAME_SIZE,
                                    random_horizontal_flip=True,
                                    random_noise=True,
                                    random_blur=True,
                                    random_color_jitter=True,
                                    keep_images_in_ram=True)
# Training batches are reshuffled every epoch.
training_loader = torch.utils.data.DataLoader(training_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)

# Validation dataset: same frame size, all augmentations disabled.
validation_dataset = AutopilotDataset(VALIDATION_DATASET,
                                      FRAME_SIZE,
                                      random_horizontal_flip=False,
                                      random_noise=False,
                                      random_blur=False,
                                      random_color_jitter=False,
                                      keep_images_in_ram=True)

# Validation batches keep a fixed order. run_epoch averages the per-batch
# losses, so with shuffling the samples that land in the final (smaller) batch
# would change every epoch and add noise to the loss that drives the LR
# scheduler, checkpointing and early stopping.
validation_loader = torch.utils.data.DataLoader(validation_dataset,
                                                batch_size=BATCH_SIZE,
                                                shuffle=False)

# Testing dataset: same frame size, all augmentations disabled.
testing_dataset = AutopilotDataset(TESTING_DATASET,
                                   FRAME_SIZE,
                                   random_horizontal_flip=False,
                                   random_noise=False,
                                   random_blur=False,
                                   random_color_jitter=False,
                                   keep_images_in_ram=True)
# Test samples are evaluated one at a time, in dataset order.
testing_loader = torch.utils.data.DataLoader(testing_dataset,
                                             batch_size=1,
                                             shuffle=False)

<br>

### MODEL

In [5]:
# Network to train. `pretrained=True` is forwarded to AutopilotModel
# (presumably it requests pre-trained weights -- see autopilot_model).
# NOTE(review): the training code moves batches to the GPU with `.cuda()`;
# confirm that AutopilotModel places its own parameters on the GPU.
model = AutopilotModel(pretrained=True)

# Adam over all model parameters, starting at INITIAL_LR.
optimizer = torch.optim.Adam(model.parameters(), lr=INITIAL_LR)

# Plateau scheduler in "min" mode: the training loop feeds it the validation
# loss, and after LR_REDUCER_PATIENCE epochs without improvement it multiplies
# the learning rate by LR_REDUCER_FACTOR.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=LR_REDUCER_FACTOR,
    patience=LR_REDUCER_PATIENCE,
    verbose=True,
)

# Mean squared error (MSE) between the model outputs and the annotations.
loss_function = torch.nn.MSELoss()

<br>

### TRAINING

In [None]:
# Per-epoch loss history: one average training loss and one average validation
# loss is appended for every completed epoch.
training_losses, validation_losses = [], []

# Number of consecutive epochs in which the validation loss did not improve.
epochs_without_improvement = 0

def plot_losses():
    """Draw the training and validation loss histories on a single chart.

    Reads the module-level ``training_losses`` and ``validation_losses`` lists;
    each value is plotted against its index in the list (the epoch number).
    """
    figure, axes = plt.subplots()

    # One curve per history, labelled for the legend.
    for curve_label, history in (("training_loss", training_losses),
                                 ("validation_loss", validation_losses)):
        axes.plot(list(range(len(history))), history, label=curve_label)

    axes.set(xlabel='epochs', ylabel='loss', title='Training Progress')
    axes.grid()
    plt.legend()
    plt.show()

def run_epoch(tepoch, name, training):
    """Run one full pass over ``tepoch`` and return the mean per-batch loss.

    Uses the module-level ``model``, ``optimizer`` and ``loss_function``, and
    reads the training loop's ``epoch`` counter for the progress-bar label.

    Args:
        tepoch: Iterable yielding ``(_, images, annotations)`` batches that
            also provides ``set_description`` (a tqdm-wrapped DataLoader in
            this notebook).
        name: Label shown in the progress bar, e.g. "Training".
        training: When true, the model is put in training mode and its
            parameters are updated after every batch; otherwise the model is
            put in evaluation mode and gradients are disabled.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``tepoch`` yields no batches.
    """
    # The mode and the progress-bar label do not change within a pass, so set
    # them once instead of on every batch.
    if training:
        model.train()
    else:
        model.eval()
    tepoch.set_description(f"{name} Epoch {epoch}")

    epoch_loss = 0.0
    iterations = 0

    for _, images, annotations in tepoch:
        # Move the batch to the GPU.
        images = images.cuda()
        annotations = annotations.cuda()

        if training:
            # Clear gradients left over from the previous batch.
            optimizer.zero_grad()

        # Gradients are only tracked while training.
        with torch.set_grad_enabled(bool(training)):
            outputs = model(images)
            loss = loss_function(outputs, annotations)

        if training:
            # Backpropagate and update the model parameters.
            loss.backward()
            optimizer.step()

        epoch_loss += loss.item()
        iterations += 1

    if iterations == 0:
        raise ValueError(f"{name} epoch received no batches; cannot compute an average loss")

    return float(epoch_loss / iterations)

# Training loop: one training pass and one validation pass per epoch, with
# best-model checkpointing, LR scheduling and early stopping.
for epoch in range(MAX_EPOCHS):
    # Training phase: run one pass with a progress bar and record its average loss.
    with tqdm(training_loader, unit="batch") as training_epoch:
        avg_training_loss = run_epoch(training_epoch, "Training", training=True)
    training_losses.append(avg_training_loss)

    # Validation phase: run one pass with a progress bar and record its average loss.
    with tqdm(validation_loader, unit="batch") as validation_epoch:
        avg_validation_loss = run_epoch(validation_epoch, "Validation", training=False)
    # Let the scheduler lower the learning rate if the validation loss has plateaued.
    scheduler.step(avg_validation_loss)
    validation_losses.append(avg_validation_loss)

    # The current loss is already in the history, so "<= min" means it is the
    # best (or ties the best) validation loss seen so far.
    improved = avg_validation_loss <= np.min(validation_losses)
    if improved:
        epochs_without_improvement = 0
        print("validation loss decreased to " + str(avg_validation_loss) + ", saving model")
        model.save_to_path(MODEL_PATH)
    else:
        epochs_without_improvement += 1

    # Plot before the early-stopping check so the final epoch is also charted.
    plot_losses()

    # Stop once the validation loss has failed to improve for
    # EARLY_STOPPING_PATIENCE consecutive epochs.
    if not improved and epochs_without_improvement >= EARLY_STOPPING_PATIENCE:
        print("validation loss of " + str(np.min(validation_losses)) + " hasn't improved in last " + str(EARLY_STOPPING_PATIENCE) + " epochs, stopping training")
        break

<br>

### TESTING

In [None]:
# Load the weights saved at MODEL_PATH during training.
model.load_from_path(MODEL_PATH)

# Per-sample outcomes: rounded loss and pass/fail flag.
results = []
losses = []

# Transform used only for visualization. Normalize(-m/s, 1/s) is the inverse of
# Normalize(m, s) with m = [0.485, 0.456, 0.406] and s = [0.229, 0.224, 0.225]
# (presumably the normalization applied by the dataset -- confirm in
# autopilot_dataset). It is built once because it does not depend on the sample.
composed_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Normalize([-0.485/0.229, -0.456/0.224, -0.406/0.225], [1/0.229, 1/0.224, 1/0.225]),
    torchvision.transforms.ToPILImage()
])

# Evaluation mode with gradient tracking disabled.
with torch.no_grad():
    model.eval()

    # The testing loader yields one sample per batch.
    for name, image, annotation in testing_loader:
        # Predict on the GPU and clamp the outputs to [-1, 1].
        prediction = model(image.cuda()).clamp(min=-1, max=1)
        # Loss for this sample as a Python float rounded to 4 decimals.
        loss = round(float(loss_function(prediction, annotation.cuda()).cpu()), 4)
        # A sample passes when its (rounded) loss is below the threshold.
        passed = loss < ACCEPTABLE_TESTING_LOSS
        losses.append(loss)
        results.append(passed)

        # Undo the normalization and show the frame.
        display(composed_transforms(image[0]).convert("RGB"))

        # Report the sample name, expected and predicted annotations, loss and verdict.
        print(name[0])
        print(f"expected: {annotation.float()[0]!s}")
        print(f"predicted: {prediction.cpu().float()[0]!s}")
        print(f"loss: {loss!s}")
        print(f"passed: {passed!s}")
        print("")

# Summary: number of passed samples out of the total, and the average loss.
print("SCORE: " + str(sum(results)) + "/" + str(len(results)) + ", AVG LOSS: " + str(round(np.mean(losses), 4)))