### CARGAR DEPENDENCIAS

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import torch.optim as optim
import numpy as np
from model import *
import os
import torchvision.transforms.functional as F
from torchvision import transforms, models
import time
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import random

### DEFINICIÓN DE PARÁMETROS

In [10]:
# Configuration cell: every tunable value of the run lives here.

# --- Run identity and model architecture ---
name = "CNNminitest"
architecture = "CNN_RNN"  # One of: "CNN", "CNN_RNN", "CNN_LSTM_STATE"
cnn_name = "efficientnet_b0"  # Options: "efficientnet_v2_s", "efficientnet_b0", "efficientnet_b1", "efficientnet_b2", "efficientnet_b3"
hidden_size = 256  # Number of neurons in the hidden layer (512 was used by Iker)
output_size = 2  # Steering and acceleration
input_size = (240, 135)  # 16:9 ratio
num_layers = 1
dropout = 0  # Needs num_layers > 1 to have any effect
bias = True
cnn_train = True  # Whether the CNN part should be trained

# --- Training hyper-parameters ---
seq_len = 5  # Number of images per sequence
batch_size = 32  # Number of sequences processed in parallel
num_epochs = 30  # Number of passes over the dataset
learning_rate = 0.001

# --- Reproducibility ---
# Model initialisation and DataLoader shuffling are random; seed every RNG the
# notebook imports so that Restart & Run All gives comparable runs.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Early stopping state ---
patience = 5  # Number of epochs without improvement that are tolerated
best_val_loss = float('inf')  # Start from the worst possible value
early_stop_counter = 0  # Epochs elapsed without improvement

# --- Data and checkpoint locations ---
train_data_dir = "./datasets/test"
validation_data_dir = "./datasets/test3"
save_dir = f"./trained_models/{name}"

# Base name for saved checkpoints; it encodes the main hyper-parameters.
model_name = f"{name}-{architecture}-{cnn_name}-{seq_len}-{input_size[0]}-{input_size[1]}-{output_size}-{hidden_size}-epoch"
print(f"{model_name}")

os.makedirs(save_dir, exist_ok=True)  # Create the checkpoint directory if missing

# TensorBoard writer used to log the loss curves
writer = SummaryWriter(log_dir="./runs/" + model_name)

CNNminitest-CNN_RNN-efficientnet_b0-5-240-135-2-256-epoch


### INICIAR MODELO Y CARGAR DATOS

In [11]:
# Build the model selected by `architecture`.
# An unknown value fails here with a clear message instead of surfacing later
# as a NameError on `model`.
if architecture == "CNN":
    model = CNN(cnn_name, output_size, (3, *input_size), dropout, bias, cnn_train)
    seq_len = 1  # The datasets below are built with this value for the plain CNN
elif architecture == "CNN_RNN":
    model = CNN_RNN(cnn_name, hidden_size, output_size, (3, *input_size), num_layers, dropout, bias)
elif architecture == "CNN_LSTM_STATE":
    model = CNN_LSTM_STATE(cnn_name, hidden_size, output_size, (3, *input_size), num_layers, dropout, bias)
    hidden_state = model.init_hidden(batch_size)
else:
    raise ValueError(
        f"Unsupported architecture {architecture!r}; "
        'expected "CNN", "CNN_RNN" or "CNN_LSTM_STATE"'
    )


# Move the model to the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Usando CUDA" if device.type == "cuda" else "USANDO CPU")


# Load the datasets (training and validation use the same settings).
train_dataset = RacingDataset(data_dir=train_data_dir, seq_len=seq_len, input_size=input_size, controller=True)
test_dataset = RacingDataset(data_dir=validation_data_dir, seq_len=seq_len, input_size=input_size, controller=True)

# Create the dataloaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Optimizer and loss function.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Mean squared error between model outputs and labels.
# Alternative (disabled): nn.CrossEntropyLoss(), optionally with class weights.
criterion = nn.MSELoss()

Usando CUDA


### ENTRENAMIENTO

In [12]:
print("Iniciando entrenamiento...")

# NOTE: best_val_loss and early_stop_counter are created in the parameters cell
# and persist between executions of this cell; re-run that cell to start a
# fresh early-stopping history.

# Mixed precision is only used on CUDA. On CPU both autocast and the gradient
# scaler are disabled and the code below runs in full precision.
use_amp = device.type == "cuda"
# float16 autocast needs loss scaling, otherwise small gradients can underflow.
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)


def _new_hidden_state(n):
    """Return a fresh recurrent state (two-tensor tuple) for a batch of `n`, placed on `device`."""
    state = model.init_hidden(n)
    return (state[0].to(device), state[1].to(device))


def _forward(inputs, state):
    """Run one forward pass and return `(outputs, state)`.

    For "CNN_LSTM_STATE" the state is re-created when its batch dimension does
    not match the current batch, passed to the model, and returned detached so
    gradients do not flow across batches. For "CNN_RNN" and "CNN" the state is
    returned unchanged. Any other architecture aborts the run (SystemExit).
    """
    if architecture == "CNN_LSTM_STATE":
        current_batch_size = inputs.size(0)
        if state[0].size(1) != current_batch_size:
            state = _new_hidden_state(current_batch_size)
        outputs, state = model(inputs, state)
        return outputs, (state[0].detach(), state[1].detach())
    if architecture in ("CNN_RNN", "CNN"):
        return model(inputs), state
    print("Arquitectura no soportada")
    raise SystemExit


start_time = time.time()  # Start of the whole training run

for epoch in range(num_epochs):

    epoch_start_time = time.time()  # Start of this epoch

    # Fresh recurrent state at the start of every epoch (stateful LSTM only).
    hidden_state = _new_hidden_state(batch_size) if architecture == "CNN_LSTM_STATE" else None

    model.train()
    running_loss = 0.0

    try:
        for i, (images, labels) in enumerate(train_loader):

            print(f"Trabajando en Epoch {epoch+1}. Progreso: {(i+1)/len(train_loader)*100:.2f}%", end='\r')

            # Dimension 0 is the batch (size(0) is used as the batch size in
            # _forward); remaining dims are presumably
            # (seq_len, channels, height, width) - TODO confirm against RacingDataset.
            input_sequence, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                outputs, hidden_state = _forward(input_sequence, hidden_state)
                # Labels presumably hold the target of the last frame of each
                # sequence - TODO confirm against RacingDataset.
                loss = criterion(outputs, labels)

            # Backpropagation on the scaled loss; the scaler unscales the
            # gradients before the optimizer step and skips steps with inf/NaN.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)
        writer.add_scalar('Loss/train', train_loss, epoch)

        # Checkpoint after every epoch
        torch.save(model.state_dict(), os.path.join(save_dir, f'{model_name}_{epoch+1}.pth'))

        print("\nValidando modelo...")

        model.eval()
        test_loss = 0.0

        # Validation keeps its own recurrent state, independent of training.
        hidden_state_val = _new_hidden_state(batch_size) if architecture == "CNN_LSTM_STATE" else None

        with torch.no_grad():
            for i, (images, labels) in enumerate(validation_loader):

                print(f"Validando en Epoch {epoch+1}. Progreso: {(i+1)/len(validation_loader)*100:.2f}%", end='\r')

                input_test_sequence, labels = images.to(device), labels.to(device)

                with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                    outputs, hidden_state_val = _forward(input_test_sequence, hidden_state_val)
                    loss = criterion(outputs, labels)
                    test_loss += loss.item()

        val_loss = test_loss / len(validation_loader)
        writer.add_scalar('Loss/test', val_loss, epoch)

        # Report the epoch before the early-stopping decision so the last
        # epoch is also summarised when training stops early.
        end_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
        epoch_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - epoch_start_time))
        print(f'\nEpoch [{epoch+1}/{num_epochs}], '
            f'Train Loss: {train_loss:.4f}, '
            f'Val Loss: {val_loss:.4f}, '
            f'Epoch Time: {epoch_time}, '
            f'Total Time: {end_time}'
            )

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            # The per-epoch checkpoint was already written above; store the
            # best weights under a stable name so they can be found later.
            torch.save(model.state_dict(), os.path.join(save_dir, f'{model_name}_best.pth'))
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered after {patience} epochs without improvement.")
                break

    except KeyboardInterrupt:
        print("\nEntrenamiento interrumpido.")
        break
print("Entrenamiento terminado.")

writer.close()  # Close the TensorBoard writer

# Release cached GPU memory; skipped on CPU-only machines, where
# torch.cuda.ipc_collect() would try to initialise CUDA and fail.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Iniciando entrenamiento...
Trabajando en Epoch 1. Progreso: 100.00%
Validando modelo...
Validando en Epoch 1. Progreso: 100.00%
Epoch [1/30], Train Loss: 0.1852, Val Loss: 0.1454, Epoch Time: 00:01:04, Total Time: 00:01:04
Trabajando en Epoch 2. Progreso: 100.00%
Validando modelo...
Validando en Epoch 2. Progreso: 100.00%
Epoch [2/30], Train Loss: 0.0749, Val Loss: 0.1432, Epoch Time: 00:01:05, Total Time: 00:02:09
Trabajando en Epoch 3. Progreso: 100.00%
Validando modelo...
Validando en Epoch 3. Progreso: 100.00%
Epoch [3/30], Train Loss: 0.0592, Val Loss: 0.1219, Epoch Time: 00:01:05, Total Time: 00:03:14
Trabajando en Epoch 4. Progreso: 6.45%
Entrenamiento interrumpido.
Entrenamiento terminado.
