# Experiments for fine tuning a pre-trained CNN for sensor processing

VGG-19 (the model used below) ends in 512 feature channels. How do we reduce / expand this to the latent space we want?

* Idea 0: just use the features as they are
* Idea 1: fine tune on proprioception


## Extracting the VGG-19 features from some pictures from a task

In [None]:
import sys
sys.path.append("..")
from exp_run_config import Config
Config.PROJECTNAME = "BerryPicker"


from pathlib import Path
from demonstration.demonstration_helper import BCDemonstration

import matplotlib.pyplot as plt

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image

# Step 1: Load the pre-trained VGG-19 model.
# NOTE(review): `pretrained=True` is the legacy torchvision argument; newer
# torchvision releases prefer `weights=...` -- confirm against the installed
# version before changing it.
vgg19 = models.vgg19(pretrained=True)
vgg19.eval()  # Set the model to evaluation mode

In [None]:
# Fetch the configuration of the "vgg19_orig" run of the "sp_cnn" experiment.
# `exp` is read by the training-setup cell further down.
run = "vgg19_orig"
exp = Config().get_experiment("sp_cnn", run)
# Currently unused path construction, kept for reference:
# model_subdir = Path(exp["data_dir"], exp["model_dir"], "models", exp["model_name"], exp["model_subdir"])
# conv_vae_jsonfile = Path(model_subdir, "config.json")


In [None]:
task = "random-uncluttered"
demos_dir = Path(Config()["demos"]["directory"])
task_dir = Path(demos_dir, "demos", task)

# Only sub-directories of the task directory are demonstrations; stray files
# are skipped. iterdir() yields entries in arbitrary order, so the candidates
# are sorted to make the selection reproducible.
demo_dirs = sorted(entry for entry in task_dir.iterdir() if entry.is_dir())
if not demo_dirs:
    raise FileNotFoundError(f"No demonstration directories found in {task_dir}")
# The last demonstration directory is the one inspected below.
demo_dir = demo_dirs[-1]

# Preprocessing that matches what VGG-19 was trained on: 224x224 input,
# normalized with the ImageNet channel statistics.
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # Normalization for ImageNet
        std=[0.229, 0.224, 0.225]
    )
])

bcd = BCDemonstration(demo_dir, sensorprocessor=None)
# Step through the frames of the demonstration. `imgtensor` and `image` keep
# the values of the last frame visited and are used by the following cells.
# NOTE(review): if bcd.maxsteps <= 2 the loop body never runs and both names
# stay undefined.
for i in range(1, bcd.maxsteps-1):
    imgtensor, image = bcd.get_image(i, transform=transform)

In [None]:
# Inspect the raw values of the last transformed frame.
print(imgtensor)

In [None]:
# Display the element type of the transformed frame.
imgtensor.dtype

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg19.to(device)

# Use only the convolutional part of the network, not the classifier head.
feature_extractor = vgg19.features
#result = vgg19(imgtensor)
# The input must live on the same device as the model; .to(device) is a no-op
# if it is already there. Gradients are not needed for plain feature
# extraction, so the forward pass runs under no_grad.
# NOTE(review): assumes imgtensor already carries a batch dimension -- confirm
# against BCDemonstration.get_image.
with torch.no_grad():
    result = feature_extractor(imgtensor.to(device))
result.shape


In [None]:
# Show the last frame returned by bcd.get_image for visual inspection.
fig, axs = plt.subplots(1,1, constrained_layout=True)
axs.imshow(image)

In [None]:
class VGG19Embedding(nn.Module):
    """Map an image batch to a latent vector using VGG-19 features.

    The convolutional feature extractor of a VGG-19 backbone is followed by a
    flatten step and a single linear projection to ``latent_size``.

    Args:
        latent_size: Size of the output latent vector.
        backbone: Optional object exposing a ``features`` module whose output
            flattens to ``512 * 7 * 7`` values per sample. When omitted, the
            notebook-level ``vgg19`` model is used.
    """

    def __init__(self, latent_size, backbone=None):
        super().__init__()
        if backbone is None:
            # Fall back to the VGG-19 instance loaded at notebook level.
            backbone = vgg19
        # The convolutional stack is exposed as `.features`.
        self.feature_extractor = backbone.features
        # Flatten the feature map for the fully connected layer.
        self.flatten = nn.Flatten()
        # 512 * 7 * 7 is the flattened feature size this layer expects.
        self.fc = nn.Linear(512 * 7 * 7, latent_size)

    def forward(self, x):
        """Return the latent projection of the image batch ``x``."""
        features = self.feature_extractor(x)
        features_flat = self.flatten(features)
        return self.fc(features_flat)

In [None]:
# Define the regression model: frozen VGG-19 features followed by an MLP head
class VGG19Regression(nn.Module):
    """Regress a target vector from images via frozen VGG-19 features.

    The convolutional feature extractor is frozen; only the three-layer MLP
    head on top of the flattened features is trainable.

    Args:
        input_size: Unused. The head's input size is fixed at ``512 * 7 * 7``;
            the parameter is kept so existing call sites keep working.
        hidden_size: Width of the two hidden layers of the MLP head.
        output_size: Size of the regression output.
        backbone: Optional object exposing a ``features`` module whose output
            flattens to ``512 * 7 * 7`` values per sample. When omitted, a
            pre-trained VGG-19 is loaded from torchvision.
    """

    def __init__(self, input_size, hidden_size, output_size, backbone=None):
        super().__init__()
        if backbone is None:
            backbone = models.vgg19(pretrained=True)
        # The convolutional stack is exposed as `.features`.
        self.feature_extractor = backbone.features
        # Flatten the feature map for the fully connected layers.
        self.flatten = nn.Flatten()
        self.model = nn.Sequential(
            nn.Linear(512 * 7 * 7, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )
        # Freeze the feature extractor so that only the MLP head is trained.
        for param in self.feature_extractor.parameters():
            param.requires_grad = False

    def forward(self, x):
        """Return the regression output for the image batch ``x``."""
        features = self.feature_extractor(x)
        flatfeatures = self.flatten(features)
        return self.model(flatfeatures)

In [None]:
# NOTE(review): `sp_conv_vae` and
# `load_demonstrations_as_proprioception_training` are not imported in any
# visible cell of this notebook -- import them before running this cell.
sp = sp_conv_vae.get_sp_of_conv_vae_experiment("vae_01")

task = exp["proprioception_training_task"]
proprioception_input_file = Path(exp["data_dir"],
                                 exp["proprioception_input_file"])
proprioception_target_file = Path(exp["data_dir"],
                                  exp["proprioception_target_file"])

# FIXME: this is going to be different, because we are going to be working on the original pictures, not on the extracted values

tr = load_demonstrations_as_proprioception_training(
    sp, task, proprioception_input_file, proprioception_target_file)
inputs_training = tr["inputs_training"]
targets_training = tr["targets_training"]
inputs_validation = tr["inputs_validation"]
targets_validation = tr["targets_validation"]

# Initialize model, loss function, and optimizer
input_size = inputs_training.size(1)
hidden_size = 64
output_size = targets_training.size(1)

print(input_size)
print(output_size)

# VGG19Regression is the regression model defined in this notebook; it takes
# the same (input_size, hidden_size, output_size) arguments.
# NOTE(review): it expects image batches, while the data loaded above is
# presumably sensor-processor output (see the FIXME) -- reconcile before
# training.
model = VGG19Regression(input_size, hidden_size, output_size)
# criterion = nn.MSELoss()
# Experiment: would this be better???
criterion = nn.L1Loss()
# `torch` is imported by the notebook; there is no bare `optim` name in scope.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)