# Experiment 2: LLaVA Approach - CLIP Encoder/Vicuna Decoder with LoRA

This experiment tests LLaVA-style image projections using the Hugging Face CLIP encoder and the Vicuna 7B decoder. The experiment has two stages, as outlined in the LLaVA paper.

- **Stage One**: pretrains with the encoder and decoder frozen. The only trainable parameters at this stage are in the MLP used for image projections.

- **Stage Two**: performs fine-tuning using low-rank adaptation (LoRA) for the Vicuna model, with the MLP for image projection unfrozen. The pretrained MLP weights from stage one are loaded as the starting point for the MLP in stage two.

In [None]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
import datetime
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

"""
Add support for either running in collab by uploading this notebook and
mounting the directory or locally from the room or experiments folder"
"""
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE: change the drive path if running with a mounted google drive in collab
    project_root = "/content/drive/Othercomputers/My MacBook Pro/image-captioning"
else:
    cwd = os.getcwd()

    if cwd.endswith("experiments"):
        project_root = os.path.abspath(os.path.join(cwd, '..'))
    else:
        project_root = cwd

if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root:", project_root)

if IN_COLAB:
    !pip install evaluate > /dev/null 2>&1
    !pip install pycocoevalcap > /dev/null 2>&1


from vision_language_model import VisionLanguageModel
import train as train
import data_processing as dp
import download_data as get_data
import evaluation as eval


# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Experiment id and run start time; both are embedded in the names of the
# weight, plot and caption files written by the cells below.
# NOTE(review): the notebook title says "Experiment 2" - confirm this id is intended.
experiment = "experiment_4"
timestamp = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"

In [None]:
# Dataset location. In Colab the data is kept outside the mounted Drive
# directory to avoid data transfer overhead.
data_dir = "/content/flickr30k_data" if IN_COLAB else os.path.join(project_root, "flickr30k_data")

# Download and partition only when the directory is missing or empty.
has_data = os.path.exists(data_dir) and len(os.listdir(data_dir)) > 0
if has_data:
    print(f"Data already exists in {data_dir}, skipping download.")
else:
    os.makedirs(data_dir, exist_ok=True)
    get_data.download_and_partition(data_dir)

# Output directories for saved weights and evaluation artifacts.
model_weights_dir = os.path.join(project_root, "model_weights")
evaluations_dir = os.path.join(project_root, "evaluations")
for output_dir in (model_weights_dir, evaluations_dir):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Directories of the three dataset splits.
train_dir = os.path.join(data_dir, "train")
val_dir = os.path.join(data_dir, "val")
test_dir = os.path.join(data_dir, "test")

# Training batches are streamed rather than loaded all at once to avoid memory
# issues. In training mode (eval_mode=False) each image is repeated with a 1:1
# image-to-caption mapping; the val/test loaders use a 1:N mapping for evaluation.
train_loader = dp.batch_stream(
    "captions.txt",
    train_dir,
    batch_size=4,
    eval_mode=False,
)

# Pull the first two batches and display some of their captions as a sanity check.
batch_1, batch_2 = next(train_loader), next(train_loader)
dp.visualize_random_captions([batch_1, batch_2])

## Stage One: Pretraining MLP

In [None]:
# Vicuna tokenizer; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
tokenizer.pad_token = tokenizer.eos_token

# Stage-one model configuration: CLIP image encoder, LLaVA-style projections
# (no cross attention) and a Vicuna decoder with a 4096-wide model dimension.
stage_one_config = dict(
    image_encoder_type="clip",
    llava_projections=True,
    cross_attention=False,
    debug=False,
    decoder_type="vicuna",
    d_model=4096,
    tokenizer=tokenizer,
)
model = VisionLanguageModel(**stage_one_config).to(device)

# Padding positions are excluded from the loss.
# NOTE(review): pad and EOS share an id here, so EOS targets are presumably
# ignored as well unless train.train masks padding separately - confirm.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
%%time
# stage one: pretraining

max_batches=10000
num_epochs=1

train.train(
    model=model,
    train_dir=train_dir,
    val_dir=val_dir,
    loss_function=loss_fn,
    device=device,
    batch_size=2,
    num_epochs=num_epochs,
    training_type="mlp-pretrain",
    log_interval=1000,
    max_batches=max_batches,
    lr_scheduler=True,
    random_seed=1,
    learning_rate=2e-3,
    mlp_weights_path=os.path.join(model_weights_dir, f"{experiment}_pretrain_weights_{timestamp}.pt"),
    loss_plot_path=os.path.join(evaluations_dir, f"{experiment}_pretrain_loss_{timestamp}.jpg"),
    )

## Stage Two: Fine-tune

In [None]:
%%time
num_epochs=2
max_batches=2000

train.train(
    model=model,
    train_dir=train_dir,
    val_dir=val_dir,
    loss_function=loss_fn,
    device=device,
    batch_size=2,
    num_epochs=num_epochs,
    training_type="lora",
    max_batches=max_batches,
    log_interval=500,
    learning_rate=2e-4,
    random_seed=16,
    eval_every=2,
    model_weights_path=os.path.join(model_weights_dir, f"{experiment}_finetune_weights_{timestamp}.pt"),
    loss_plot_path=os.path.join(evaluations_dir, f"{experiment}_finetune_loss_{timestamp}.jpg"),
    all_epochs_loss_plot_path=os.path.join(evaluations_dir, f"{experiment}_all_epochs_loss_{timestamp}.jpg")
)

## Evaluate on Test Set

In [None]:
# Stream the held-out test split in evaluation mode with a fixed seed.
test_loader = dp.batch_stream(
    "captions.txt",
    os.path.join(data_dir, "test"),
    batch_size=2,
    eval_mode=True,
    seed=32,
)

# Generated captions are displayed and also saved to a file tagged with the
# experiment id and the run timestamp.
captions_file = os.path.join(evaluations_dir, f"{experiment}_captions_{timestamp}.jpg")

# Score up to 500 batches, generating at most 20 new tokens per caption.
scores = eval.evaluate_bleu_cider(
    model=model,
    data_loader=test_loader,
    display_captions=True,
    save_captions_path=captions_file,
    max_new_tokens=20,
    max_batches=500,
)
bleu, cider = scores

print(f"BLEU: {bleu:.4f}, CIDEr: {cider:.4f}")

In [None]:
# save experiment results
# The CIDEr and BLEU scores from the test evaluation are recorded under this
# experiment's id in results.csv inside the evaluations directory.
results_path = os.path.join(evaluations_dir, "results.csv")

# Argument order is (experiment id, cider, bleu, output path).
eval.save_experiment_results(experiment, cider, bleu, results_path)