In [19]:
%%capture
!pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
!pip install tqdm
!pip install scipy
!pip install opencv-python
!pip install scikit-image
!pip install tensorboard
!pip install matplotlib
!pip install timm==0.5.4
!pip install transformers datasets torch torchvision gdown

In [39]:
import argparse
import torch
import numpy as np
import logging
from pathlib import Path
import torch.nn as nn



In [2]:
!git clone https://github.com/gangweix/IGEV-plusplus/

Cloning into 'IGEV-plusplus'...
remote: Enumerating objects: 284, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 284 (delta 46), reused 0 (delta 0), pack-reused 199 (from 1)[K
Receiving objects: 100% (284/284), 17.56 MiB | 21.77 MiB/s, done.
Resolving deltas: 100% (150/150), done.


In [11]:
# Descargando el archivo kitti15.pth. Kitti15 is the pretrained weights with KITTI dataset.
import gdown
file_id = "1VG47N7gPzkg-FIhge0XHtlOVHDVmMdek"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="/content/kitti15.pth", quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1VG47N7gPzkg-FIhge0XHtlOVHDVmMdek
From (redirected): https://drive.google.com/uc?id=1VG47N7gPzkg-FIhge0XHtlOVHDVmMdek&confirm=t&uuid=9a88f9d8-b091-4826-a4a4-371b05758de3
To: /content/kitti15.pth
100%|██████████| 50.8M/50.8M [00:00<00:00, 106MB/s] 


'/content/kitti15.pth'

In [24]:

def Args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', default='igev-stereo', help="name your experiment")
    parser.add_argument('--restore_ckpt', default=None, help='load the weights from a specific checkpoint')
    parser.add_argument('--logdir', default='./checkpoints', help='the directory to save logs and checkpoints')
    parser.add_argument('--mixed_precision', default=True, action='store_true', help='use mixed precision')
    parser.add_argument('--precision_dtype', default='float16', choices=['float16', 'bfloat16', 'float32'], help='Choose precision type: float16 or bfloat16 or float32')

    # Training parameters
    parser.add_argument('--batch_size', type=int, default=8, help="batch size used during training.")
    parser.add_argument('--train_datasets', default='sceneflow', choices=['sceneflow', 'kitti', 'middlebury_train', 'middlebury_finetune', 'eth3d_train', 'eth3d_finetune'], help="training datasets.")
    parser.add_argument('--lr', type=float, default=0.0002, help="max learning rate.")
    parser.add_argument('--num_steps', type=int, default=200000, help="length of training schedule.")
    parser.add_argument('--image_size', type=int, nargs='+', default=[256, 768], help="size of the random image crops used during training.")
    parser.add_argument('--train_iters', type=int, default=22, help="number of updates to the disparity field in each forward pass.")
    parser.add_argument('--wdecay', type=float, default=.00001, help="Weight decay in optimizer.")

    # Validation parameters
    parser.add_argument('--valid_iters', type=int, default=32, help='number of flow-field updates during validation forward pass')

    # Architecure choices
    parser.add_argument('--corr_levels', type=int, default=2, help="number of levels in the correlation pyramid")
    parser.add_argument('--corr_radius', type=int, default=4, help="width of the correlation pyramid")
    parser.add_argument('--n_downsample', type=int, default=2, help="resolution of the disparity field (1/2^K)")
    parser.add_argument('--n_gru_layers', type=int, default=3, help="number of hidden GRU levels")
    parser.add_argument('--hidden_dims', nargs='+', type=int, default=[128]*3, help="hidden state and context dimensions")
    parser.add_argument('--max_disp', type=int, default=768, help="max disp range")
    parser.add_argument('--s_disp_range', type=int, default=48, help="max disp of small disparity-range geometry encoding volume")
    parser.add_argument('--m_disp_range', type=int, default=96, help="max disp of medium disparity-range geometry encoding volume")
    parser.add_argument('--l_disp_range', type=int, default=192, help="max disp of large disparity-range geometry encoding volume")
    parser.add_argument('--s_disp_interval', type=int, default=1, help="disp interval of small disparity-range geometry encoding volume")
    parser.add_argument('--m_disp_interval', type=int, default=2, help="disp interval of medium disparity-range geometry encoding volume")
    parser.add_argument('--l_disp_interval', type=int, default=4, help="disp interval of large disparity-range geometry encoding volume")

    # Data augmentation
    parser.add_argument('--img_gamma', type=float, nargs='+', default=None, help="gamma range")
    parser.add_argument('--saturation_range', type=float, nargs='+', default=[0, 1.4], help='color saturation')
    parser.add_argument('--do_flip', default=False, choices=['h', 'v'], help='flip the images horizontally or vertically')
    parser.add_argument('--spatial_scale', type=float, nargs='+', default=[-0.4, 0.8], help='re-scale the images randomly')
    parser.add_argument('--noyjitter', action='store_true', help='don\'t simulate imperfect rectification')
    args = parser.parse_args()

    torch.manual_seed(666)
    np.random.seed(666)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s')

    Path(args.logdir).mkdir(exist_ok=True, parents=True)
    # train(args)
    return args


In [None]:
"""
--name                  # Name of the experiment (default: 'igev-stereo')
--restore_ckpt          # Path to a checkpoint to resume training (default: None)
--logdir                # Directory to save logs and checkpoints (default: './checkpoints')
--mixed_precision       # Enable mixed precision training (default: True)
--precision_dtype       # Precision type: 'float16', 'bfloat16', or 'float32' (default: 'float16')

--batch_size            # Batch size (default: 8)
--train_datasets        # Dataset for training (choices: 'sceneflow', 'kitti', 'middlebury_train', 'middlebury_finetune', 'eth3d_train', 'eth3d_finetune'; default: 'sceneflow')
--lr                    # Learning rate (default: 0.0002)
--num_steps             # Total training steps (default: 200000)
--image_size            # Size of random image crops (default: [256, 768])
--train_iters           # Number of disparity updates per forward pass (default: 22)
--wdecay                # Weight decay (default: 0.00001)

--valid_iters           # Number of disparity updates during validation (default: 32)

--corr_levels           # Number of levels in the correlation pyramid (default: 2)
--corr_radius           # Width of the correlation pyramid (default: 4)
--n_downsample          # Downsampling factor for disparity field (default: 2)
--n_gru_layers          # Number of GRU layers (default: 3)
--hidden_dims           # Hidden dimensions for GRU layers (default: [128, 128, 128])
--max_disp              # Maximum disparity range (default: 768)
--s_disp_range          # Max disparity for small-range volume (default: 48)
--m_disp_range          # Max disparity for medium-range volume (default: 96)
--l_disp_range          # Max disparity for large-range volume (default: 192)
--s_disp_interval       # Disparity interval for small-range volume (default: 1)
--m_disp_interval       # Disparity interval for medium-range volume (default: 2)
--l_disp_interval       # Disparity interval for large-range volume (default: 4)

--img_gamma             # Gamma correction range (default: None)
--saturation_range      # Color saturation range (default: [0, 1.4])
--do_flip               # Flip images ('h' for horizontal, 'v' for vertical) (default: False)
--spatial_scale         # Random image rescaling range (default: [-0.4, 0.8])
--noyjitter             # Disable imperfect rectification simulation (default: False)

"""

In [41]:
import sys
import os

# Add the root of the repository to sys.path
repo_path = '/content/IGEV-plusplus/'
sys.path.append(repo_path)

from core import igev_stereo


from igev_stereo import IGEVStereo

args = Args() # Create an instance of Args

model = igev_stereo.IGEVStereo(args) # Instantiate the model

pretrained_ckpt_path = "/content/checkpoints/pretrained_models/kitti15.pth"


In [46]:
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        print(name)
for name, module in model.named_modules():
    print(f"{name}: {type(module)}")

: <class 'core.igev_stereo.IGEVStereo'>
cnet: <class 'core.extractor.MultiBasicEncoder'>
cnet.norm1: <class 'torch.nn.modules.batchnorm.BatchNorm2d'>
cnet.conv1: <class 'torch.nn.modules.conv.Conv2d'>
cnet.relu1: <class 'torch.nn.modules.activation.ReLU'>
cnet.layer1: <class 'torch.nn.modules.container.Sequential'>
cnet.layer1.0: <class 'core.extractor.ResidualBlock'>
cnet.layer1.0.conv1: <class 'torch.nn.modules.conv.Conv2d'>
cnet.layer1.0.conv2: <class 'torch.nn.modules.conv.Conv2d'>
cnet.layer1.0.relu: <class 'torch.nn.modules.activation.ReLU'>
cnet.layer1.0.norm1: <class 'torch.nn.modules.batchnorm.BatchNorm2d'>
cnet.layer1.0.norm2: <class 'torch.nn.modules.batchnorm.BatchNorm2d'>
cnet.layer1.1: <class 'core.extractor.ResidualBlock'>
cnet.layer1.1.conv1: <class 'torch.nn.modules.conv.Conv2d'>
cnet.layer1.1.conv2: <class 'torch.nn.modules.conv.Conv2d'>
cnet.layer1.1.relu: <class 'torch.nn.modules.activation.ReLU'>
cnet.layer1.1.norm1: <class 'torch.nn.modules.batchnorm.BatchNorm2d'>

In [70]:
# Agregar si encuentro mas
target_modules_candidates = [
    ".conv",  # Catches many convolutional layers
    ".conv1",
    ".conv2",
    ".conv3",
    ".q_proj", # If there are query projections in attention
    ".v_proj", # If there are value projections in attention
    ".k_proj", # If there are key projections in attention
    "g_proj", # Example, if some specific projection exists
    "c_proj",
    "attn_q", # If they have custom names like this
    "attn_v",
    "attn_k",
    "gru.weight_ih", # Convolutional GRU weights
    "gru.weight_hh"
]

actual_target_modules = []
for name, module in model.named_modules():
    for candidate in target_modules_candidates:
        if candidate in name and isinstance(module, (nn.Linear, nn.Conv2d)):
            # Filter by actual layer types LoRA can modify
            actual_target_modules.append(name)
            # Break to avoid adding the same module multiple times if multiple candidates match
            break
print(f"Discovered LoRA target modules: {actual_target_modules}")
print(f"Total LoRA target modules: {len(actual_target_modules)}")



Discovered LoRA target modules: ['cnet.conv1.base_layer', 'cnet.conv1.lora_A.default', 'cnet.conv1.lora_B.default', 'cnet.layer1.0.conv1.base_layer.base_layer', 'cnet.layer1.0.conv1.base_layer.lora_A.default', 'cnet.layer1.0.conv1.base_layer.lora_B.default', 'cnet.layer1.0.conv1.lora_A.default.base_layer', 'cnet.layer1.0.conv1.lora_A.default.lora_A.default', 'cnet.layer1.0.conv1.lora_A.default.lora_B.default', 'cnet.layer1.0.conv1.lora_B.default.base_layer', 'cnet.layer1.0.conv1.lora_B.default.lora_A.default', 'cnet.layer1.0.conv1.lora_B.default.lora_B.default', 'cnet.layer1.0.conv2.base_layer.base_layer', 'cnet.layer1.0.conv2.base_layer.lora_A.default', 'cnet.layer1.0.conv2.base_layer.lora_B.default', 'cnet.layer1.0.conv2.lora_A.default.base_layer', 'cnet.layer1.0.conv2.lora_A.default.lora_A.default', 'cnet.layer1.0.conv2.lora_A.default.lora_B.default', 'cnet.layer1.0.conv2.lora_B.default.base_layer', 'cnet.layer1.0.conv2.lora_B.default.lora_A.default', 'cnet.layer1.0.conv2.lora_B.def

In [71]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    target_modules=actual_target_modules,
    lora_dropout=0.1,
    bias="none",
    r=4,
    lora_alpha=16,
)

# In the paper, section experiments, they set the rank for RVLoRA to 4

In [72]:
model_with_lora = get_peft_model(model, peft_config)
model_with_lora.print_trainable_parameters()




trainable params: 2,277,076 || all params: 16,805,366 || trainable%: 13.5497
