In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path

import hydra
import torch
import wandb
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf

from opr.datasets.dataloader_factory import make_dataloaders
from opr.testing import test
from opr.training import epoch_loop
from opr.utils import flatten_dict, set_seed



In [3]:
from hydra import initialize, compose
from hydra.utils import instantiate

# Compose the Hydra config from the `configs` directory inside the notebook.
# Swap the active `compose` line to switch between the text-only and the
# lidar + camera experiment configs.
with initialize(version_base=None, config_path="configs"):
    cfg = compose(config_name='config_nclt_text.yaml')  # only_text
    # cfg = compose(config_name='config.yaml')  # lidar + cam

In [5]:
cfg.general

{'debug': False, 'seed': 31299, 'checkpoints_dir': 'checkpoints/', 'device': 'cuda', 'num_workers': 4, 'batch_expansion_th': 0.7, 'modalities': ['text_cam5', 'text_cam2', 'text_cam1', 'text_cam3', 'text_cam4'], 'test_modality': 'text', 'epochs': 60}

In [4]:
cfg.model

{'_target_': 'opr.models.base_models.ComposedModel', 'image_module': None, 'cloud_module': None, 'fusion_module': None, 'text_module': {'_target_': 'opr.models.base_models.MultiTextModule', 'text_module': {'_target_': 'opr.models.base_models.TextModule', 'text_emb_size': 100, 'hidden_size': 100}, 'fusion_module': {'_target_': 'opr.models.fusion.Concat'}}}

In [5]:
# if not cfg.general.debug and not cfg.wandb.disabled:
#     config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
#     wandb.init(
#         name=cfg.wandb.run_name,
#         project=cfg.wandb.project,
#         settings=wandb.Settings(start_method="thread"),
#         config=config_dict,
#     )
#     wandb.save(f"configs/{wandb.run.name}.yaml")
#     run_name = wandb.run.name
# else:
#     run_name = "debug"

In [6]:
# checkpoints_dir = (
#     Path(cfg.general.checkpoints_dir) / f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}_{run_name}"
# )
# if not checkpoints_dir.exists():
#     checkpoints_dir.mkdir(parents=True)

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

# Train TFIDF and PCA

In [7]:
# Root directory of the preprocessed NCLT dataset (absolute, machine-specific path).
base_path = "/home/docker_opr/Datasets/NCLT_preprocessed"
# train.csv is the table of training frames; later cells read its `track` and `image` columns.
train_df_path = os.path.join(base_path, "train.csv")

train_df = pd.read_csv(train_df_path)
# Preview the first two rows to check the columns.
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,track,image,pointcloud,northing,easting
0,0,2012-01-08,1326030979526128,1326030979526128,0.338313,0.222726
1,1,2012-01-08,1326031015326922,1326031015326922,-2.798348,-9.238316


In [8]:
tracks = train_df["track"].unique()

# One array of description strings per (track, camera) pair; flattened at the end.
per_camera_descriptions = []

for track in tracks:
    track_dir = os.path.join(base_path, track)
    track_rows = train_df.loc[train_df["track"] == track]
    # The per-camera CSVs are matched on "<image id>.png" in their `path` column.
    image_names = [f"{int(stamp)}.png" for stamp in track_rows["image"].values]
    for cam_id in range(1, 6):
        cam_table = pd.read_csv(os.path.join(track_dir, f"descriptions_Cam{cam_id}.csv"))
        # Keep only the descriptions whose image belongs to the training split.
        in_train_split = cam_table["path"].isin(image_names)
        per_camera_descriptions.append(cam_table.loc[in_train_split, "description"].values)

# Single flat 1-D array holding every selected description.
descriptions = np.hstack(per_camera_descriptions)

In [9]:
descriptions.shape, len(train_df) * 5

((14640,), 14640)

In [10]:
def train_tfidf_pca(corpus, 
                    max_features_tfidf=None,
                    n_components_pca=100, 
                    base_savepath="./opr/datasets/",
                    random_state=None):
    """Fit a TF-IDF vectorizer and a PCA reducer on ``corpus`` and save both with joblib.

    Args:
        corpus: Iterable of raw text documents.
        max_features_tfidf: Forwarded to ``TfidfVectorizer(max_features=...)``.
        n_components_pca: Forwarded to ``PCA(n_components=...)``.
        base_savepath: Directory that receives ``vectorizer.joblib`` and
            ``pca.joblib``. It is created if it does not exist yet.
        random_state: Forwarded to ``PCA(random_state=...)``. Pass an int to make
            the fitted projection repeatable when scikit-learn selects a
            randomized SVD solver. The default ``None`` keeps the previous
            (unseeded) behaviour.

    Returns:
        tuple: ``(vectorizer, pca)`` - the fitted objects, so callers can use
        them directly instead of re-loading the joblib files.
    """
    vectorizer = TfidfVectorizer(max_features=max_features_tfidf)
    # fit_transform learns the vocabulary and vectorizes in a single pass.
    # PCA is fitted on a dense matrix, hence the toarray() call.
    vectorized_corpus = vectorizer.fit_transform(corpus).toarray()
    print("n tfidf features = ", vectorizer.get_feature_names_out().shape)

    pca = PCA(n_components=n_components_pca, random_state=random_state)
    pca.fit(vectorized_corpus)

    # joblib.dump does not create missing directories; an empty path means the
    # current working directory, which needs no creation.
    if base_savepath:
        os.makedirs(base_savepath, exist_ok=True)

    vectorizer_savepath = os.path.join(base_savepath, 'vectorizer.joblib')
    pca_savepath = os.path.join(base_savepath, 'pca.joblib')
    dump(vectorizer, vectorizer_savepath)
    dump(pca, pca_savepath)

    return vectorizer, pca

def text_transform(self, text):
    """Turn one text string into a float32 tensor via ``self.vectorizer`` then ``self.pca``.

    Args:
        self: Object exposing fitted ``vectorizer`` and ``pca`` attributes.
        text: A single document; it is wrapped in a one-element list before
            being passed to the vectorizer.

    Returns:
        torch.Tensor: ``float32`` tensor built from the output of ``self.pca.transform``.
    """
    tfidf_row = self.vectorizer.transform([text]).toarray()
    reduced_row = self.pca.transform(tfidf_row)
    return torch.tensor(reduced_row, dtype=torch.float32)

In [11]:
# Fit and save the TF-IDF / PCA text preprocessors on the collected descriptions,
# but only when the camera-1 text modality is configured.
# NOTE(review): a config listing only other text_cam* modalities would skip this - confirm that is intended.
if "text_cam1" in cfg.general.modalities:
    print("text here")
    train_tfidf_pca(descriptions)

text here
n tfidf features =  (7857,)


In [12]:
# dataset_root = "/home/docker_opr/Datasets/NCLT_preprocessed"
# tracks = [i for i in os.listdir(dataset_root) if os.path.isdir(os.path.join(dataset_root, i))]
# df_dict = {}

# for track in tracks:
#     track_path = os.path.join(dataset_root, track)
#     df_dict[track] = {f"cam{n}" : pd.read_csv(os.path.join(track_path, f"descriptions_Cam{n}.csv")) for n in range(1, 6)}

In [13]:
# track = '2012-02-12'
# cam = "cam1"
# image = "1329070191225477"

# cam_df = df_dict[track][cam]
# text = cam_df[cam_df["path"] == f"{image}.png"]["description"].values[0]
# text

In [14]:
# print("=> Instantiating model...")
# model = instantiate(cfg.model)

# print("=> Instantiating loss...")
# loss_fn = instantiate(cfg.loss)

# print("=> Making dataloaders...")
# dataloaders = make_dataloaders(
#     dataset_cfg=cfg.dataset.dataset,
#     batch_sampler_cfg=cfg.dataset.sampler,
#     num_workers=cfg.dataset.num_workers,
# )

In [15]:
# batch, _, _ = next(iter(dataloaders["train"]))

In [16]:
# batch["text_emb_cam1"].shape

In [17]:
# model = instantiate(cfg.model)

In [18]:
# model(batch)["text"].shape

In [7]:
# Seed the RNGs before anything stochastic (weight init, samplers) is created;
# the config carries a seed and `set_seed` is imported, but it was never applied.
# NOTE(review): assumes opr.utils.set_seed takes the seed as its first positional argument - confirm.
set_seed(cfg.general.seed)

print("=> Instantiating model...")
model = instantiate(cfg.model)

print("=> Instantiating loss...")
loss_fn = instantiate(cfg.loss)

print("=> Making dataloaders...")
dataloaders = make_dataloaders(
    dataset_cfg=cfg.dataset.dataset,
    batch_sampler_cfg=cfg.dataset.sampler,
    num_workers=cfg.dataset.num_workers,
)

print("=> Instantiating optimizer...")
params_list = []
# Collapse e.g. "text_cam1".."text_cam5" to the base modality "text".
# dict.fromkeys de-duplicates while keeping the config order, so the optimizer's
# parameter-group order is the same on every run (a plain set of strings can
# iterate in a different order after each kernel restart).
modalities = list(dict.fromkeys(m.split("_")[0] for m in cfg.general.modalities))
for modality in modalities:
    # One parameter group per modality sub-module, each with its own learning rate.
    params_list.append(
        {
            "params": getattr(model, f"{modality}_module").parameters(),
            "lr": cfg.optimizer.learning_rates[f"{modality}_lr"],
        }
    )
optimizer = instantiate(cfg.optimizer.fn, params=params_list)
print("Instantiating scheduler...")
scheduler = instantiate(cfg.scheduler, optimizer=optimizer)

model = model.to(cfg.general.device)

=> Instantiating model...
=> Instantiating loss...
=> Making dataloaders...
=> Instantiating optimizer...
Instantiating scheduler...


In [9]:
model, optimizer, modalities

(ComposedModel(
   (text_module): MultiTextModule(
     (text_module): TextModule(
       (fc1): Linear(in_features=100, out_features=100, bias=True)
       (relu): ReLU()
       (fc2): Linear(in_features=100, out_features=100, bias=True)
     )
     (fusion_module): Concat()
   )
 ),
 Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     eps: 1e-08
     foreach: None
     initial_lr: 0.001
     lr: 0.001
     maximize: False
     weight_decay: 0.0001
 ),
 ['text'])

In [10]:
# Debug runs, or runs with W&B disabled, skip tracking and use a fixed run name.
if cfg.general.debug or cfg.wandb.disabled:
    run_name = "debug"
else:
    # Resolve the whole config into plain containers so it can be attached to the W&B run.
    config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
    wandb.init(
        name=cfg.wandb.run_name,
        project=cfg.wandb.project,
        settings=wandb.Settings(start_method="thread"),
        config=config_dict,
    )
    wandb.save(f"configs/{wandb.run.name}.yaml")
    run_name = wandb.run.name

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpetili[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
# Per-run checkpoint directory: <checkpoints_dir>/<timestamp>_<run_name>.
checkpoints_dir = (
    Path(cfg.general.checkpoints_dir) / f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}_{run_name}"
)
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
checkpoints_dir.mkdir(parents=True, exist_ok=True)

In [12]:
# Main experiment loop: for every epoch run train -> (optional batch expansion)
# -> validation -> test, then save a checkpoint and track the best Recall@1.

# Best test Recall@1 seen so far; decides when `best.pth` is overwritten.
best_recall_at_1 = 0.0

# for epoch in range(40, 60):
for epoch in range(cfg.general.epochs):
    print(f"\n\n=====> Epoch {epoch+1}:")
    # Batch sizes are read at the start of the epoch, i.e. before any expansion
    # triggered below, so the logged values are the ones read here.
    # TODO: resolve mypy typing here
    train_batch_size = dataloaders["train"].batch_sampler.batch_size  # type: ignore
    val_batch_size = dataloaders["val"].batch_sampler.batch_size  # type: ignore

    print("\n=> Training:\n")

    # epoch_loop returns a stats dict and a "rate non zero" value
    # (reported below as the average non-zero triplet ratio; it may be None).
    train_stats, train_rate_non_zero = epoch_loop(
        dataloader=dataloaders["train"],
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        phase="train",
        device=cfg.general.device,
    )

    print(f"\ntrain_rate_non_zero = {train_rate_non_zero}")

    # Batch expansion policy, controlled by `general.batch_expansion_th`:
    #   None  -> never expand
    #   1.0   -> expand after every epoch
    #   other -> expand when the non-zero ratio drops below the threshold
    batch_expansion_th = cfg.general.batch_expansion_th
    if batch_expansion_th is not None:
        if batch_expansion_th == 1.0:
            print("Batch expansion rate is set to every epoch. Increasing batch size.")
            # TODO: resolve mypy typing here
            dataloaders["train"].batch_sampler.expand_batch()  # type: ignore
        elif train_rate_non_zero is None:
            # A threshold is configured but there is no ratio to compare it with.
            print(
                "\nWARNING: 'batch_expansion_th' was set, but 'train_rate_non_zero' is None. ",
                "The batch size was not expanded.",
            )
        elif train_rate_non_zero < batch_expansion_th:
            print(
                "Average non-zero triplet ratio is less than threshold: ",
                f"{train_rate_non_zero} < {batch_expansion_th}",
            )
            # TODO: resolve mypy typing here
            dataloaders["train"].batch_sampler.expand_batch()  # type: ignore

    print("\n=> Validating:\n")

    # Same loop in "val" phase; no scheduler is passed here.
    # NOTE(review): the optimizer is still passed - presumably unused in "val" phase; confirm in epoch_loop.
    val_stats, val_rate_non_zero = epoch_loop(
        dataloader=dataloaders["val"],
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        phase="val",
        device=cfg.general.device,
    )

    print(f"\nval_rate_non_zero = {val_rate_non_zero}")

    print("\n=> Testing:\n")

    # Retrieval evaluation on the test split using the descriptor stored under
    # the `general.test_modality` key.
    recall_at_n, recall_at_one_percent, mean_top1_distance = test(
        model=model,
        descriptor_key=cfg.general.test_modality,
        dataloader=dataloaders["test"],
        device=cfg.general.device,
    )

    # Gather the epoch's metrics; `recall_at_n` is indexed from zero, so
    # Recall@k is read from position k-1.
    stats_dict = {}
    stats_dict["test"] = {
        "mean_top1_distance": mean_top1_distance,
        "recall_at_1%": recall_at_one_percent,
        "recall_at_1": recall_at_n[0],
        "recall_at_3": recall_at_n[2],
        "recall_at_5": recall_at_n[4],
        "recall_at_10": recall_at_n[9],
    }
    # The stats dicts returned by epoch_loop are stored as-is and extended with
    # the batch size (this writes into `train_stats` / `val_stats` themselves).
    stats_dict["train"] = train_stats
    stats_dict["train"]["batch_size"] = train_batch_size
    stats_dict["val"] = val_stats
    stats_dict["val"]["batch_size"] = val_batch_size

    # saving checkpoints
    # Each epoch gets its own file holding the config, metrics, model weights and optimizer state.
    checkpoint_dict = {
        "epoch": epoch + 1,
        "config": cfg,
        "stats_dict": stats_dict,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    torch.save(checkpoint_dict, checkpoints_dir / f"epoch_{epoch+1}.pth")
    # wandb logging
    # Skipped under the same debug / disabled condition used when initialising W&B.
    if not cfg.general.debug and not cfg.wandb.disabled:
        wandb.log(flatten_dict(stats_dict))
        wandb.save(str((checkpoints_dir / f"epoch_{epoch+1}.pth").relative_to(".")))
    # Keep a separate copy of the checkpoint with the best test Recall@1 so far.
    if recall_at_n[0] > best_recall_at_1:
        print("Recall@1 improved!")
        torch.save(checkpoint_dict, checkpoints_dir / "best.pth")
        best_recall_at_1 = recall_at_n[0]
        if not cfg.general.debug and not cfg.wandb.disabled:
            wandb.save(str((checkpoints_dir / "best.pth").relative_to(".")))



=====> Epoch 1:

=> Training:



train:   8%|▊         | 30/377 [00:02<00:24, 14.32it/s]