In [1]:
import pandas as pd

In [4]:
# Load the three dataset splits; the first CSV column is used as the row index.
train, val, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("public/train.csv", "public/val.csv", "public/test.csv")
)

In [8]:
# (rows, columns) of every split, in train / val / test order.
tuple(split.shape for split in (train, val, test))

((1359, 14), (496, 14), (1855, 14))

In [10]:
# Total number of rows across the three splits.
sum(split.shape[0] for split in (train, val, test))

3710

In [13]:
# Stack the splits row-wise; each split keeps its own index labels.
full_df = pd.concat(objs=(train, val, test), axis=0)

In [15]:
# Preview with a fresh 0..N-1 index; the result is only displayed, `full_df` itself is not modified.
full_df.reset_index(drop=False)

Unnamed: 0.1,Unnamed: 0,timestamp,track,front_cam_ts,back_cam_ts,lidar_ts,northing,easting,tz,qx,qy,qz,qw,back_description,front_description
0,0,1676034260851770,2023-02-10-08-04-19-twilight,1676034260821540,1676034260865870,1676034260851770,-23.839826,-17.655232,-1.474530,0.013051,0.028187,0.003870,0.999510,This image shows a man standing on the stairs ...,This is a picture of a city street with buildi...
1,1,1676034268718093,2023-02-10-08-04-19-twilight,1676034268687989,1676034268708533,1676034268718093,-19.060815,-19.003152,-2.513494,0.017344,0.019243,-0.121944,0.992199,This image shows two people standing outside i...,The image shows a street with buildings on eit...
2,2,1676034274365801,2023-02-10-08-04-19-twilight,1676034274404483,1676034274382407,1676034274365801,-14.289875,-20.252643,-2.563523,0.019104,0.025107,0.103990,0.994078,"The scene shows a long, multi-story building w...",This image shows a road with buildings on eith...
3,3,1676034280013503,2023-02-10-08-04-19-twilight,1676034280004376,1676034279987523,1676034280013503,-10.200171,-17.323240,-2.689153,0.006686,0.016921,0.394677,0.918640,The scene depicted in the image is a city stre...,This is a road that leads to a building with w...
4,4,1676034285358658,2023-02-10-08-04-19-twilight,1676034285387661,1676034285339092,1676034285358658,-6.752502,-13.812644,-2.623646,0.003145,0.013536,0.374985,0.926927,The image shows a long street with buildings o...,"The image shows a road in the winter, with sno..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3705,1850,1678905610167111,2023-03-15-13-25-48-night,1678905610166555,1678905610148946,1678905610167111,-208.692754,19.746243,-2.661155,0.010751,0.021335,0.389329,0.920789,"The scene is a dark, snowy street with buildin...",The scene shows a dark and cold night with sno...
3706,1851,1678905615814276,2023-03-15-13-25-48-night,1678905615783690,1678905615831108,1678905615814276,-204.946252,23.201232,-2.763870,-0.001526,0.012262,0.270024,0.962774,The image shows a woman standing on a sidewalk...,This scene shows a city street at night with a...
3707,1852,1678905621059160,2023-03-15-13-25-48-night,1678905621067311,1678905621028555,1678905621059160,-200.439570,25.320586,-2.827866,0.000658,0.011407,0.146494,0.989146,The image shows a narrow alleyway in the middl...,This is a city street at night with several st...
3708,1853,1678905626909006,2023-03-15-13-25-48-night,1678905626884315,1678905626895739,1678905626909006,-195.479406,26.327213,-2.666329,0.007004,-0.019667,-0.149021,0.988614,The image is of a building with a bright orang...,"The image shows a dark, snowy street with stre..."


# Building the positives / non-negatives indexes


In [7]:
import argparse
import pickle
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from pandas import DataFrame
from pytorch_metric_learning.distances import LpDistance


def parse_args() -> Tuple[Path, float, float]:
    """Parse input CLI arguments.

    Recognised options:
        --dataset_root (required): path to the preprocessed dataset root directory.
        --pos_threshold (optional, default 7.5): positive distance threshold in meters.
        --neg_threshold (optional, default 25.0): negative distance threshold in meters.

    Raises:
        ValueError: If the given '--dataset_root' does not exist or is not a directory.

    Returns:
        Tuple[Path, float, float]: Dataset root path, positive distance threshold
            and negative distance threshold, in this order.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset_root",
        required=True,
        type=Path,
        help="The path to the preprocessed dataset root directory.",
    )
    parser.add_argument(
        "--pos_threshold",
        required=False,
        type=float,
        default=7.5,
        help="Positive distance threshold in meters. Defaults to 7.5.",
    )
    parser.add_argument(
        "--neg_threshold",
        required=False,
        type=float,
        default=25.0,
        help="Negative distance threshold in meters. Defaults to 25.0.",
    )
    args = parser.parse_args()

    dataset_root: Path = args.dataset_root
    # `is_dir()` rather than `exists()`: a regular file is not a valid dataset root,
    # so it is rejected here instead of being accepted and yielding no subsets later.
    if not dataset_root.is_dir():
        raise ValueError("Given dataset_root directory does not exist.")

    return dataset_root, args.pos_threshold, args.neg_threshold


def build_positives_index(df: DataFrame, distance_threshold: float) -> Dict[int, List[int]]:
    """Build index of positive elements for given DataFrame.

    Two rows are positives of each other when the Euclidean distance between their
    ("northing", "easting") coordinates is strictly below ``distance_threshold``.
    A row is never listed among its own positives.

    Args:
        df (DataFrame): The dataset DataFrame; must contain "northing" and "easting" columns.
        distance_threshold (float): UTM distance threshold in meters for positive elements.

    Returns:
        Dict[int, List[int]]: Element with key `i` contains list of positive elements indexes.
            Keys and values are positional row numbers (0 .. len(df) - 1), not the
            DataFrame's index labels.
    """
    coords = torch.tensor(df[["northing", "easting"]].to_numpy(dtype=np.float64))
    # Pairwise Euclidean (L2) distance matrix between all rows.
    positives_mask = torch.cdist(coords, coords, p=2).numpy() < distance_threshold

    result: Dict[int, List[int]] = {}
    for i, row in enumerate(positives_mask):
        # 1-D array of column positions within the threshold.
        neighbours = np.flatnonzero(row)
        # Drop only the element itself. Filtering by value (instead of np.delete with
        # 2-D argwhere coordinates) avoids also removing the first neighbour by accident.
        result[i] = neighbours[neighbours != i].tolist()
    return result


def build_nonnegatives_index(df: DataFrame, distance_threshold: float) -> Dict[int, List[int]]:
    """Build index of non-negative elements for given DataFrame.

    A row `j` is a non-negative of row `i` when the Euclidean distance between their
    ("northing", "easting") coordinates is strictly below ``distance_threshold``.
    Unlike the positives index, the element itself is kept in its own list.

    Args:
        df (DataFrame): The dataset DataFrame; must contain "northing" and "easting" columns.
        distance_threshold (float): UTM distance threshold in meters for negative elements.

    Returns:
        Dict[int, List[int]]: Element with key `i` contains list of non-negative elements indexes.
            Keys and values are positional row numbers (0 .. len(df) - 1), not the
            DataFrame's index labels. Every value is a list, even when it has one entry.
    """
    coords = torch.tensor(df[["northing", "easting"]].to_numpy(dtype=np.float64))
    # Pairwise Euclidean (L2) distance matrix between all rows.
    nonnegatives_mask = torch.cdist(coords, coords, p=2).numpy() < distance_threshold

    result: Dict[int, List[int]] = {}
    for i, row in enumerate(nonnegatives_mask):
        # flatnonzero always yields a 1-D array, so a single hit stays a one-element
        # list (argwhere(...).squeeze() would collapse it to a bare int).
        result[i] = np.flatnonzero(row).tolist()
    return result


def build_indexes(dataset_root: Path, pos_threshold: float, neg_threshold: float) -> None:
    """Build and pickle the positives / non-negatives indexes for every available subset.

    For each of "train", "val" and "test" whose `<subset>.csv` exists under
    ``dataset_root``, the CSV is read (first column as index), both indexes are
    built, and each one is pickled into ``dataset_root``. Subsets without a CSV
    file are skipped.

    Args:
        dataset_root (Path): Directory holding the subset CSV files; the pickles are written here too.
        pos_threshold (float): Distance threshold passed to `build_positives_index`.
        neg_threshold (float): Distance threshold passed to `build_nonnegatives_index`.
    """
    for subset in ("train", "val", "test"):
        csv_path = dataset_root / f"{subset}.csv"
        if not csv_path.exists():
            continue

        subset_df = pd.read_csv(csv_path, index_col=0)

        positives = build_positives_index(subset_df, distance_threshold=pos_threshold)
        with open(dataset_root / f"{subset}_positives_index.pkl", "wb") as out_file:
            pickle.dump(positives, out_file)
            print(f"Saved {subset}_positives_index.pkl")

        nonnegatives = build_nonnegatives_index(subset_df, distance_threshold=neg_threshold)
        with open(dataset_root / f"{subset}_nonnegatives_index.pkl", "wb") as out_file:
            pickle.dump(nonnegatives, out_file)
            print(f"Saved {subset}_nonnegatives_index.pkl")


if __name__ == "__main__":
    # NOTE: CLI parsing via parse_args() is bypassed here; the dataset location and
    # both thresholds are hard-coded instead.
    build_indexes(
        dataset_root=Path(r"C:\Users\pqlet\pass\hackathons\MIPT_2023\task1_scene\public_new_csv"),
        pos_threshold=7.5,
        neg_threshold=25,
    )

Saved train_positives_index.pkl
Saved train_nonnegatives_index.pkl
Saved val_positives_index.pkl
Saved val_nonnegatives_index.pkl
Saved test_positives_index.pkl
Saved test_nonnegatives_index.pkl
