# Loading the data

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd

## Paths

In [None]:
# Parquet shards of the test set (parts 1 and 2).
urls_test_parquet = [
    f"https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_2/test_part_000{part}.snappy.parquet"
    for part in (1, 2)
]

# Zip archives with the test title images: chunks 1 through 8.
urls_test = [
    f"https://storage.yandexcloud.net/avitotechmlchallenge2025-2/test_title_images_decr/part_0001-0002-chunk_000{chunk}.zip"
    for chunk in range(1, 9)
]

In [None]:
# Parquet shards of the training set (parts 1 through 4).
urls_train_parquet = [
    "https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_2/train_part_0001.snappy.parquet",
    "https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_2/train_part_0002.snappy.parquet",
    "https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_2/train_part_0003.snappy.parquet",
    "https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_2/train_part_0004.snappy.parquet",
]

# Number of image archives per train part. The embedding run recorded later in
# this notebook processes chunk_0001..chunk_0008 for parts 1-3 and
# chunk_0001..chunk_0006 for part 4 (30 folders in total); the previous ranges
# stopped one chunk short for every part, so those archives were never fetched.
_TRAIN_IMAGE_CHUNKS = {1: 8, 2: 8, 3: 8, 4: 6}

urls_train = [
    f"https://storage.yandexcloud.net/avitotechmlchallenge2025-2/train_title_images_decr/train_images_part_000{part}-chunk_000{chunk}.zip"
    for part, n_chunks in _TRAIN_IMAGE_CHUNKS.items()
    for chunk in range(1, n_chunks + 1)
]


In [None]:
# Directory (relative to the current working directory) used as the download
# target; it is created here if it does not exist yet.
output_dir = "avitotech_data"
os.makedirs(output_dir, exist_ok=True)

## Download

In [None]:
def download_file(output_dir, url):
    """Download ``url`` into ``output_dir`` and return the local file path.

    The local file name is the last path component of ``url``. If that file
    already exists, the download is skipped. Data is streamed into a temporary
    ``.part`` file that is renamed only after the transfer has finished, so an
    interrupted download is never reported as "Already downloaded" on the next
    run.

    Args:
        output_dir: Destination directory; created if it does not exist.
        url: URL of the file to fetch.

    Returns:
        The path of the downloaded (or already present) file, or ``None`` if
        the download failed. Failures are printed instead of raised so that one
        bad URL does not abort a batch submitted through an executor.
    """
    filename = os.path.join(output_dir, url.split("/")[-1])
    if os.path.exists(filename):
        print(f"Already downloaded: {filename}")
        return filename

    partial = filename + ".part"
    try:
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # The timeout keeps a stalled connection from blocking a worker forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            total = int(r.headers.get('content-length', 0))
            with open(partial, "wb") as f, tqdm(
                desc=f"Downloading {os.path.basename(filename)}",
                total=total or None,  # unknown size: let tqdm show a plain counter
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        bar.update(len(chunk))
        # Publish the file under its final name only once it is complete.
        os.replace(partial, filename)
        return filename
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Best-effort removal of the incomplete temporary file.
        try:
            if os.path.exists(partial):
                os.remove(partial)
        except OSError:
            pass
        return None

In [None]:
# `executor.map(fn, a, b)` zips its iterables, so passing the `output_dir`
# string as an iterable would hand each call a single *character* of the
# directory name. Bind the directory explicitly and map over the URLs only.
with ThreadPoolExecutor(max_workers=12) as executor:
    list(executor.map(lambda url: download_file(output_dir, url), urls_train_parquet))

In [None]:
# `executor.map(fn, a, b)` zips its iterables, so passing the `output_dir`
# string as an iterable would hand each call a single *character* of the
# directory name. Bind the directory explicitly and map over the URLs only.
with ThreadPoolExecutor(max_workers=12) as executor:
    list(executor.map(lambda url: download_file(output_dir, url), urls_test_parquet))

In [None]:
# Switch the working directory to the download folder; every relative path in
# the following cells is resolved against it.
os.chdir(output_dir)

In [None]:
# Create the `unzipped` folder in the current working directory.
# NOTE(review): presumably meant to hold the extracted image folders — confirm
# against the extraction cell, which does not reference `photo_dir`.
photo_dir = "unzipped"
os.makedirs(photo_dir, exist_ok=True)

In [None]:
# Make sure the target folder exists before the workers start writing into it.
os.makedirs("train", exist_ok=True)

# `executor.map(fn, a, b)` zips its iterables, so passing the string "train"
# as an iterable would call download_file("t", ...), download_file("r", ...),
# and so on, and stop after five URLs. Bind the directory and map over the
# URLs only.
with ThreadPoolExecutor(max_workers=12) as executor:
    list(executor.map(lambda url: download_file("train", url), urls_train))

In [None]:
# Make sure the target folder exists before the workers start writing into it.
os.makedirs("test", exist_ok=True)

# `executor.map(fn, a, b)` zips its iterables, so passing the string "test"
# as an iterable would use one character per call as the directory and stop
# after four URLs. Bind the directory and map over the URLs only.
with ThreadPoolExecutor(max_workers=12) as executor:
    list(executor.map(lambda url: download_file("test", url), urls_test))

## Check data from two frames

In [None]:
# Load the first train shard and the first test shard (paths are relative to
# the current working directory) so their columns can be compared below.
df1 = pd.read_parquet("train_part_0001.snappy.parquet")
df2 = pd.read_parquet("test_part_0001.snappy.parquet")

In [None]:
# Compare the schemas of the train and test frames with set algebra.
# The previous loop filled the two sets the wrong way round: columns missing
# from one of the frames ended up in `inter`, and the shared ones in `uni`.
cols_train = set(df1.columns)
cols_test = set(df2.columns)

clms = cols_train | cols_test   # every column seen in either frame
inter = cols_train & cols_test  # intersection: columns present in both frames
uni = cols_train ^ cols_test    # unique: columns present in only one frame

In [None]:
# Display the `uni` set built by the column comparison.
uni

In [None]:
# Display the `inter` set built by the column comparison.
inter

In [None]:
# Show the first row of the train shard as a column -> value listing.
df1.iloc[0]

# Insides of data

## Loading test + train

In [1]:
import pandas as pd
import os
import numpy as np
import json
import zipfile
import glob
import os
from tqdm import tqdm

In [2]:
# Build the path with os.path.join: a literal backslash is a path separator
# only on Windows, so the hard-coded "avitotech_data\\avitotech_data" fails
# everywhere else. On Windows the resulting path is unchanged.
os.chdir(os.path.join("avitotech_data", "avitotech_data"))

In [38]:
# Read the four train shards, in order, from the current working directory.
df_train_1, df_train_2, df_train_3, df_train_4 = map(
    pd.read_parquet,
    (
        "train_part_0001.snappy.parquet",
        "train_part_0002.snappy.parquet",
        "train_part_0003.snappy.parquet",
        "train_part_0004.snappy.parquet",
    ),
)

# Read the two test shards the same way.
df_test_1, df_test_2 = map(
    pd.read_parquet,
    (
        "test_part_0001.snappy.parquet",
        "test_part_0002.snappy.parquet",
    ),
)

In [4]:
# Row counts of every train shard, followed by their total.
n_train_1, n_train_2, n_train_3, n_train_4 = (
    len(shard) for shard in (df_train_1, df_train_2, df_train_3, df_train_4)
)
print(f'In first  file: {n_train_1}')
print(f'In second file: {n_train_2}')
print(f'In third  file: {n_train_3}')
print(f'In forth  file: {n_train_4}')
print(f'Total : {n_train_1 + n_train_2 + n_train_3 + n_train_4}')

# Same for the test shards.
n_test_1, n_test_2 = len(df_test_1), len(df_test_2)
print(f'In first test: {n_test_1}')
print(f'In second test: {n_test_2}')
print(f'Total : {n_test_1 + n_test_2}')

In first  file: 500000
In second file: 500000
In third  file: 500000
In forth  file: 379555
Total : 1879555
In first test: 250000
In second test: 250000
Total : 500000


In [39]:
# Stack the shards into one train frame and one test frame. `ignore_index` is
# not passed, so every shard keeps its own row index in the result.
df_train = pd.concat([df_train_1, df_train_2, df_train_3, df_train_4])

df_test = pd.concat([df_test_1, df_test_2])

## Check id

### Nulls check

In [None]:
# `Series.isnull` is an alias of `Series.isna`, so the former second branch
# ('NAs!') repeated the first test and could never run. One vectorised check
# over both id columns is enough.
id_columns = ['base_item_id', 'cand_item_id']

if df_train[id_columns].isna().any().any():
    print('Nulls!')
else:
    print('Excellent!')

### Counting pairs

In [None]:
# 2-column array of (base id, candidate id), plus the set of all ids seen in
# either column.
id_frame = df_train[['base_item_id', 'cand_item_id']]
pairs = id_frame.to_numpy()
unique_ids = set(pairs.ravel())

In [None]:
# Order the two ids inside every pair so that (a, b) and (b, a) compare equal,
# then order the pairs by their first id (stable sort) and deduplicate.
pairs = sorted((sorted(pair) for pair in pairs), key=lambda ordered: ordered[0])
unique_pairs = {tuple(pair) for pair in pairs}

In [None]:
# Report the number of pairs, of distinct pairs, and of distinct ids.
print(f"Всего пар: {len(pairs):,}")
print(f"Всего уникальный пар: {len(unique_pairs):,}")

print(f"Всего уникальный ID: {len(unique_ids):,}")

In [None]:
# Count the pairs whose two ids are equal, i.e. an item compared with itself.
same = sum(1 for first, second in pairs if first == second)

print(f'Количетсво сравнений одного предложения: {same}')

## Проверка описаний

### Уникальность значений и пересечение train - test

In [None]:
# Distinct category names on the base side and on the candidate side.
base_categories = df_train['base_category_name'].unique()
cand_categories = df_train['cand_category_name'].unique()

print(f"Количество категорий base: {len(base_categories)}")
print(f"Категории base: {list(base_categories)}")

print()

print(f"Количество категорий cand: {len(cand_categories)}")
print(f"Категории cand: {list(cand_categories)}")

In [None]:
# Distinct subcategory names on the base side and on the candidate side.
base_subcategories = df_train['base_subcategory_name'].unique()
cand_subcategories = df_train['cand_subcategory_name'].unique()

print(f"Количество подкатегорий base: {len(base_subcategories)}")
print(f"Подкатегории base: {list(base_subcategories)}")

print()

print(f"Количество подкатегорий cand: {len(cand_subcategories)}")
print(f"Подкатегории cand: {list(cand_subcategories)}")

In [None]:
# Distinct `param1` values for both sides of a pair.
# The copy-pasted labels used to say "подкатегорий base" for every line, which
# mislabelled the candidate column as "base" and the parameter as a
# subcategory; the labels now name the column that is actually printed.
for side in ("base", "cand"):
    values = df_train[f"{side}_param1"].unique()

    if side != "base":
        print()  # blank line between the two sides

    print(f"Количество значений param1 {side}: {len(values)}")
    # None cannot be ordered against strings, so it is reported separately.
    print(f"Значения param1 {side}: {sorted(el for el in values if el is not None)}")
    print(f"Есть ли None: {None in values}")

In [None]:
# Distinct `param2` values for both sides of a pair.
# The copy-pasted labels used to say "подкатегорий base" for every line, which
# mislabelled the candidate column as "base" and the parameter as a
# subcategory; the labels now name the column that is actually printed.
for side in ("base", "cand"):
    values = df_train[f"{side}_param2"].unique()

    if side != "base":
        print()  # blank line between the two sides

    print(f"Количество значений param2 {side}: {len(values)}")
    # None cannot be ordered against strings, so it is reported separately.
    print(f"Значения param2 {side}: {sorted(el for el in values if el is not None)}")
    print(f"Есть ли None: {None in values}")

## Проверка цен

### Пустые значения

In [None]:
# Number of missing base prices, counted through both spellings of the check.
base_price = df_train['base_price']
print(base_price.isna().sum())
print(base_price.isnull().sum())

### Очень большие и отрицательные значения

In [None]:
def _floor_to_100k(price):
    """Truncate a positive price down to a multiple of 100 000.

    Any value for which ``price > 0`` is false is returned unchanged.
    """
    if price > 0:
        return np.trunc(price / 100_000) * 100_000
    return price


col = df_train['base_price'].apply(_floor_to_100k)

In [None]:
# List the distinct bucketed price values in ascending order.
sorted(col.unique())

In [None]:
# Peek at a few rows whose base price exceeds 900 million.
df_train[df_train['base_price'] > 900_000_000].head(3)

In [None]:
# Frequency of every bucketed price value; missing values are counted too.
col.value_counts(dropna=False)

In [None]:
# Peek at a few rows whose base price is exactly 0.
df_train[df_train['base_price'] == 0].head(3)

In [None]:
# Peek at a few rows whose base price is exactly -1.
df_train[df_train['base_price'] == -1].head(3)

**Результат:** Есть отрицательные значения и бесплатные. Часть из них действительно бесплатные (Отдам в хорошие руки).

**Предположение:** если есть дубль с -1 и другой суммой, то такие данные порченные.

### Exploring -1 prices

In [None]:
# Rows with is_double == 1, and among them the ones whose base price is -1.
is_double_mask = df_train["is_double"] == 1
df_prices = df_train[is_double_mask]

minus_one_mask = df_prices["base_price"] == -1
df_def_prices = df_prices[minus_one_mask]

In [None]:
# Among the rows selected above (is_double == 1, base_price == -1), show both
# price columns where the candidate price differs from the base price.
df_def_prices[df_def_prices.base_price != df_def_prices.cand_price][['base_price', 'cand_price']].head(5)

In [None]:
# Same selection as in the previous cell, displayed with all columns.
df_def_prices[df_def_prices.base_price != df_def_prices.cand_price].head(5)

**Результат:** для товаров с -1 можно 1. не рассматривать сумму, 2. считать сумму такой же, как и второй товар, если они в одной группе товаров

## What do duplicates look like?

In [None]:
# Keep only the rows where is_double == 1.
df_duples = df_train[df_train.is_double == 1]

In [None]:
# Show the first ten of those rows.
df_duples.head(10)

**Результат:** у некоторых дублей различаются title/description.

Подробнее о других параметрах будет ниже.

## Проверка JSON

In [None]:
# Statistics over the base-side JSON parameters, gathered in ONE pass.
# The column used to be decoded three times with `json.loads`, and the
# per-type key lists were searched linearly for every value; dict-based
# bookkeeping gives the same results without the repeated work.
dict_params = {}    # parameter key -> number of rows that contain it
type_counts = {}    # Python type of a decoded value -> number of occurrences
keys_by_type = {}   # type name -> insertion-ordered set of integer parameter keys

for params in df_train['base_json_params']:
    for key, val in json.loads(params).items():
        dict_params[key] = dict_params.get(key, 0) + 1
        type_counts[type(val)] = type_counts.get(type(val), 0) + 1
        # A dict with None values serves as an ordered set of the keys seen.
        keys_by_type.setdefault(type(val).__name__, {})[int(key)] = None

# Most frequent parameters first; the stable sort keeps ties in first-seen order.
dict_params = dict(sorted(dict_params.items(), key=lambda item: item[1], reverse=True))
print(f"Всего параметров: {len(dict_params)}")

print(f"Количество параметров каждого типа: {type_counts}")

# Final shape of `types` is unchanged: type name -> list of integer keys.
types = {tp: list(keys) for tp, keys in keys_by_type.items()}
cnt_types = {tp: len(keys) for tp, keys in types.items()}

print(f"Количество признаков каждого типа: {cnt_types}")

In [None]:
# Count in how many rows each parameter key occurs on the candidate side.
dict_params = {}

for raw_json in df_train['cand_json_params']:
    for name in json.loads(raw_json):
        dict_params[name] = dict_params.get(name, 0) + 1

# Re-order so that the most frequent parameters come first.
by_frequency = sorted(dict_params.items(), key=lambda item: item[1], reverse=True)
dict_params = dict(by_frequency)
print(f"Всего параметров: {len(dict_params)}")

In [None]:
# Largest number of parameters found in a single base-side JSON object
# (0 when the column is empty).
gp = max(
    (len(json.loads(raw_json).keys()) for raw_json in df_train.base_json_params),
    default=0,
)

In [None]:
# Report the maximum computed in the previous cell.
print(f"Максимум параметров на одного: {gp}")

## Изображения

In [None]:
# Work inside the folder that holds the downloaded train archives.
os.chdir("train")
zip_files = glob.glob('train_images_part_*-chunk_*.zip')

for zip_path in zip_files:
    print(f"Распаковка: {zip_path}")
    # The target folder is named after the archive, without the extension.
    archive_stem, _extension = os.path.splitext(os.path.basename(zip_path))
    with zipfile.ZipFile(zip_path) as archive:
        os.makedirs(archive_stem, exist_ok=True)
        archive.extractall(archive_stem)

print("✅ Все архивы распакованы.")


### Битые/невалидные

In [5]:
from PIL import Image, ImageStat
import matplotlib.pyplot as plt
import glob
from collections import defaultdict
from transformers import SiglipImageProcessor, SiglipVisionModel
import torch, requests
from PIL import Image
import torchvision.transforms as T
from concurrent.futures import ProcessPoolExecutor, as_completed
from concurrent.futures import ThreadPoolExecutor
import pathlib

In [6]:
# Use the GPU when one is available. The SigLIP vision model is loaded with
# float16 weights, moved to that device and switched to eval mode; the
# matching image processor is loaded from the same checkpoint id.
device    = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "google/siglip-base-patch16-224"
model     = SiglipVisionModel.from_pretrained(model_id,
                                              torch_dtype=torch.float16).to(device).eval()
processor = SiglipImageProcessor.from_pretrained(model_id)

In [None]:
# Collect the extracted chunk folders by name pattern.
# NOTE(review): assumes the current directory has an `unzipped` folder that
# contains the extracted folders — TODO confirm, since the extraction cell
# unpacks each archive next to the zip file instead.
os.chdir('unzipped')
photo_ref = glob.glob('train_images_part_000*-chunk_000*')

In [None]:
# Result containers, pre-populated with an empty entry for every chunk folder:
#   img_params[ref][filename] -> statistics of a readable image
#   error_size[ref]           -> file names rejected for being too small
#   error_other[ref]          -> file names that failed for any other reason
img_params = defaultdict(dict, {ref: {} for ref in photo_ref})
error_size = defaultdict(list, {ref: [] for ref in photo_ref})
error_other = defaultdict(list, {ref: [] for ref in photo_ref})

In [None]:
def process_image(ref, filename):
    """Compute grayscale statistics for the image ``ref/filename``.

    Returns a tuple whose first element is a status tag:

    * ``('success', ref, filename, stats)`` - ``stats`` holds ``stddev`` and
      ``mean`` (per-band lists from ``ImageStat``), the histogram ``entropy``,
      and the fractions of near-white (> 250) and near-black (< 5) pixels;
    * ``('error_size', ref, filename)`` - width or height is below 128 px;
    * ``('error_other', ref, filename)`` - any exception raised while opening
      or analysing the file.
    """
    try:
        with Image.open(os.path.join(ref, filename)) as source:
            gray = source.convert("L")
            gray.load()

            width, height = gray.size
            if not (width >= 128 and height >= 128):
                return ('error_size', ref, filename)

            stat = ImageStat.Stat(gray)

            # Shannon entropy of the normalised histogram; the small epsilon
            # keeps log2 finite for empty bins.
            counts = np.array(gray.histogram())
            probs = counts / counts.sum()
            entropy = -np.sum(probs * np.log2(probs + 1e-10))

            pixels = np.array(gray)
            stats = {
                "stddev": stat.stddev,
                "mean": stat.mean,
                "entropy": entropy,
                "white_ratio": np.mean(pixels > 250),
                "black_ratio": np.mean(pixels < 5),
            }
            return ('success', ref, filename, stats)
    except Exception:
        return ('error_other', ref, filename)

In [None]:
with ThreadPoolExecutor() as executor:
    # Queue one task per file of every chunk folder.
    futures = [
        executor.submit(process_image, ref, filename)
        for ref in tqdm(photo_ref)
        for filename in tqdm(os.listdir(ref))
    ]

    # File every result, as it completes, into the container matching its tag.
    for future in tqdm(as_completed(futures), total=len(futures)):
        status, ref, filename, *payload = future.result()
        if status == 'success':
            img_params[ref][filename] = payload[0]
        elif status == 'error_size':
            error_size[ref].append(filename)
        elif status == 'error_other':
            error_other[ref].append(filename)

### Сохранение результатов

In [None]:
# Go back up one directory level before writing the result files.
os.chdir("..")

In [None]:
# Write each result container to its own JSON file in the current directory.
for json_name, payload in (
    ("img_params.json", img_params),
    ("error_size.json", error_size),
    ("error_other.json", error_other),
):
    with open(json_name, "w") as fp:
        json.dump(payload, fp)

### Определение количества корректных фото

In [13]:
# Reload the saved image statistics; after this `img_params` is the plain
# dict decoded from the JSON file.
with open("img_params.json", "r") as file:
    img_params = json.load(file)

In [14]:
# Start by marking every analysed image as usable.
for folder_stats in img_params.values():
    for image_stats in folder_stats.values():
        image_stats['use'] = 1

In [15]:
# Clear the flag for images that are nearly uniform, mostly black, or
# extremely dark / bright on average.
# NOTE(review): if black_ratio and white_ratio are fractions of the same
# image's pixels, `white_ratio < 0.99` is always true once black_ratio > 0.85 -
# confirm the intended condition.
for folder_stats in img_params.values():
    for image_stats in folder_stats.values():
        if (
            image_stats['stddev'][0] < 25
            or (image_stats['black_ratio'] > 0.85 and image_stats['white_ratio'] < 0.99)
            or image_stats['mean'][0] < 20
            or image_stats['mean'][0] > 245
        ):
            image_stats['use'] = 0

In [16]:
# Total number of images that are still flagged as usable.
cnt = sum(
    image_stats['use']
    for folder_stats in img_params.values()
    for image_stats in folder_stats.values()
)

cnt

2393040

### Получение эмбеддингов изображений

In [24]:
# Enter the `train` folder; the embedding cells take the current directory as
# their base.
os.chdir("train")

In [28]:
MIN_W, MIN_H = 128, 128         # minimum accepted image width / height, in pixels
BASE_DIR = pathlib.Path.cwd()   # folder whose sub-folders are scanned for images
BATCH = 8                       # number of images sent to the model per forward pass

def safe_open(path: pathlib.Path):
    """Open an image for embedding, returning ``None`` if it is unusable.

    The file is first checked with Pillow's ``verify()`` and then reopened and
    converted to RGB. ``None`` is returned when Pillow cannot open, verify or
    decode the file (any exception), or when the image is smaller than
    ``MIN_W`` x ``MIN_H``.

    Returns:
        An in-memory RGB ``PIL.Image``, or ``None`` if validation failed.
    """

    try:
        with Image.open(path) as probe:
            probe.verify()             # quick integrity check

        # The file has to be reopened to read pixels after verify(). The
        # context manager closes the file handle, which the previous
        # `Image.open(path).convert("RGB")` one-liner left to the garbage
        # collector; the converted copy stays valid after the source closes.
        with Image.open(path) as source:
            img = source.convert("RGB")

        if img.width < MIN_W or img.height < MIN_H:
            return None
        return img
    except Exception:
        return None

In [29]:
# Embed every usable image, one chunk folder at a time, and save one
# `{image file name: embedding}` dictionary per folder next to that folder.
for folder in BASE_DIR.iterdir():
    if not folder.is_dir():
        continue

    print(f"\n▶ Processing folder: {folder.name}")

    # A folder without statistics used to raise KeyError and abort the whole
    # run; skip it explicitly instead.
    considered_path = img_params.get(folder.name)
    if considered_path is None:
        print("  No image statistics for this folder, skipping.")
        continue

    # Keep only files flagged `use == 1`. Plain dictionary lookups replace the
    # former bare `except:`, which also hid unrelated errors.
    valid_image_paths = [
        p for p in folder.rglob("*")
        if p.is_file() and considered_path.get(p.name, {}).get('use') == 1
    ]
    embeddings  = {}
    skipped     = []

    # Process the files in batches of BATCH.
    for i in tqdm(range(0, len(valid_image_paths), BATCH), desc="  batching"):
        batch = valid_image_paths[i : i + BATCH]
        images, kept = [], []

        for p in batch:
            img = safe_open(p)
            if img is None:
                skipped.append(p)
            else:
                images.append(img)
                kept.append(p)

        if not images:
            continue

        inputs = processor(images=images, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**inputs)
            vecs = out.pooler_output  # one row per image in `kept`

        for path, vec in zip(kept, vecs):
            embeddings[path.name] = vec.cpu()

    # Save this folder's dictionary.
    if embeddings:
        out_file = folder.with_name(f"{folder.name}_embeddings.pt")
        torch.save(embeddings, out_file)
        print(f"  Saved {len(embeddings)} embeddings → {out_file.name}")
    else:
        print("  No embeddings to save in this folder.")

    if skipped:
        print(f"  Skipped {len(skipped)} files (too small or corrupt)")


▶ Processing folder: train_images_part_0001-chunk_0001


  batching: 100%|████████████████████████████████████████████████████████████████| 11257/11257 [12:45<00:00, 14.71it/s]


  Saved 90055 embeddings → train_images_part_0001-chunk_0001_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0002


  batching: 100%|████████████████████████████████████████████████████████████████| 11268/11268 [12:49<00:00, 14.65it/s]


  Saved 90137 embeddings → train_images_part_0001-chunk_0002_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0003


  batching: 100%|████████████████████████████████████████████████████████████████| 11250/11250 [12:53<00:00, 14.54it/s]


  Saved 89998 embeddings → train_images_part_0001-chunk_0003_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0004


  batching: 100%|████████████████████████████████████████████████████████████████| 11269/11269 [13:00<00:00, 14.44it/s]


  Saved 90150 embeddings → train_images_part_0001-chunk_0004_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0005


  batching: 100%|████████████████████████████████████████████████████████████████| 11251/11251 [12:54<00:00, 14.52it/s]


  Saved 90005 embeddings → train_images_part_0001-chunk_0005_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0006


  batching: 100%|████████████████████████████████████████████████████████████████| 11255/11255 [13:02<00:00, 14.38it/s]


  Saved 90035 embeddings → train_images_part_0001-chunk_0006_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0007


  batching: 100%|██████████████████████████████████████████████████████████████████| 5755/5755 [06:43<00:00, 14.25it/s]


  Saved 46039 embeddings → train_images_part_0001-chunk_0007_embeddings.pt

▶ Processing folder: train_images_part_0001-chunk_0008


  batching: 100%|██████████████████████████████████████████████████████████████████| 4399/4399 [05:03<00:00, 14.50it/s]


  Saved 35192 embeddings → train_images_part_0001-chunk_0008_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0001


  batching: 100%|████████████████████████████████████████████████████████████████| 11345/11345 [12:59<00:00, 14.55it/s]


  Saved 90757 embeddings → train_images_part_0002-chunk_0001_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0002


  batching: 100%|████████████████████████████████████████████████████████████████| 11312/11312 [12:58<00:00, 14.52it/s]


  Saved 90494 embeddings → train_images_part_0002-chunk_0002_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0003


  batching: 100%|████████████████████████████████████████████████████████████████| 11342/11342 [13:00<00:00, 14.54it/s]


  Saved 90735 embeddings → train_images_part_0002-chunk_0003_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0004


  batching: 100%|████████████████████████████████████████████████████████████████| 11320/11320 [12:59<00:00, 14.52it/s]


  Saved 90560 embeddings → train_images_part_0002-chunk_0004_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0005


  batching: 100%|████████████████████████████████████████████████████████████████| 11316/11316 [13:02<00:00, 14.47it/s]


  Saved 90521 embeddings → train_images_part_0002-chunk_0005_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0006


  batching: 100%|████████████████████████████████████████████████████████████████| 11316/11316 [12:57<00:00, 14.55it/s]


  Saved 90526 embeddings → train_images_part_0002-chunk_0006_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0007


  batching: 100%|██████████████████████████████████████████████████████████████████| 6134/6134 [07:06<00:00, 14.39it/s]


  Saved 49067 embeddings → train_images_part_0002-chunk_0007_embeddings.pt

▶ Processing folder: train_images_part_0002-chunk_0008


  batching: 100%|██████████████████████████████████████████████████████████████████| 5899/5899 [06:53<00:00, 14.28it/s]


  Saved 47192 embeddings → train_images_part_0002-chunk_0008_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0001


  batching: 100%|████████████████████████████████████████████████████████████████| 11314/11314 [13:18<00:00, 14.17it/s]


  Saved 90511 embeddings → train_images_part_0003-chunk_0001_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0002


  batching: 100%|████████████████████████████████████████████████████████████████| 11308/11308 [13:09<00:00, 14.33it/s]


  Saved 90459 embeddings → train_images_part_0003-chunk_0002_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0003


  batching: 100%|████████████████████████████████████████████████████████████████| 11308/11308 [13:18<00:00, 14.16it/s]


  Saved 90457 embeddings → train_images_part_0003-chunk_0003_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0004


  batching: 100%|████████████████████████████████████████████████████████████████| 11305/11305 [13:10<00:00, 14.29it/s]


  Saved 90437 embeddings → train_images_part_0003-chunk_0004_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0005


  batching: 100%|████████████████████████████████████████████████████████████████| 11312/11312 [13:21<00:00, 14.12it/s]


  Saved 90496 embeddings → train_images_part_0003-chunk_0005_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0006


  batching: 100%|████████████████████████████████████████████████████████████████| 11317/11317 [13:30<00:00, 13.96it/s]


  Saved 90534 embeddings → train_images_part_0003-chunk_0006_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0007


  batching: 100%|██████████████████████████████████████████████████████████████████| 7408/7408 [08:46<00:00, 14.08it/s]


  Saved 59261 embeddings → train_images_part_0003-chunk_0007_embeddings.pt

▶ Processing folder: train_images_part_0003-chunk_0008


  batching: 100%|██████████████████████████████████████████████████████████████████| 6042/6042 [07:04<00:00, 14.22it/s]


  Saved 48330 embeddings → train_images_part_0003-chunk_0008_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0001


  batching: 100%|████████████████████████████████████████████████████████████████| 11248/11248 [13:14<00:00, 14.16it/s]


  Saved 89980 embeddings → train_images_part_0004-chunk_0001_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0002


  batching: 100%|████████████████████████████████████████████████████████████████| 11269/11269 [13:13<00:00, 14.21it/s]


  Saved 90146 embeddings → train_images_part_0004-chunk_0002_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0003


  batching: 100%|████████████████████████████████████████████████████████████████| 11258/11258 [13:07<00:00, 14.30it/s]


  Saved 90063 embeddings → train_images_part_0004-chunk_0003_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0004


  batching: 100%|████████████████████████████████████████████████████████████████| 11256/11256 [13:09<00:00, 14.26it/s]


  Saved 90042 embeddings → train_images_part_0004-chunk_0004_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0005


  batching: 100%|██████████████████████████████████████████████████████████████████| 9060/9060 [10:34<00:00, 14.28it/s]


  Saved 72478 embeddings → train_images_part_0004-chunk_0005_embeddings.pt

▶ Processing folder: train_images_part_0004-chunk_0006


  batching: 100%|██████████████████████████████████████████████████████████████████| 6048/6048 [07:00<00:00, 14.39it/s]


  Saved 48383 embeddings → train_images_part_0004-chunk_0006_embeddings.pt


In [31]:
# Merge the per-folder embedding files found in BASE_DIR into one dictionary.
# The files are visited in sorted name order; on a key collision the entry
# from the later file wins (dict.update).
merged = {}

for f in tqdm(sorted(BASE_DIR.glob("*_embeddings.pt"))):
    # torch.load unpickles the file; these files are written by the embedding
    # cell of this notebook.
    chunk = torch.load(f, map_location="cpu")   # {image file name: tensor}
    merged.update(chunk)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [02:02<00:00,  4.08s/it]


In [32]:
# Go back up one directory level before saving the merged dictionary.
os.chdir("..")

In [33]:
# Save the merged dictionary to the current working directory.
torch.save(merged, "train_images_embeddings_merged.pt")

### Mapping id -> jpg

In [40]:
# Two-column views (item id, title image) for the base and candidate sides.
df_base = df_train[['base_item_id', 'base_title_image']]
df_cand = df_train[['cand_item_id', 'cand_title_image']]

# Filled by the next cell: item id -> title image value.
base_ids = {}

In [41]:
# Fill `base_ids` (item id -> title image). The first occurrence of an id
# wins, and base rows are visited before candidate rows, as before.
# Zipping the two columns avoids `iterrows`, which builds a Series for every
# one of the ~1.9M rows, and `setdefault` replaces the manual membership test.
for item_id, title_image in tqdm(
    zip(df_base['base_item_id'], df_base['base_title_image']),
    total=len(df_base),
):
    base_ids.setdefault(item_id, title_image)

for item_id, title_image in tqdm(
    zip(df_cand['cand_item_id'], df_cand['cand_title_image']),
    total=len(df_cand),
):
    base_ids.setdefault(item_id, title_image)

100%|█████████████████████████████████████████████████████████████████████| 1879555/1879555 [00:29<00:00, 63302.29it/s]
100%|█████████████████████████████████████████████████████████████████████| 1879555/1879555 [00:29<00:00, 63078.08it/s]


In [47]:
# Map every item id to the embedding of its title image.
#
# Items without a stored embedding get a zero vector. The fallback copies the
# dtype and shape of an existing embedding: the model in this notebook is
# loaded in float16, so the former `torch.zeros(768)` (float32) mixed two
# dtypes in one dictionary. 768 is used only when `merged` is empty.
_sample_emb = next(iter(merged.values()), None)
_zero_emb = torch.zeros_like(_sample_emb) if _sample_emb is not None else torch.zeros(768)

items_visual_emb = {}

for item_id, image_name in base_ids.items():
    # The f-string also copes with a missing (None / NaN) image name: it
    # produces a key that is not in `merged`, where `+` would raise.
    img_format = f"{image_name}.jpg"
    if img_format in merged:
        items_visual_emb[item_id] = merged[img_format]
    else:
        # Separate tensor per item, so entries never alias each other.
        items_visual_emb[item_id] = _zero_emb.clone()

In [54]:
# Save the per-item embedding dictionary.
# NOTE(review): this is the same file name the merged per-image dictionary
# was saved under earlier in this notebook, so that file is replaced -
# confirm the overwrite is intended.
torch.save(items_visual_emb, "train_images_embeddings_merged.pt")

## Количество картинок + регион

In [8]:
# Distribution of `cand_count_images`, with missing values counted as well.
df_train['cand_count_images'].value_counts(dropna=False)

cand_count_images
1.0     368246
3.0     230598
2.0     223049
10.0    214378
4.0     205678
5.0     184637
6.0     137008
7.0     101934
8.0      93254
9.0      88432
NaN      32333
11.0         6
12.0         2
Name: count, dtype: int64

In [10]:
# Distribution of `base_count_images`, with missing values counted as well.
df_train['base_count_images'].value_counts(dropna=False)

base_count_images
1.0     363685
3.0     228002
10.0    219698
2.0     218011
4.0     204213
5.0     183617
6.0     138421
7.0     105146
8.0      95362
9.0      90795
NaN      32598
11.0         7
Name: count, dtype: int64

In [11]:
# Distribution of the `is_same_location` flag, with missing values counted as well.
df_train['is_same_location'].value_counts(dropna=False)

is_same_location
True     1841435
False      38120
Name: count, dtype: int64

In [12]:
# Distribution of the `is_same_region` flag, with missing values counted as well.
df_train['is_same_region'].value_counts(dropna=False)

is_same_region
True     1854062
False      25493
Name: count, dtype: int64