In [None]:
import pandas as pd
from collections import Counter, defaultdict

## Take need images

In [13]:
TARGET_CLASSES = [
    "Mountain",
    "Forest",
    "Goose",
    "Airplane",
    "Person",
    "Cat",
    "Dog",
    "Water",
    "Car",
    "Building",
    "Snow",
    "Bird",
    "Road",
    "Train",
    "Fire",
    "Frog",
    "Bridge",
    "Cloud",
    "Flower",
    "Boat",
    "Grass",
    "Fish",
    "Chair", 
    "Table", 
    "Book", 
    "Computer", 
    "Food",
    "Sky",
    "Horse",
    "Window",
    "Tree",
    "Sun",
    "Window"
]


In [None]:
classes = pd.read_csv(
    "data/csv_tables/oidv7-class-descriptions.csv",
    header=None,
    names=["LabelName", "ClassName"]
)

target_labels = classes[
    classes["ClassName"].isin(TARGET_CLASSES)
]["LabelName"].tolist()

print("Используемые LabelName:")
print(target_labels)


Используемые LabelName:
['/m/05czz6l', '/m/015p6', '/m/019jd', '/m/0bt_c3', '/m/0cgh4', '/m/0k4j', '/m/01yrx', '/m/01mzpv', '/m/0csby', '/m/01m3v', '/m/0bt9lr', '/m/02_41', '/m/0ch_cf', '/m/0c9ph5', '/m/02wbm', '/m/02zr8', '/m/09ld4', '/m/0dbvp', '/m/08t9c_', '/m/03k3r', '/m/09d_r', '/m/01g317', '/m/06gfj', '/m/01bqvp', '/m/06_dn', '/m/06m_p', '/m/04bcr3', '/m/07jdr', '/m/07j7r', '/m/0838f', '/m/0d4v4']


In [None]:
ann = pd.read_csv("data/csv_tables/oidv7-train-annotations-human-imagelabels.csv")

filtered = ann[ann["LabelName"].isin(target_labels)]
MAX_PER_CLASS = 3200

count_classes = Counter()
image_ids = set()

for label in target_labels:
    ids = (
        filtered[filtered["LabelName"] == label]
        ["ImageID"]
        .unique()
    )
    count_classes[TARGET_CLASSES[target_labels.index(label)]] += ids[:MAX_PER_CLASS].shape[0]
    image_ids.update(ids[:MAX_PER_CLASS])

print("Итого изображений:", len(image_ids))


Итого изображений: 40471


In [18]:
count_classes

Counter({'Forest': 3200,
         'Goose': 3200,
         'Airplane': 3200,
         'Person': 3200,
         'Cat': 3200,
         'Dog': 3200,
         'Water': 3200,
         'Car': 3200,
         'Building': 3200,
         'Snow': 3200,
         'Bird': 3200,
         'Road': 3200,
         'Train': 3200,
         'Fire': 3200,
         'Frog': 3200,
         'Cloud': 3200,
         'Boat': 3200,
         'Grass': 3200,
         'Fish': 3200,
         'Chair': 3200,
         'Table': 3200,
         'Book': 3200,
         'Food': 3200,
         'Sky': 3200,
         'Horse': 3200,
         'Window': 3200,
         'Tree': 3200,
         'Bridge': 3084,
         'Computer': 2416,
         'Mountain': 2383,
         'Flower': 1864})

In [None]:
with open("data/image_list_test.txt", "w") as f:
    for image_id in image_ids:
        f.write(f"test/{image_id}\n")

## Script for load images

python download_images.py data/image_list_train.txt --download_folder data/images/train --num_processes 8

python download_images.py data/image_list_val.txt --download_folder data/images/val --num_processes 8

python download_images.py data/image_list_test.txt --download_folder data/images/test --num_processes 8


## Save labels

In [None]:
import os

IMAGE_DIR = "data/images/test"

downloaded_ids = set(
    os.path.splitext(f)[0]
    for f in os.listdir(IMAGE_DIR)
    if f.endswith(".jpg")
)

print("Скачано изображений:", len(downloaded_ids))


Скачано изображений: 5319


In [None]:
import pandas as pd

ann = pd.read_csv("data/csv_tables/oidv7-test-annotations-human-imagelabels.csv")

ann = ann[ann["ImageID"].isin(downloaded_ids)]

ann = ann[ann["LabelName"].isin(target_labels)]

ann = ann[ann["Confidence"] == 1]


In [48]:
label_map = dict(
    classes[["LabelName", "ClassName"]].values
)


In [49]:

image_to_labels = defaultdict(set)

for _, row in ann.iterrows():
    image_to_labels[row["ImageID"]].add(
        label_map[row["LabelName"]]
    )


In [50]:
class_counter = Counter()

for labels in image_to_labels.values():
    for label in labels:
        class_counter[label] += 1

class_counter

Counter({'Person': 1082,
         'Water': 689,
         'Car': 624,
         'Building': 579,
         'Road': 500,
         'Bird': 451,
         'Dog': 433,
         'Airplane': 401,
         'Mountain': 388,
         'Snow': 359,
         'Forest': 358,
         'Cat': 343,
         'Fire': 309,
         'Train': 253,
         'Goose': 155,
         'Frog': 116})

In [51]:
from collections import Counter

label_counts = Counter(len(v) for v in image_to_labels.values())
label_counts


Counter({1: 2944, 2: 1479, 3: 330, 4: 37})

In [52]:
labels_df = pd.DataFrame(
    0,
    index=sorted(downloaded_ids),
    columns=TARGET_CLASSES,
    dtype="int8"
)


In [53]:
for _, row in ann.iterrows():
    class_name = label_map[row["LabelName"]]
    labels_df.loc[row["ImageID"], class_name] = 1


In [54]:
empty = (labels_df.sum(axis=1) == 0).sum()
print("Изображений без меток:", empty)


Изображений без меток: 529


In [55]:
labels_df_ne = labels_df[labels_df.sum(axis=1) != 0]

In [56]:
labels_df_ne.sum().sort_values(ascending=False)


Person      1082
Water        689
Car          624
Building     579
Road         500
Bird         451
Dog          433
Airplane     401
Mountain     388
Snow         359
Forest       358
Cat          343
Fire         309
Train        253
Goose        155
Frog         116
dtype: int64

In [57]:
labels_df_ne.shape

(4790, 16)

In [None]:
labels_df_ne.reset_index(inplace=True)
labels_df_ne.rename(columns={"index": "ImageID"}, inplace=True)

labels_df_ne.to_csv("data/labels.csv", index=False)


In [59]:
import numpy as np

X_ids = labels_df_ne["ImageID"].values
Y = labels_df[TARGET_CLASSES].values.astype("float32")

np.save("image_ids.npy", X_ids)
np.save("labels.npy", Y)
