In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.decomposition import PCA
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import train_test_split

from src.data_extraction_pipeline import deduplicate_embeddings_efficient
from src.data_preparation_utils import (
    get_files,
    load_parts,
    create_df,
    process_and_save_batches,
    get_remove_indices_per_file,
)

# Loading embeddings and labels

In [2]:
embeddings_files, embeddings_stats_df = get_files("embeddings")
labels_files, labels_stats_df = get_files("labels")
images_files, images_stats_df = get_files("images")

In [3]:
labels = load_parts(labels_files, "labels", labels_stats_df)
embeddings = load_parts(embeddings_files, "embeddings", embeddings_stats_df)

In [4]:
initial_rows = labels.shape[0]

pd.concat([labels_stats_df, embeddings_stats_df], ignore_index=True)

Unnamed: 0,file_path,file_name,size,part,rows,cols
0,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_0_labels.npz,2.1 kB,labels,11375,1
1,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_10_labels.npz,2.1 kB,labels,11272,1
2,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_11_labels.npz,2.1 kB,labels,11218,1
3,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_12_labels.npz,2.1 kB,labels,11355,1
4,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_13_labels.npz,2.1 kB,labels,11340,1
5,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_14_labels.npz,2.1 kB,labels,11186,1
6,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_1_labels.npz,2.1 kB,labels,11359,1
7,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_2_labels.npz,2.1 kB,labels,11300,1
8,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_3_labels.npz,2.1 kB,labels,11301,1
9,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_4_labels.npz,2.1 kB,labels,11333,1


In [5]:
images_stats_df[["file_path", "file_name", "size"]]

Unnamed: 0,file_path,file_name,size
0,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_0_images.npz,1.6 GB
1,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_10_images.npz,1.6 GB
2,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_11_images.npz,1.6 GB
3,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_12_images.npz,1.6 GB
4,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_13_images.npz,1.6 GB
5,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_14_images.npz,1.6 GB
6,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_1_images.npz,1.6 GB
7,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_2_images.npz,1.6 GB
8,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_3_images.npz,1.6 GB
9,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,initial_set_part_4_images.npz,1.6 GB


# Deduplication

In [6]:
unique_indices = deduplicate_embeddings_efficient(embeddings)
deduplicated_rows = len(unique_indices)

removed = initial_rows - deduplicated_rows
reduction = removed / initial_rows
print(
    f"Initial rows: {initial_rows}, Deduplicated rows: {deduplicated_rows} "
    f"({reduction:.2%} reduction, {removed} rows removed)"
)

Processing batches: 100%|██████████| 13/13 [06:23<00:00, 29.49s/batch]

Initial rows: 169442, Deduplicated rows: 169296 (0.09% reduction, 146 rows removed)





In [7]:
deduplicated_mask = np.zeros(initial_rows, dtype=bool)
deduplicated_mask[unique_indices] = True

deduplicated_embeddings = embeddings[deduplicated_mask]
deduplicated_labels = labels[deduplicated_mask]

del embeddings
gc.collect()

49

# Tackling imbalanced distribution

Tackle imbalanced distribution problem by undersampling majority class based on information stored in their embeddings.

In [8]:
distribution = np.unique(deduplicated_labels, return_counts=True)[1].astype(np.uint32)

print(f"Distribution of labels in the deduplicated dataset: {distribution}")

Distribution of labels in the deduplicated dataset: [ 55893 113259]


In [9]:
pca = PCA(n_components=0.9, random_state=42)
pca_embeddings = pca.fit_transform(deduplicated_embeddings)

n_components = pca_embeddings.shape[1]
original_components = deduplicated_embeddings.shape[1]
variance_explained = pca.explained_variance_ratio_.sum()
reduction = (original_components - n_components) / original_components
print(
    f"Reduced embeddings size: {n_components} components "
    f"({variance_explained:.2%} variance explained, original size: {original_components} - {reduction:.2%} reduction)"
)

Reduced embeddings size: 951 components (90.01% variance explained, original size: 2048 - 53.56% reduction)


In [10]:
enn = EditedNearestNeighbours(n_neighbors=15, n_jobs=-1)

_ = enn.fit_resample(pca_embeddings, deduplicated_labels)

_labels = deduplicated_labels[enn.sample_indices_]
final_distribution = np.unique(_labels, return_counts=True)[1]
print(f"Distribution of labels in the final dataset: {final_distribution}")

Distribution of labels in the final dataset: [55893 77718]


In [11]:
mask_enn = np.zeros(len(deduplicated_labels), dtype=bool)
mask_enn[enn.sample_indices_] = True

num_before = len(deduplicated_labels)
num_after = np.sum(mask_enn)
removed = num_before - num_after
print(
    f"Rows before resampling: {num_before}, after: {num_after} "
    f"({removed / num_before:.2%} reduction, {removed} rows removed)"
)

Rows before resampling: 169152, after: 133611 (21.01% reduction, 35541 rows removed)


# Remove entries in final data, split into sets and save in optimized format

In [12]:
final_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_mask[deduplicated_mask]
_tmp[mask_enn] = True
final_mask[deduplicated_mask] = _tmp

removed = initial_rows - np.sum(final_mask)
print(
    f"Initial number of rows: {initial_rows}, after: {np.sum(final_mask)} "
    f"({removed / num_before:.2%} reduction, {removed} rows removed)"
)

Initial number of rows: 169442, after: 133611 (21.18% reduction, 35831 rows removed)


In [13]:
final_labels = labels[final_mask]

indices = np.arange(len(final_labels))
train_idx, temp_idx, train_labels, temp_labels = train_test_split(
    indices, final_labels, test_size=0.2, stratify=final_labels, random_state=42
)
val_idx, test_idx, val_labels, test_labels = train_test_split(
    temp_idx, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

train_mask = np.zeros(len(final_labels), dtype=bool)
train_mask[train_idx] = True
val_mask = np.zeros(len(final_labels), dtype=bool)
val_mask[val_idx] = True
test_mask = np.zeros(len(final_labels), dtype=bool)
test_mask[test_idx] = True

final_train_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_train_mask[final_mask]
_tmp[train_mask] = True
final_train_mask[final_mask] = _tmp
final_val_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_val_mask[final_mask]
_tmp[val_mask] = True
final_val_mask[final_mask] = _tmp
final_test_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_test_mask[final_mask]
_tmp[test_mask] = True
final_test_mask[final_mask] = _tmp

print(
    f"Final train set size: {np.sum(final_train_mask)},\n"
    f"validation set size: {np.sum(final_val_mask)},\n"
    f"test set size: {np.sum(final_test_mask)}\n"
)

Final train set size: 106888,
validation set size: 13361,
test set size: 13362



In [14]:
final_indices_to_remove_per_file = get_remove_indices_per_file(
    final_mask, labels_stats_df
)
duplicates_indices_to_remove_per_file = get_remove_indices_per_file(
    deduplicated_mask, labels_stats_df
)
undersampled_indices_to_remove_per_file = get_remove_indices_per_file(
    mask_enn, labels_stats_df
)
training_indices_to_remove_per_file = get_remove_indices_per_file(
    train_mask, labels_stats_df
)
validation_indices_to_remove_per_file = get_remove_indices_per_file(
    val_mask, labels_stats_df
)
test_indices_to_remove_per_file = get_remove_indices_per_file(
    test_mask, labels_stats_df
)

final_stats_df = create_df(final_indices_to_remove_per_file)
final_stats_df["deduplicated"] = create_df(duplicates_indices_to_remove_per_file)[
    "total_removed"
]
final_stats_df["undersampled"] = create_df(undersampled_indices_to_remove_per_file)[
    "total_removed"
]
final_stats_df["training"] = (
    initial_rows - create_df(training_indices_to_remove_per_file)["total_removed"]
)
final_stats_df["validation"] = (
    initial_rows - create_df(validation_indices_to_remove_per_file)["total_removed"]
)
final_stats_df["test"] = (
    initial_rows - create_df(test_indices_to_remove_per_file)["total_removed"]
)
final_stats_df

Unnamed: 0,file_path,total_removed,deduplicated,undersampled,training,validation,test
0,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2410,0,2410,167154,159216,159208
1,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2396,1,2395,167197,159330,159255
2,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2335,5,2331,167172,159410,159308
3,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2353,11,2342,167176,159202,159238
4,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2427,8,2420,167136,159260,159250
5,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2284,6,2282,167196,159395,159363
6,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2391,12,2382,167203,159180,159225
7,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2439,14,2425,167153,159237,159336
8,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2308,9,2302,167201,159255,159268
9,c:\Users\pawpo\Desktop\Indoor-Outdoor\data_pre...,2445,14,2431,167200,159239,159221


# Save 

Finally only 80k for training, 10k for validation and 10k for test will be saved.

In [19]:
process_and_save_batches(
    training_indices_to_remove_per_file,
    validation_indices_to_remove_per_file,
    test_indices_to_remove_per_file,
    labels_files,
    images_files,
)

Processing files:   0%|          | 0/15 [00:00<?, ?file/s]

23:00:37 - src.data_preparation_utils - INFO - Saved batch 0 for train.
23:00:45 - src.data_preparation_utils - INFO - Saved batch 1 for train.
23:00:54 - src.data_preparation_utils - INFO - Saved batch 2 for train.
23:01:02 - src.data_preparation_utils - INFO - Saved batch 3 for train.
23:01:11 - src.data_preparation_utils - INFO - Saved batch 4 for train.
23:01:19 - src.data_preparation_utils - INFO - Saved batch 5 for train.
23:01:28 - src.data_preparation_utils - INFO - Saved batch 6 for train.
23:01:36 - src.data_preparation_utils - INFO - Saved batch 7 for train.
23:01:44 - src.data_preparation_utils - INFO - Saved batch 8 for train.
23:01:53 - src.data_preparation_utils - INFO - Saved batch 0 for val.
23:02:01 - src.data_preparation_utils - INFO - Saved batch 0 for test.
23:02:34 - src.data_preparation_utils - INFO - Saved batch 9 for train.
23:02:43 - src.data_preparation_utils - INFO - Saved batch 10 for train.
23:02:51 - src.data_preparation_utils - INFO - Saved batch 11 for 