In [None]:
import numpy as np
import pandas as pd
import gc

from sklearn.decomposition import PCA
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import train_test_split

from src.data_preparation_pipeline import deduplicate_embeddings_efficient
from src.data_preparation_utils import (
    get_files,
    load_parts,
    create_df,
    process_and_save_batches,
    get_remove_indices_per_file,
)

# Loading embeddings and labels

In [2]:
embeddings_files, embeddings_stats_df = get_files("embeddings")
labels_files, labels_stats_df = get_files("labels")
images_files, images_stats_df = get_files("images")

In [3]:
labels = load_parts(labels_files, "labels", labels_stats_df)
embeddings = load_parts(embeddings_files, "embeddings", embeddings_stats_df)

In [4]:
initial_rows = labels.shape[0]

pd.concat([labels_stats_df, embeddings_stats_df], ignore_index=True)

Unnamed: 0,file_path,file_name,size,part,rows,cols
0,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_0_labels.npz,2.7 kB,labels,15098,1
1,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_10_labels.npz,2.7 kB,labels,14989,1
2,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_11_labels.npz,2.7 kB,labels,15075,1
3,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_12_labels.npz,2.7 kB,labels,15082,1
4,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_13_labels.npz,2.7 kB,labels,15095,1
5,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_14_labels.npz,2.7 kB,labels,15006,1
6,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_15_labels.npz,2.7 kB,labels,15041,1
7,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_16_labels.npz,2.7 kB,labels,15014,1
8,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_17_labels.npz,2.7 kB,labels,15036,1
9,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_18_labels.npz,2.7 kB,labels,15168,1


In [5]:
images_stats_df[["file_path", "file_name", "size"]]

Unnamed: 0,file_path,file_name,size
0,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_0_images.npz,2.0 GB
1,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_10_images.npz,2.0 GB
2,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_11_images.npz,2.0 GB
3,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_12_images.npz,2.0 GB
4,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_13_images.npz,2.0 GB
5,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_14_images.npz,2.0 GB
6,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_15_images.npz,2.0 GB
7,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_16_images.npz,2.0 GB
8,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_17_images.npz,2.0 GB
9,/Users/pawelp/Desktop/education/pw/deep/data_p...,initial_set_part_18_images.npz,2.0 GB


# Deduplication

In [6]:
unique_indices = deduplicate_embeddings_efficient(embeddings)
deduplicated_rows = len(unique_indices)

removed = initial_rows - deduplicated_rows
reduction = removed / initial_rows
print(
    f"Initial rows: {initial_rows}, Deduplicated rows: {deduplicated_rows} "
    f"({reduction:.2%} reduction, {removed} rows removed)"
)

Processing batches: 100%|██████████| 26/26 [21:37<00:00, 49.91s/batch]

Initial rows: 376842, Deduplicated rows: 374032 (0.75% reduction, 2810 rows removed)





In [7]:
deduplicated_mask = np.zeros(initial_rows, dtype=bool)
deduplicated_mask[unique_indices] = True

deduplicated_embeddings = embeddings[deduplicated_mask]
deduplicated_labels = labels[deduplicated_mask]

del embeddings
gc.collect()

20

# Tackling imbalanced distribution

Tackle imbalanced distribution problem by undersampling majority class based on information stored in their embeddings.

In [8]:
distribution = np.unique(deduplicated_labels, return_counts=True)[1].astype(np.uint32)

print(f"Distribution of labels in the deduplicated dataset: {distribution}")

Distribution of labels in the deduplicated dataset: [137549 233855]


In [9]:
pca = PCA(n_components=0.9, random_state=42)
pca_embeddings = pca.fit_transform(deduplicated_embeddings)

n_components = pca_embeddings.shape[1]
original_components = deduplicated_embeddings.shape[1]
variance_explained = pca.explained_variance_ratio_.sum()
reduction = (original_components - n_components) / original_components
print(
    f"Reduced embeddings size: {n_components} components "
    f"({variance_explained:.2%} variance explained, original size: {original_components} - {reduction:.2%} reduction)"
)

Reduced embeddings size: 940 components (90.00% variance explained, original size: 2048 - 54.10% reduction)


In [10]:
enn = EditedNearestNeighbours(n_neighbors=15, n_jobs=-1)

_ = enn.fit_resample(pca_embeddings, deduplicated_labels)

_labels = deduplicated_labels[enn.sample_indices_]
final_distribution = np.unique(_labels, return_counts=True)[1]
print(f"Distribution of labels in the final dataset: {final_distribution}")

Distribution of labels in the final dataset: [137549 156743]


In [11]:
mask_enn = np.zeros(len(deduplicated_labels), dtype=bool)
mask_enn[enn.sample_indices_] = True

num_before = len(deduplicated_labels)
num_after = np.sum(mask_enn)
removed = num_before - num_after
print(
    f"Rows before resampling: {num_before}, after: {num_after} "
    f"({removed / num_before:.2%} reduction, {removed} rows removed)"
)

Rows before resampling: 371404, after: 294292 (20.76% reduction, 77112 rows removed)


# Remove entries in final data, split into sets and save in optimized format

In [12]:
final_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_mask[deduplicated_mask]
_tmp[mask_enn] = True
final_mask[deduplicated_mask] = _tmp

removed = initial_rows - np.sum(final_mask)
print(
    f"Initial number of rows: {initial_rows}, after: {np.sum(final_mask)} "
    f"({removed / num_before:.2%} reduction, {removed} rows removed)"
)

Initial number of rows: 376842, after: 294292 (22.23% reduction, 82550 rows removed)


In [13]:
final_labels = labels[final_mask]

indices = np.arange(len(final_labels))
train_idx, temp_idx, train_labels, temp_labels = train_test_split(
    indices, final_labels, test_size=0.2, stratify=final_labels, random_state=42
)
val_idx, test_idx, val_labels, test_labels = train_test_split(
    temp_idx, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

train_mask = np.zeros(len(final_labels), dtype=bool)
train_mask[train_idx] = True
val_mask = np.zeros(len(final_labels), dtype=bool)
val_mask[val_idx] = True
test_mask = np.zeros(len(final_labels), dtype=bool)
test_mask[test_idx] = True

final_train_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_train_mask[final_mask]
_tmp[train_mask] = True
final_train_mask[final_mask] = _tmp
final_val_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_val_mask[final_mask]
_tmp[val_mask] = True
final_val_mask[final_mask] = _tmp
final_test_mask = np.zeros(initial_rows, dtype=bool)
_tmp = final_test_mask[final_mask]
_tmp[test_mask] = True
final_test_mask[final_mask] = _tmp

print(
    f"Final train set size: {np.sum(final_train_mask)},\n"
    f"validation set size: {np.sum(final_val_mask)},\n"
    f"test set size: {np.sum(final_test_mask)}\n"
)

Final train set size: 235433,
validation set size: 29429,
test set size: 29430



In [14]:
final_indices_to_remove_per_file = get_remove_indices_per_file(
    final_mask, labels_stats_df
)
duplicates_indices_to_remove_per_file = get_remove_indices_per_file(
    deduplicated_mask, labels_stats_df
)
undersampled_indices_to_remove_per_file = get_remove_indices_per_file(
    mask_enn, labels_stats_df
)
training_indices_to_remove_per_file = get_remove_indices_per_file(
    train_mask, labels_stats_df
)
validation_indices_to_remove_per_file = get_remove_indices_per_file(
    val_mask, labels_stats_df
)
test_indices_to_remove_per_file = get_remove_indices_per_file(
    test_mask, labels_stats_df
)

final_stats_df = create_df(final_indices_to_remove_per_file)
final_stats_df["deduplicated"] = create_df(duplicates_indices_to_remove_per_file)[
    "total_removed"
]
final_stats_df["undersampled"] = create_df(undersampled_indices_to_remove_per_file)[
    "total_removed"
]
final_stats_df["training"] = (
    initial_rows - create_df(training_indices_to_remove_per_file)["total_removed"]
)
final_stats_df["validation"] = (
    initial_rows - create_df(validation_indices_to_remove_per_file)["total_removed"]
)
final_stats_df["test"] = (
    initial_rows - create_df(test_indices_to_remove_per_file)["total_removed"]
)
final_stats_df

Unnamed: 0,file_path,total_removed,deduplicated,undersampled,training,validation,test
0,/Users/pawelp/Desktop/education/pw/deep/data_p...,3170,0,3170,373783,363240,363309
1,/Users/pawelp/Desktop/education/pw/deep/data_p...,3130,23,3109,373779,363340,363429
2,/Users/pawelp/Desktop/education/pw/deep/data_p...,3137,37,3115,373788,363337,363251
3,/Users/pawelp/Desktop/education/pw/deep/data_p...,3153,56,3098,373767,363341,363254
4,/Users/pawelp/Desktop/education/pw/deep/data_p...,3229,58,3186,373936,363201,363199
5,/Users/pawelp/Desktop/education/pw/deep/data_p...,3127,69,3079,373980,363251,363283
6,/Users/pawelp/Desktop/education/pw/deep/data_p...,3140,86,3073,373823,363338,363283
7,/Users/pawelp/Desktop/education/pw/deep/data_p...,3103,91,3020,373860,363329,363309
8,/Users/pawelp/Desktop/education/pw/deep/data_p...,3224,96,3158,373838,363325,363291
9,/Users/pawelp/Desktop/education/pw/deep/data_p...,3230,104,3141,373909,363134,363147


In [None]:
process_and_save_batches(
    training_indices_to_remove_per_file,
    validation_indices_to_remove_per_file,
    test_indices_to_remove_per_file,
    labels_files,
    images_files,
)

Processing files:   0%|          | 0/25 [00:00<?, ?file/s]