In [1]:
import pandas as pd
import random

In [2]:
# Read the raw train/test CSVs; the train split's 'group' column is discarded right away.
training_dataset_train = pd.read_csv(
    'original_datasets/train.csv', low_memory=False
).drop(columns=['group'])
training_dataset_test = pd.read_csv(
    'original_datasets/test.csv', low_memory=False
)

In [3]:
# Show the test-split row(s) recorded for image '9397815.jpg'.
remove_rows = training_dataset_test.loc[training_dataset_test['name'].eq('9397815.jpg')]
display(remove_rows)

Unnamed: 0,name,class
51244,9397815.jpg,8969


In [4]:
# Keep every test-split row except the one(s) for '9397815.jpg'.
# NOTE(review): the reason this image is excluded is not recorded in the notebook — confirm.
training_dataset_test = training_dataset_test.loc[training_dataset_test['name'].ne('9397815.jpg')]

In [5]:
# Stack both splits into one pool of labelled images; original row labels are kept.
train_df = pd.concat((training_dataset_train, training_dataset_test), axis=0)
display(train_df.head(n=5))
print(train_df.shape)
# Number of rows whose image name already appeared in an earlier row.
print(train_df.duplicated(subset='name').sum())

Unnamed: 0,name,class
0,1.jpg,0
1,2.jpg,0
2,3.jpg,0
3,4.jpg,0
4,5.jpg,0


(197306, 2)
0


In [None]:
import itertools
import math

# Build the "similar" pairs (label = 1): for each class, draw up to
# `number_of_images_per_class_to_use` images and pair them with each other
# (unordered pairs, self-pairs included).
dataset_columns = ['img1', 'img2', 'label', 'class']
classes = train_df['class'].unique()
number_of_images_per_class_to_use = 10
# n images give n * (n + 1) / 2 unordered pairs once self-pairs are counted.
tot_combs = number_of_images_per_class_to_use * (number_of_images_per_class_to_use + 1) // 2
use_over_sampling = False

# NOTE(review): no random seed is set in the cells shown here, so the sampled
# pairs differ from run to run.
similar_rows = []
for class_ in classes:
    image_names = train_df.loc[train_df['class'] == class_, 'name'].tolist()
    # Classes with fewer images than the quota contribute every image they have.
    sample_size = min(len(image_names), number_of_images_per_class_to_use)
    image_sample = random.sample(image_names, sample_size)
    combs_with_replacement = list(itertools.combinations_with_replacement(image_sample, 2))
    if use_over_sampling and sample_size < number_of_images_per_class_to_use:
        # Top small classes up to `tot_combs` pairs by re-drawing from their existing pairs.
        number_of_combinations_needed = tot_combs - len(combs_with_replacement)
        combs_with_replacement += random.choices(combs_with_replacement, k=number_of_combinations_needed)
    similar_rows.extend((img1, img2, 1, class_) for img1, img2 in combs_with_replacement)

# Build the frame once: concatenating inside the loop would re-copy every
# accumulated row on each iteration. Rows get a plain 0..N-1 index.
dataset = pd.DataFrame(similar_rows, columns=dataset_columns)
print(dataset.shape)
display(dataset.head())

In [None]:
# Build the "non-similar" pairs (label = 0, class = -1): for each class, pick a
# few pivot images and pair each pivot with randomly drawn images of other classes.
filtered_train_df = train_df.copy()
number_of_pivot_images = 3
number_of_non_class_images_per_pivot = 100

# NOTE(review): no random seed is set in the cells shown here, so the sampled
# pairs differ from run to run.
non_similar_rows = []
for class_ in classes:
    is_class = filtered_train_df['class'] == class_
    image_names_of_class = filtered_train_df.loc[is_class, 'name'].tolist()
    image_names_not_of_class = filtered_train_df.loc[~is_class, 'name'].tolist()
    # Clamp the quotas for THIS class only. Overwriting the two settings above
    # would let a single small class shrink the quota of every later class.
    pivots_for_class = min(number_of_pivot_images, len(image_names_of_class))
    negatives_per_pivot = min(number_of_non_class_images_per_pivot, len(image_names_not_of_class))
    pivot_images = random.sample(image_names_of_class, pivots_for_class)
    for pivot_image in pivot_images:
        for non_class_image in random.sample(image_names_not_of_class, negatives_per_pivot):
            non_similar_rows.append((pivot_image, non_class_image, 0, -1))
    # Used pivots leave the pool, so later classes cannot draw them as the second image.
    # Removing them in one pass is safe: the candidate list for this class was
    # captured before the pivot loop and never contains same-class images.
    filtered_train_df = filtered_train_df[~filtered_train_df['name'].isin(pivot_images)]

# Build the frame once: concatenating inside the loop would re-copy every
# accumulated row on each iteration. Rows get a plain 0..N-1 index.
_dataset = pd.DataFrame(non_similar_rows, columns=dataset_columns)
print(_dataset.shape)
display(_dataset.head())

In [None]:
# Stack the similar pairs on top of the non-similar pairs (row labels are not renumbered).
final_train_and_validation_dataset = pd.concat((dataset, _dataset), axis=0)
print(final_train_and_validation_dataset.shape)
display(final_train_and_validation_dataset.head(n=5))

In [None]:
# Persist the full pair table. The row index is written as the first (unnamed) CSV
# column, which is what shows up as 'Unnamed: 0' when the file is read back.
final_train_and_validation_dataset.to_csv(
    'generated_datasets/final_train_and_validation_dataset.csv',
    index=True,
)

In [None]:
# similar_dataset = dataset.copy()
# non_similar_dataset = _dataset.copy()

In [8]:
# Reload the generated pair table and discard the index column that was saved with it.
final_train_and_validation_dataset = pd.read_csv(
    'generated_datasets/final_train_and_validation_dataset.csv', low_memory=False
).drop(columns=['Unnamed: 0'])

# Rows with class == -1 are the non-similar pairs; every other row is a similar pair.
similar_dataset = final_train_and_validation_dataset.loc[
    final_train_and_validation_dataset['class'].ne(-1)
]
non_similar_dataset = final_train_and_validation_dataset.loc[
    final_train_and_validation_dataset['class'].eq(-1)
]
display(similar_dataset.head(), non_similar_dataset.head())

# Re-declared here so the cells below also work when the generation cells were not run.
dataset_columns = ['img1', 'img2', 'label', 'class']
classes = train_df['class'].unique()

Unnamed: 0,img1,img2,label,class
0,14.jpg,14.jpg,1,0
1,14.jpg,6.jpg,1,0
2,14.jpg,9.jpg,1,0
3,14.jpg,6997109.jpg,1,0
4,14.jpg,9561745.jpg,1,0


Unnamed: 0,img1,img2,label,class
500954,9.jpg,111821.jpg,0,-1
500955,9.jpg,63119.jpg,0,-1
500956,9.jpg,15639.jpg,0,-1
500957,9.jpg,100314.jpg,0,-1
500958,9.jpg,2466.jpg,0,-1


In [9]:
# Split the similar pairs per class: after shuffling a class's pairs, the first
# `num_rows_from_class_for_validation` go to validation and the next
# `num_rows_from_class_for_train` go to training. Classes with few pairs
# therefore fill validation first and may contribute little or nothing to training.
num_rows_from_class_for_validation = 2
num_rows_from_class_for_train = 5

# Collect the per-class slices and concatenate once at the end; concatenating
# inside the loop would re-copy every accumulated row on each iteration.
# Each list starts with an empty frame so the result still carries
# `dataset_columns` when no class contributes any row.
train_parts = [pd.DataFrame(columns=dataset_columns)]
validation_parts = [pd.DataFrame(columns=dataset_columns)]
split_point = num_rows_from_class_for_validation
# NOTE(review): `.sample(frac=1)` is unseeded here, so the split differs from run to run.
for class_ in classes:
    images_of_class = similar_dataset[similar_dataset['class'] == class_]
    images_of_class = images_of_class.sample(frac=1).reset_index(drop=True)
    val_temp = images_of_class.iloc[:split_point]
    # iloc slices stop at the end of the frame, so short classes need no special case.
    train_temp = images_of_class.iloc[split_point:split_point + num_rows_from_class_for_train]
    train_parts.append(train_temp)
    validation_parts.append(val_temp)
train_dataset = pd.concat(train_parts)
validation_dataset = pd.concat(validation_parts)
print(train_dataset.shape)
display(train_dataset.head())
print('\n', validation_dataset.shape)
display(validation_dataset.head())

(48455, 4)


Unnamed: 0,img1,img2,label,class
2,6.jpg,11.jpg,1,0
3,6997109.jpg,8638678.jpg,1,0
4,6997109.jpg,6997109.jpg,1,0
5,14.jpg,11.jpg,1,0
6,13.jpg,8638678.jpg,1,0



 (19382, 4)


Unnamed: 0,img1,img2,label,class
0,11.jpg,5.jpg,1,0
1,9.jpg,13.jpg,1,0
0,17.jpg,17.jpg,1,1
1,18.jpg,18.jpg,1,1
0,4025423.jpg,2870228.jpg,1,2


In [10]:
# Add non-similar pairs to both splits: shuffle them once, then take the first
# `num_rows_for_validation` rows for validation and the following
# `num_rows_for_train` rows for training.
# NOTE(review): the two row counts are hard-coded; how they were chosen is not
# recorded in the notebook — confirm.
num_rows_for_validation = 29073
num_rows_for_train = 72680
non_similar_dataset = non_similar_dataset.sample(frac=1).reset_index(drop=True)
val_temp = non_similar_dataset.head(num_rows_for_validation)
train_temp = non_similar_dataset.iloc[
    num_rows_for_validation:num_rows_for_validation + num_rows_for_train
]
validation_dataset = pd.concat((validation_dataset, val_temp), axis=0)
train_dataset = pd.concat((train_dataset, train_temp), axis=0)
print(train_dataset.shape)
display(train_dataset.head(n=5))
print('\n', validation_dataset.shape)
display(validation_dataset.head(n=5))

(121135, 4)


Unnamed: 0,img1,img2,label,class
2,6.jpg,11.jpg,1,0
3,6997109.jpg,8638678.jpg,1,0
4,6997109.jpg,6997109.jpg,1,0
5,14.jpg,11.jpg,1,0
6,13.jpg,8638678.jpg,1,0



 (48455, 4)


Unnamed: 0,img1,img2,label,class
0,11.jpg,5.jpg,1,0
1,9.jpg,13.jpg,1,0
0,17.jpg,17.jpg,1,1
1,18.jpg,18.jpg,1,1
0,4025423.jpg,2870228.jpg,1,2


In [11]:
# Shuffle both splits so similar and non-similar rows are interleaved, renumbering rows each time.
# NOTE(review): ten consecutive shuffles give the same kind of result as a single one.
for i in range(10):
    train_dataset = train_dataset.sample(frac=1.0).reset_index(drop=True)
    validation_dataset = validation_dataset.sample(frac=1.0).reset_index(drop=True)
# 'class' was only needed for the per-class split; the saved files keep img1, img2 and label.
train_dataset = train_dataset.drop(columns=['class'])
validation_dataset = validation_dataset.drop(columns=['class'])

In [12]:
# Final check of both splits: shape plus the first five rows of each.
print(train_dataset.shape)
display(train_dataset.head(n=5))
print('\n', validation_dataset.shape)
display(validation_dataset.head(n=5))

(121135, 3)


Unnamed: 0,img1,img2,label
0,7187374.jpg,2721754.jpg,1
1,118823.jpg,7574439.jpg,1
2,3287857.jpg,8880.jpg,1
3,2683097.jpg,136507.jpg,0
4,9273.jpg,3871724.jpg,0



 (48455, 3)


Unnamed: 0,img1,img2,label
0,1070904.jpg,7083947.jpg,1
1,2014013.jpg,68154.jpg,0
2,14738.jpg,14738.jpg,1
3,2163621.jpg,12907.jpg,1
4,9236975.jpg,110513.jpg,1


In [72]:
# Write both splits to disk. The row index is saved as the first (unnamed) CSV column.
train_dataset.to_csv(
    'generated_datasets/train_dataset_small.csv',
    index=True,
)
validation_dataset.to_csv(
    'generated_datasets/validation_dataset_small.csv',
    index=True,
)