In [None]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Prepare the dataset and build image-pair similarity labels

In [None]:
%%capture
# Extract the dataset archive from Drive into /content/training-dataset/.
# %%capture swallows the cell's output so unzip's file listing is not shown.
!unzip /content/gdrive/MyDrive/archive.zip -d /content/training-dataset/

In [None]:
# Flatten the doubly nested folders (train/train and test/test) into single train/ and test/ folders.
# Step 1: park the inner folders directly under /content/.
!mv "/content/training-dataset/train/train" "/content/"
!mv "/content/training-dataset/test/test" "/content/"

# Step 2: remove the outer folders; rmdir only succeeds if they are empty.
!rmdir "/content/training-dataset/train"
!rmdir "/content/training-dataset/test"

# Step 3: move the parked folders back under training-dataset/.
!mv "/content/train" "/content/training-dataset/"
!mv "/content/test" "/content/training-dataset/"




In [None]:
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.utils import shuffle
import cv2
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
source_dir = '/content/training-dataset/test/'
destination_dir = '/content/training-dataset/train/'

images = [f for f in os.listdir(source_dir) if f.lower().endswith('.jpg')]

for image in images:
    source_path = os.path.join(source_dir, image)
    destination_path = os.path.join(destination_dir, image)
    shutil.move(source_path, destination_path)

!rmdir "/content/training-dataset/test"

In [None]:
# Read both label files and stack them into one table.
# The paths are relative: this assumes the working directory is /content,
# where the archive was extracted -- TODO confirm if the notebook is run elsewhere.
training_dataset_train, training_dataset_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('training-dataset/train.csv', 'training-dataset/test.csv')
)

# ignore_index=True renumbers the rows 0..n-1 across both source files.
train_df = pd.concat((training_dataset_train, training_dataset_test), ignore_index=True)

# Keep a copy of the combined table on disk.
train_df.to_csv('trainn.csv', index=False)



In [None]:
# Preview the first rows of the combined label table.
display(train_df.head())

Unnamed: 0,name,class,group
0,1.jpg,0,0.0
1,2.jpg,0,0.0
2,3.jpg,0,0.0
3,4.jpg,0,0.0
4,5.jpg,0,0.0


In [None]:
# (rows, columns) of the combined label table.
train_df.shape

(197307, 3)

In [None]:
# Build positive ("same class") image pairs.
#
# Every iteration picks a class uniformly at random and then two distinct rows
# of that class. Classes with fewer than two images are skipped, so the number
# of pairs produced can be smaller than `num_iterations`.

# Number of sampling attempts (hard-coded; change it here to resize the dataset).
num_iterations = 400000

# Dedicated seeded generator: re-running the cell reproduces the same pairs
# and the global `random` state is left untouched.
SAME_CLASS_SEED = 42
pair_rng = random.Random(SAME_CLASS_SEED)

available_classes = train_df['class'].unique()

# Group the image names by class once, instead of filtering the whole
# DataFrame on every iteration.
names_by_class = {
    class_value: group_names.tolist()
    for class_value, group_names in train_df.groupby('class')['name']
}

# Plain Python list of class labels for random.choice.
class_pool = available_classes.tolist()

# Collect plain dict records and build the DataFrame once at the end; this
# avoids creating one single-row DataFrame per pair.
records = []
for _ in range(num_iterations):
    class_choice = pair_rng.choice(class_pool)
    candidates = names_by_class.get(class_choice, [])

    # A pair needs two distinct rows of the same class.
    if len(candidates) < 2:
        continue

    image1_name, image2_name = pair_rng.sample(candidates, 2)
    records.append({
        'image1': image1_name,
        'image2': image2_name,
        # Both images come from the same class by construction.
        'similarity': 1,
        'class': class_choice,
    })

# Passing `columns` keeps the expected schema even if no pair was produced.
results_1 = pd.DataFrame(records, columns=['image1', 'image2', 'similarity', 'class'])

# Report what was actually produced; nothing is written to disk in this cell.
print(f"Generated {len(results_1)} same-class similarity pairs from {num_iterations} sampling attempts")



Generated 400000 similarity pairs and saved to 'same_image_similarity_results.csv'


In [None]:
# Show the generated same-class pairs (pandas truncates the printout for large frames).
print(results_1)

In [None]:
# Build an order-insensitive key per pair, so (a, b) and (b, a) count as the same pair.
pair_keys = pd.Series(
    [tuple(sorted(pair)) for pair in zip(results_1['image1'], results_1['image2'])],
    index=results_1.index,
)
results_1['sorted_images'] = pair_keys

# Keep only the first occurrence of each key; the original row index is preserved.
filtered_results_1 = results_1.loc[~results_1.duplicated(subset='sorted_images')]

# The key column was only a helper for deduplication.
results_1 = filtered_results_1.drop(columns=['sorted_images'])


In [None]:
# Show the same-class pairs after duplicate pairs were removed.
print(results_1)

             image1       image2  similarity  class
0         36704.jpg    36694.jpg           1   3465
1       2053229.jpg   125962.jpg           1   8638
2         98557.jpg    98558.jpg           1   6872
3       8483064.jpg  6247397.jpg           1    901
4       5825141.jpg    65198.jpg           1   4857
...             ...          ...         ...    ...
399995    17811.jpg    17817.jpg           1   2040
399996   140990.jpg  2075548.jpg           1   9618
399997  4738756.jpg   104650.jpg           1   7362
399998    13741.jpg  7658253.jpg           1   1666
399999     3620.jpg     3617.jpg           1    479

[325313 rows x 4 columns]


In [None]:
# How many distinct classes are represented among the same-class pairs.
class_column = results_1['class']
num_unique_classes = class_column.nunique()
print("Number of unique classes:", num_unique_classes)

Number of unique classes: 9691


In [None]:
# Build random image pairs.
#
# Every iteration draws two distinct rows of `train_df` uniformly at random.
# The pair is labelled 1 when both rows share a class and 0 otherwise, so the
# result consists almost entirely of negatives.

# Number of pairs to draw (hard-coded; change it here to resize the dataset).
num_iterations = 900000

# Dedicated seeded generator: re-running the cell reproduces the same pairs
# and the global `random` state is left untouched.
RANDOM_PAIR_SEED = 43
pair_rng = random.Random(RANDOM_PAIR_SEED)

# Plain lists give fast positional access inside the loop, instead of
# slicing the DataFrame with .iloc on every iteration.
image_names = train_df['name'].tolist()
image_classes = train_df['class'].tolist()
row_positions = range(len(train_df))

# Classes seen in at least one sampled pair.
unique_classes = set()

# Collect plain dict records and build the DataFrame once at the end; this
# avoids creating one single-row DataFrame per pair.
records = []
for _ in range(num_iterations):
    first, second = pair_rng.sample(row_positions, 2)
    class1 = image_classes[first]
    class2 = image_classes[second]
    unique_classes.update((class1, class2))

    records.append({
        'image1': image_names[first],
        'image2': image_names[second],
        'similarity': 1 if class1 == class2 else 0,
        # Only the class of the first image is recorded for the pair.
        'class': class1,
    })

# Passing `columns` keeps the expected schema even if no pair was produced.
results_2 = pd.DataFrame(records, columns=['image1', 'image2', 'similarity', 'class'])

# Report what was actually produced; nothing is written to disk in this cell.
print(f"Generated {len(results_2)} random similarity pairs")


Generated 900000 similarity pairs and saved to 'random_image_similarity_results.csv'


In [None]:
# Show the generated random pairs (pandas truncates the printout for large frames).
print(results_2)

             image1       image2  similarity  class
0         37301.jpg     8236.jpg           0   3511
1         74539.jpg  9988470.jpg           0   5342
2         37839.jpg     7684.jpg           0   3553
3        131833.jpg    32012.jpg           0   9003
4         92973.jpg    34639.jpg           0   6481
...             ...          ...         ...    ...
899995   103738.jpg    50375.jpg           0   7284
899996  2284130.jpg    35457.jpg           0   4876
899997  9003279.jpg    60103.jpg           0   7858
899998    34396.jpg    67877.jpg           0   3290
899999     8925.jpg  9042881.jpg           0   1150

[900000 rows x 4 columns]


In [None]:
# Build an order-insensitive key per pair, so (a, b) and (b, a) count as the same pair.
pair_keys = pd.Series(
    [tuple(sorted(pair)) for pair in zip(results_2['image1'], results_2['image2'])],
    index=results_2.index,
)
results_2['sorted_images'] = pair_keys

# Keep only the first occurrence of each key; the original row index is preserved.
filtered_results_2 = results_2.loc[~results_2.duplicated(subset='sorted_images')]

# The key column was only a helper for deduplication.
results_2 = filtered_results_2.drop(columns=['sorted_images'])

In [None]:
# Show the random pairs after duplicate pairs were removed.
print(results_2)

             image1       image2  similarity  class
0         37301.jpg     8236.jpg           0   3511
1         74539.jpg  9988470.jpg           0   5342
2         37839.jpg     7684.jpg           0   3553
3        131833.jpg    32012.jpg           0   9003
4         92973.jpg    34639.jpg           0   6481
...             ...          ...         ...    ...
899995   103738.jpg    50375.jpg           0   7284
899996  2284130.jpg    35457.jpg           0   4876
899997  9003279.jpg    60103.jpg           0   7858
899998    34396.jpg    67877.jpg           0   3290
899999     8925.jpg  9042881.jpg           0   1150

[899975 rows x 4 columns]


In [None]:

# Tally the similarity labels among the random pairs.
similarity_counts = results_2['similarity'].value_counts()

# .get(label, 0) reports 0 for a label that never occurs instead of raising.
for caption, label in (("Count of 0s:", 0), ("Count of 1s:", 1)):
    print(caption, similarity_counts.get(label, 0))

Count of 0s: 899862
Count of 1s: 113


In [None]:
# How many distinct classes are recorded in the 'class' column of the random pairs.
class_column = results_2['class']
num_unique_classes = class_column.nunique()
print("Number of unique classes:", num_unique_classes)

Number of unique classes: 9691


In [None]:
# Stack the same-class pairs and the random pairs into one table, then shuffle the rows.
similarity_results = pd.concat([results_1, results_2], ignore_index=True)

# A fixed random_state makes the row order reproducible, matching the seeded
# train/validation split applied to this table later in the notebook.
similarity_results = shuffle(similarity_results, random_state=42)



In [None]:
from sklearn.model_selection import train_test_split

# Split the pairs into training (90%) and validation (10%) sets separately for
# every value of the 'class' column, so each class appears in both sets
# whenever it has at least two pairs.
train_data = []
val_data = []

# groupby partitions the table in a single pass instead of scanning the whole
# frame once per class. sort=False visits the classes in order of first
# appearance, and the rows inside each group keep their existing order.
for _, class_data in similarity_results.groupby('class', sort=False):
    if len(class_data) < 2:
        # train_test_split cannot split a single row (the training part would
        # be empty and it raises), so a lone pair goes to the training set.
        train_data.append(class_data)
        continue

    train_class_data, val_class_data = train_test_split(class_data, test_size=0.1, random_state=42)

    train_data.append(train_class_data)
    val_data.append(val_class_data)

# NOTE: from here on `train_df` is the table of training pairs; it replaces
# the image-level label table that used this name earlier in the notebook.
train_df = pd.concat(train_data, ignore_index=True)
val_df = pd.concat(val_data, ignore_index=True)


In [None]:
# Show the training and validation pair tables (pandas truncates the printout for large frames).
print(train_df)
print(val_df)

              image1       image2  similarity
0          34609.jpg    34605.jpg           1
1        8825557.jpg  2663704.jpg           1
2        8825557.jpg    38049.jpg           0
3          34607.jpg    34606.jpg           1
4        1219564.jpg    34609.jpg           1
...              ...          ...         ...
1098417     1223.jpg    69036.jpg           0
1098418     1224.jpg    82210.jpg           0
1098419  8886641.jpg  2814661.jpg           0
1098420  3122811.jpg    27138.jpg           0
1098421     1222.jpg  6409085.jpg           0

[1098422 rows x 3 columns]
             image1       image2  similarity
0       2663704.jpg    34605.jpg           1
1         34605.jpg  3648466.jpg           0
2       2663704.jpg    95458.jpg           0
3       1638523.jpg    35079.jpg           0
4       9060149.jpg    48111.jpg           0
...             ...          ...         ...
126861   102962.jpg  5528491.jpg           1
126862  3122811.jpg    63952.jpg           0
126863  8886641

In [None]:
# How many distinct classes are present in the training pairs.
class_column = train_df['class']
num_unique_classes = class_column.nunique()
print("Number of unique classes:", num_unique_classes)

Number of unique classes: 9691


In [None]:
# How many distinct classes are present in the validation pairs.
class_column = val_df['class']
num_unique_classes = class_column.nunique()
print("Number of unique classes:", num_unique_classes)

Number of unique classes: 9691


In [None]:

# Tally the similarity labels in the training pairs.
similarity_counts = train_df['similarity'].value_counts()

# .get(label, 0) reports 0 for a label that never occurs instead of raising.
for caption, label in (("Count of 0s:", 0), ("Count of 1s:", 1)):
    print(caption, similarity_counts.get(label, 0))

Count of 0s: 806678
Count of 1s: 291744


In [None]:
# Tally the similarity labels in the validation pairs.
similarity_counts = val_df['similarity'].value_counts()

# .get(label, 0) reports 0 for a label that never occurs instead of raising.
for caption, label in (("Count of 0s:", 0), ("Count of 1s:", 1)):
    print(caption, similarity_counts.get(label, 0))

Count of 0s: 93184
Count of 1s: 33682


In [None]:
# Remove the 'class' column so only image1, image2 and similarity remain.
# errors='ignore' keeps the cell re-runnable: running it a second time, after
# the column is already gone, is a no-op instead of a KeyError.
train_df = train_df.drop(columns=['class'], errors='ignore')
val_df = val_df.drop(columns=['class'], errors='ignore')

In [None]:
# Write the training and validation pair tables to CSV, without the row index.
for frame, csv_name in ((train_df, 'train_dataset.csv'), (val_df, 'validation_dataset.csv')):
    frame.to_csv(csv_name, index=False)