In [1]:
import os
import random
import shutil
from collections import Counter

import pandas as pd

In [2]:
# Root folder of the source images; they are read from <image_folder>/<hotel_id>/<image_id>.
image_folder = 'final_balanced_train_images_256x256'
# CSV loaded with pandas; its 'image_id' and 'hotel_id' columns drive the split.
csv_file = 'updated_image_to_hotel_mapping.csv'

In [3]:
# Number of images each hotel contributes to the train set; a hotel needs at
# least this many images to be used for training at all.
min_images_per_hotel = 11

# Where the train and validation copies are written.
train_folder = 'final/train_images'
validation_folder = 'final/validation_images'

# Make sure the parent folder and both split folders exist (no error if they already do).
for output_folder in ('final', train_folder, validation_folder):
    os.makedirs(output_folder, exist_ok=True)

## Splitting data into train and val folders

In [4]:
# Load the image -> hotel mapping; only the 'image_id' and 'hotel_id' columns are used.
data = pd.read_csv(csv_file)

# Shuffle with a dedicated, seeded generator so that re-running this cell
# reproduces the same split. With an unseeded shuffle every run picks different
# train images, and because the copies go into folders that may already exist,
# a second run could leave the same image in both the train and validation folders.
split_seed = 42
split_rng = random.Random(split_seed)

hotel_groups = data.groupby('hotel_id')

train_data = []
validation_data = []

for hotel_id, group in hotel_groups:
    image_ids = group['image_id'].tolist()

    split_rng.shuffle(image_ids)

    if len(image_ids) >= min_images_per_hotel:
        # Exactly `min_images_per_hotel` images go to train; the rest go to validation.
        train_data.extend(image_ids[:min_images_per_hotel])
        validation_data.extend(image_ids[min_images_per_hotel:])
    else:
        # Too few images for the train quota: the hotel appears in validation only.
        validation_data.extend(image_ids)

# Function to copy images to the destination folder
def copy_images(source_folder, destination_folder, image_ids, mapping=None):
    """Copy images into per-hotel subfolders of ``destination_folder``.

    Each image is read from ``source_folder/<hotel_id>/<image_id>`` and written
    to ``destination_folder/<hotel_id>/<image_id>``. Missing destination
    subfolders are created on the fly.

    Args:
        source_folder: Root folder containing one subfolder per hotel ID.
        destination_folder: Root folder that receives the copies.
        image_ids: Iterable of image file names to copy.
        mapping: Optional DataFrame with 'image_id' and 'hotel_id' columns used
            to look up the hotel of each image. Defaults to the notebook-level
            ``data`` frame.

    Raises:
        KeyError: If an image ID does not occur in the mapping.
    """
    frame = data if mapping is None else mapping

    # Build the image -> hotel lookup once instead of scanning the whole frame
    # for every image. keep='first' preserves the earlier behaviour of using
    # the first matching row when an image ID occurs more than once.
    hotel_by_image = (
        frame.drop_duplicates(subset='image_id', keep='first')
        .set_index('image_id')['hotel_id']
        .to_dict()
    )

    for image_id in image_ids:
        hotel_dir = str(hotel_by_image[image_id])
        src_path = os.path.join(source_folder, hotel_dir, image_id)
        dst_path = os.path.join(destination_folder, hotel_dir, image_id)
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)
        shutil.copy(src_path, dst_path)

# Materialise both splits on disk: train first, then validation.
for split_folder, split_ids in (
    (train_folder, train_data),
    (validation_folder, validation_data),
):
    copy_images(image_folder, split_folder, split_ids)

# Report how many images ended up in each split.
num_train_images, num_validation_images = len(train_data), len(validation_data)

print(f"Number of images in train set: {num_train_images}")
print(f"Number of images in validation set: {num_validation_images}")

Number of images in train set: 34276
Number of images in validation set: 9345


## Create CSVs for train and val data

In [5]:
hotel_list = []
for dirname, _, filenames in os.walk('final/train_images'):
    for filename in filenames:
        hotel_list.append(os.path.join(dirname, filename))

In [6]:
# Column buffers for the train CSV: image file names and their hotel IDs.
image_id, hotel_id = [], []

In [7]:
# Each collected path ends in <hotel_id>/<image_id>: the file name is the image ID
# and the name of its parent folder is the hotel ID.
# The lists are rebuilt from scratch (not appended to) so re-running this cell
# cannot duplicate rows, and os.path is used instead of splitting on '/' so the
# parsing does not depend on the platform's path separator.
image_id = [os.path.basename(path) for path in hotel_list]
hotel_id = [os.path.basename(os.path.dirname(path)) for path in hotel_list]

In [8]:
# Number of train images per hotel ID. Counter tallies in a single pass, whereas
# calling list.count() once per distinct hotel rescans the whole list each time.
no_of_images_per_hotel = dict(Counter(hotel_id))

In [9]:
# Empty frame with the two columns of the train CSV; the columns are populated in a later cell.
df = pd.DataFrame(columns=['image_id','hotel_id'])

In [10]:
# Fill both columns from the parsed lists and write the train index to disk
# (without the DataFrame index column).
df = df.assign(image_id=image_id, hotel_id=hotel_id)
df.to_csv('train.csv', index=False)

In [11]:
# Sanity check: number of distinct hotel IDs present in the train CSV.
df['hotel_id'].nunique() 

3116

In [12]:
# Collect the path of every file found anywhere below the validation folder.
hotel_list = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('final/validation_images')
    for filename in filenames
]

In [13]:
# Reset the column buffers before parsing the validation paths.
image_id, hotel_id = [], []

In [14]:
# Each collected path ends in <hotel_id>/<image_id>: the file name is the image ID
# and the name of its parent folder is the hotel ID.
# The lists are rebuilt from scratch (not appended to) so re-running this cell
# cannot duplicate rows, and os.path is used instead of splitting on '/' so the
# parsing does not depend on the platform's path separator.
image_id = [os.path.basename(path) for path in hotel_list]
hotel_id = [os.path.basename(os.path.dirname(path)) for path in hotel_list]

In [15]:
# Fresh empty frame with the two columns of the validation CSV; the columns are populated in a later cell.
df = pd.DataFrame(columns=['image_id','hotel_id'])

In [16]:
# Fill both columns from the parsed lists and write the validation index to disk
# (without the DataFrame index column).
df = df.assign(image_id=image_id, hotel_id=hotel_id)
df.to_csv('validation.csv', index=False)

In [17]:
# Sanity check: number of distinct hotel IDs present in the validation CSV.
df['hotel_id'].nunique() 

3116

In [None]:
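# NOTE: Everything in this cell is a disabled, earlier variant that splits by
# hotel ID (train/validation/test receive disjoint sets of hotels) instead of
# splitting each hotel's images. It is kept for reference only and is not executed.
# import os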
# import random
# import shutil
# import pandas as pd

# # Path to the original image folder
# image_folder = 'final_balanced_train_images_256x256'

# # Path to the original CSV file
# csv_file = 'updated_image_to_hotel_mapping.csv'

# # Number of images for the test folder
# test_images = 500

# # Train-validation split ratio
# train_ratio = 0.7
# validation_ratio = 0.2

# # Create final folder to store train, test, and validation folders
# os.makedirs('final', exist_ok=True)

# # Destination folders for train, test, and validation images
# train_folder = 'final/train_images'
# test_folder = 'final/test_images'
# validation_folder = 'final/validation_images'

# # Create the destination folders if they don't exist
# os.makedirs(train_folder, exist_ok=True)
# os.makedirs(test_folder, exist_ok=True)
# os.makedirs(validation_folder, exist_ok=True)

# # Read the CSV file
# data = pd.read_csv(csv_file)

# # Get the unique hotel IDs
# unique_hotel_ids = data['hotel_id'].unique().tolist()

# # Shuffle the hotel IDs randomly
# random.shuffle(unique_hotel_ids)

# # Split the hotel IDs into train, validation, and test sets
# num_hotels = len(unique_hotel_ids)
# num_train_hotels = int(train_ratio * num_hotels)
# num_validation_hotels = int(validation_ratio * num_hotels)

# train_hotel_ids = unique_hotel_ids[:num_train_hotels]
# validation_hotel_ids = unique_hotel_ids[num_train_hotels:num_train_hotels+num_validation_hotels]
# test_hotel_ids = unique_hotel_ids[num_train_hotels+num_validation_hotels:]

# # Function to copy images to the destination folder
# def copy_images(source_folder, destination_folder, hotel_ids):
#     for hotel_id in hotel_ids:
#         hotel_folder = os.path.join(source_folder, str(hotel_id))
#         images = os.listdir(hotel_folder)
#         for image in images:
#             src_path = os.path.join(hotel_folder, image)
#             dst_path = os.path.join(destination_folder, str(hotel_id), image)
#             os.makedirs(os.path.dirname(dst_path), exist_ok=True)
#             shutil.copy(src_path, dst_path)

# # Copy train images
# copy_images(image_folder, train_folder, train_hotel_ids)

# # Copy validation images
# copy_images(image_folder, validation_folder, validation_hotel_ids)

# # Copy test images
# copy_images(image_folder, test_folder, test_hotel_ids)

# # Print the number of images in each set
# num_train_images = len(os.listdir(train_folder))
# num_validation_images = len(os.listdir(validation_folder))
# num_test_images = len(os.listdir(test_folder))

# print(f"Number of images in train set: {num_train_images}")
# print(f"Number of images in validation set: {num_validation_images}")
# print(f"Number of images in test set: {num_test_images}")