In [1]:
import os
import numpy as np 
import pandas as pd
import random
import shutil
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# !pip install opencv-python
import cv2

In [3]:
# Per-hotel image counts; the frame displayed below has 'hotel_id' and 'count' columns
df1 = pd.read_csv('hotel_to_imagecount_mapping.csv')

In [4]:
# Image-to-hotel mapping; the frame displayed below has 'image_id' and 'hotel_id' columns
df2 = pd.read_csv('image_to_hotel_mapping.csv')

In [5]:
# Show the per-hotel count table (the recorded output elides the middle rows)
df1

Unnamed: 0,hotel_id,count
0,10684,21
1,6213,8
2,199664,20
3,103319,16
4,29562,12
...,...,...
3111,38940,10
3112,306466,4
3113,310203,8
3114,21582,4


In [6]:
# Peek at the first rows of the image -> hotel mapping
df2.head()

Unnamed: 0,image_id,hotel_id
0,000011648.jpg,15526
1,000011630.jpg,15526
2,000011650.jpg,15526
3,000011633.jpg,15526
4,000011656.jpg,15526


In [7]:
# Summary statistics over the per-hotel image counts in df1['count'].
s = df1['count'].sum()
mx = df1['count'].max()
mn = df1['count'].min()
mean = df1['count'].mean()

# Report all four figures once they have been computed.
print('total number of images: ', s)
print('maximum number of images for a hotel: ', mx)
print('minimum number of images for a hotel: ', mn)
print('average number of images for a hotel: ', mean)


total number of images:  44702
maximum number of images for a hotel:  1393
minimum number of images for a hotel:  1
average number of images for a hotel:  14.345956354300386


In [8]:
# Load one resized training image and check its dimensions.
# NOTE(review): per the OpenCV docs cv2.imread returns None (no exception) when the
# file cannot be read, in which case `.shape` below would raise AttributeError.
new_img = cv2.imread('resized_train_images_256/100143/000007991.jpg')
new_img.shape

(256, 256, 3)

## Augmenting images for hotels with at most the mean number of images

Creating a list of image paths for hotels whose image count is less than or equal to 14

In [9]:
# Collect every image path (and its hotel label) for hotels whose image count
# is at most `mean_count`; these are the classes that get augmented later.
mean_count = 14
path = 'resized_train_images_256'

# Get list of hotel IDs
hotel_id_list = df1['hotel_id'].tolist()

# Build the hotel_id -> count lookup once instead of filtering df1 for every
# hotel. drop_duplicates keeps the first row per hotel, which matches the
# previous `.values[0]` lookup.
count_by_hotel = (
    df1.drop_duplicates(subset='hotel_id')
       .set_index('hotel_id')['count']
       .to_dict()
)

count = 0  # number of hotels at or below the threshold
minority = []
minority_images_label = []
minority_image_list = []

for hotel_id in hotel_id_list:
    # Get count for current hotel ID
    val = count_by_hotel[hotel_id]

    if val <= mean_count:
        count += 1
        path_min = os.path.join(path, str(hotel_id))

        # A hotel listed in the CSV may have no folder on disk; warn and move on
        # (same policy as the downsampling cell) instead of aborting the scan.
        if not os.path.isdir(path_min):
            print("Warning: Folder not found {}".format(path_min))
            continue

        # os.listdir returns entries in arbitrary order; sort for repeatable runs.
        for filename in sorted(os.listdir(path_min)):
            minority_image_list.append(os.path.join(path_min, filename))
            minority_images_label.append(hotel_id)

In [10]:
# Number of image paths collected for hotels at or below the threshold
len(minority_image_list)

18305

Functions that describe the different augmentations

In [11]:
def rotate(image):
    """Rotate `image` about its centre by a random whole-degree angle.

    The angle is drawn with random.randint(-20, 20) and the rotation uses a
    scale of 1; the output canvas keeps the input's width and height.

    Args:
        image: array whose first two dimensions are (height, width).

    Returns:
        The rotated image produced by cv2.warpAffine.
    """
    theta = random.randint(-20, 20)
    rows, cols = image.shape[:2]
    centre = (cols/2, rows/2)
    return cv2.warpAffine(image, cv2.getRotationMatrix2D(centre, theta, 1), (cols, rows))

def flip(image):
    """Return a horizontally mirrored copy of `image`.

    The earlier version drew a code from {-1, 0, 1} and only mirrored on 0,
    handing back the untouched input otherwise. Because every result of an
    augmentation is written out as a new training file, two out of three
    calls produced a plain copy of the original image. The flip is now
    always applied.

    Args:
        image: image array accepted by cv2.flip.

    Returns:
        The image flipped around its vertical axis (cv2.flip flipCode 1).
    """
    return cv2.flip(image, 1)  # horizontal flip

def optical_distortion(image):
    """Warp `image` with a randomly drawn lens-distortion model.

    A camera matrix is built with focal lengths of 0.8-1.2 times the image
    width/height and the principal point at the image centre; distortion
    coefficients are drawn from small symmetric ranges and the image is
    passed through cv2.undistort.

    The coefficients are kept as a plain NumPy array. They used to be wrapped
    in cv2.UMat while the image and camera matrix were ndarrays, which mixes
    OpenCV array types in one call and makes the result type differ from the
    other augmentation functions.

    Args:
        image: array whose first two dimensions are (height, width).

    Returns:
        The remapped image returned by cv2.undistort.
    """
    height, width = image.shape[:2]
    fx = random.uniform(0.8, 1.2)
    fy = random.uniform(0.8, 1.2)
    cx = width/2
    cy = height/2
    k1 = random.uniform(-0.05, 0.05)
    k2 = random.uniform(-0.05, 0.05)
    k3 = random.uniform(-0.05, 0.05)
    p1 = random.uniform(-0.03, 0.03)
    p2 = random.uniform(-0.03, 0.03)
    # OpenCV's coefficient order is (k1, k2, p1, p2, k3): k* radial, p* tangential.
    distCoeffs = np.array([k1, k2, p1, p2, k3], dtype=np.float32)
    camera_matrix = np.array([[fx*width, 0, cx],
                              [0, fy*height, cy],
                              [0, 0, 1]], dtype=np.float32)
    distorted_image = cv2.undistort(image, camera_matrix, distCoeffs, None)
    return distorted_image

def gaussian_blur(image):
    """Blur `image` with a square Gaussian kernel of random size.

    The kernel side is picked from 3, 5 or 7 and sigma is passed as 0.

    Args:
        image: image array accepted by cv2.GaussianBlur.

    Returns:
        The blurred image.
    """
    side = random.choice([3, 5, 7])
    return cv2.GaussianBlur(image, (side, side), 0)

def color_jitter(image):
    """Randomly perturb brightness, contrast and saturation of a BGR image.

    Each factor is drawn uniformly from [0.7, 1.3]. The image is converted to
    HSV; the saturation factor scales the S channel, the brightness factor
    scales the V channel, and the contrast factor stretches V about its mean.
    Hue is left unchanged.

    Fixes over the previous version:
      * the scaled channels were stored straight back into the uint8 HSV
        array, so products above 255 overflowed instead of saturating;
      * the "contrast" factor was applied to S and the "saturation" factor
        to the hue channel, which could push hue outside the range OpenCV
        uses for 8-bit HSV.

    Args:
        image: BGR image; assumed to be 8-bit as loaded by cv2.imread
            (TODO confirm if other callers pass a different dtype).

    Returns:
        A new uint8 BGR image with the same shape as the input.
    """
    brightness_factor = random.uniform(0.7, 1.3)
    contrast_factor = random.uniform(0.7, 1.3)
    saturation_factor = random.uniform(0.7, 1.3)

    # Work in float so intermediate values can exceed 255 before clipping.
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV).astype(np.float32)

    saturation = hsv_image[:, :, 1] * saturation_factor

    value = hsv_image[:, :, 2] * brightness_factor
    value_mean = value.mean()
    value = (value - value_mean) * contrast_factor + value_mean

    hsv_image[:, :, 1] = np.clip(saturation, 0, 255)
    hsv_image[:, :, 2] = np.clip(value, 0, 255)

    augmented_image = cv2.cvtColor(hsv_image.astype(np.uint8), cv2.COLOR_HSV2BGR)
    return augmented_image

Randomly selecting an augmentation from the list of transformations and creating a new folder with the original + augmented images (up to 14 images per hotel)

In [12]:
# Top up every minority hotel to `max_augmentations` images: copy its original
# images into the output folder, then write randomly augmented copies until
# the target count is reached.
output_path = 'balanced_undersampled_train_images_256x256'
transformations = [rotate, flip, optical_distortion, gaussian_blur, color_jitter]
max_augmentations = 14

# Seed the stdlib RNG so the chosen images and transformations are repeatable.
random.seed(42)

# Group the image paths by hotel in a single pass instead of rescanning the
# full path list once per hotel.
images_by_hotel = {}
for img_path, label in zip(minority_image_list, minority_images_label):
    images_by_hotel.setdefault(label, []).append(img_path)

for hotel_id, hotel_images in images_by_hotel.items():
    hotel_images.sort()  # fixed order so random.choice is repeatable under the seed

    # Check the number of original images for the hotel_id
    original_image_count = len(hotel_images)
    augmentation_factor = max_augmentations - original_image_count

    hotel_path = os.path.join(output_path, str(hotel_id))
    os.makedirs(hotel_path, exist_ok=True)

    # Copy the original images to the output folder
    for image_path in hotel_images:
        shutil.copy2(image_path, hotel_path)

    # Generate additional augmented images. `candidates` shrinks when a file
    # turns out to be unreadable, so the loop cannot spin forever on bad input.
    candidates = list(hotel_images)
    while augmentation_factor > 0 and candidates:
        # Randomly select an image from the hotel images
        image_path = random.choice(candidates)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports an unreadable file by returning None
            print("Warning: Could not read {}".format(image_path))
            candidates.remove(image_path)
            continue

        # Randomly select a transformation
        chosen_transformation = random.choice(transformations)
        transformed_image = chosen_transformation(img)

        # Generate a new filename for the augmented image; os.path handles the
        # platform's separator (the old code split on a hard-coded "/").
        image_name, file_extension = os.path.splitext(os.path.basename(image_path))
        new_image_name = "{}_augmented_{}{}".format(image_name, original_image_count + 1, file_extension)

        cv2.imwrite(os.path.join(hotel_path, new_image_name), transformed_image)

        augmentation_factor -= 1
        original_image_count += 1

### Downsampling images for hotels with more than the mean number of images

In [13]:
# df1 is the DataFrame containing columns 'hotel_id' and 'count'
# df2 is the DataFrame containing columns 'image_id' and 'hotel_id'
#
# For every hotel with more than `threshold` images, pick `threshold` rows of
# df2 at random (fixed seed) and copy those image files to the destination.

source_folder = 'resized_train_images_256'

# Identify hotel_id classes with a count higher than the mean (14)
threshold = 14
classes_to_undersample = df1[df1['count'] > threshold]['hotel_id'].tolist()

# Collect the per-hotel samples and concatenate once; growing a DataFrame with
# pd.concat inside the loop copies all accumulated rows on every iteration.
sampled_frames = []
for hotel_id in classes_to_undersample:
    hotel_rows = df2[df2['hotel_id'] == hotel_id]
    # df1's count may disagree with the rows present in df2; never ask
    # .sample() for more rows than exist (it raises ValueError otherwise).
    class_df = hotel_rows.sample(n=min(threshold, len(hotel_rows)), random_state=42)
    sampled_frames.append(class_df)

if sampled_frames:
    final_df = pd.concat(sampled_frames)
else:
    final_df = pd.DataFrame(columns=['image_id', 'hotel_id'])

destination_folder = 'balanced_oversampled_train_images_256x256'
os.makedirs(destination_folder, exist_ok=True)

for image_id, hotel_id in zip(final_df['image_id'], final_df['hotel_id']):
    # Create the hotel_id folder if it doesn't exist
    hotel_path = os.path.join(destination_folder, str(hotel_id))
    os.makedirs(hotel_path, exist_ok=True)

    source_path = os.path.join(source_folder, str(hotel_id), str(image_id))
    destination_path = os.path.join(hotel_path, str(image_id))

    # copy the image from source to destination
    try:
        shutil.copy(source_path, destination_path)
    except FileNotFoundError:
        print("Warning: File not found {}".format(source_path))
        continue



### Checking total image counts in both undersampled and oversampled folders

In [14]:
# Count the entries in every hotel sub-folder of the augmented output folder.
destination_folder = 'balanced_undersampled_train_images_256x256'

# Gather plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the frame each time.
image_count_records = []
for root, dirs, files in os.walk(destination_folder):
    for hotel_id in dirs:
        hotel_path = os.path.join(root, hotel_id)
        image_count = len(os.listdir(hotel_path))
        image_count_records.append({'hotel_id': hotel_id, 'image_count': image_count})

image_counts_df = pd.DataFrame(image_count_records, columns=['hotel_id', 'image_count'])

# Display the image counts DataFrame (cap the sample at the number of rows
# available so a small folder does not raise)
print(image_counts_df.sample(min(50, len(image_counts_df))))

     hotel_id image_count
2310     8629          14
20       5905          14
302    208254          14
1038    37707          14
183    201370          14
130    197443          14
738     13416          14
1719   200297          14
2110   644634          14
2016     4081          14
1662   119266          14
1445   306733          14
615    199511          14
58      75136          14
174     89987          14
2229    17500          14
1742   309902          14
1803   310276          14
1177   207228          14
1432   208471          14
1775    95726          14
262     21582          14
177     86397          14
1782   274117          14
1634   310971          14
1759    31512          14
2052    26003          14
607     85237          14
637    309954          14
1009    16202          14
650    199257          14
1115    66541          14
417    306424          14
1564   676252          14
256     20272          14
155     26959          14
1049   309816          14
1252    2110

In [15]:
# Average over the per-hotel column; `image_count` is only the scalar left
# over from the last folder visited by the loop above.
image_counts_df['image_count'].astype(int).mean()

14.0

In [16]:
# Maximum over the per-hotel column; `image_count` is only the scalar left
# over from the last folder visited by the counting loop.
image_counts_df['image_count'].astype(int).max()

14

In [17]:
# Minimum over the per-hotel column; `image_count` is only the scalar left
# over from the last folder visited by the counting loop.
image_counts_df['image_count'].astype(int).min()

14

In [18]:
# Count the entries in every hotel sub-folder of the downsampled output folder.
destination_folder = 'balanced_oversampled_train_images_256x256'

# Gather plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the frame each time.
image_count_records = []
for root, dirs, files in os.walk(destination_folder):
    for hotel_id in dirs:
        hotel_path = os.path.join(root, hotel_id)
        image_count = len(os.listdir(hotel_path))
        image_count_records.append({'hotel_id': hotel_id, 'image_count': image_count})

image_counts_df = pd.DataFrame(image_count_records, columns=['hotel_id', 'image_count'])

# Display the image counts DataFrame (cap the sample at the number of rows
# available so a small folder does not raise)
print(image_counts_df.sample(min(50, len(image_counts_df))))

    hotel_id image_count
58    117083          14
673    17717          14
634    12016          14
606   197432          14
491   459382          14
545    23668          14
17     76931          14
516    18801          14
648   199295          14
713    18093          14
124    15078          14
561   310976          14
285    13253          14
656    78805          14
22       546          14
201   110119          14
220    27858          14
642    12108          14
601    19899          14
761     6514          14
85      4389          14
458   202992          14
368     3631          14
68     39400          14
382     4572          14
74     37890          14
589    98841          14
132    16666          14
238     1562          14
342    56572          14
548    91129          14
401    21235          14
111    76693          14
765    93803          14
775    22789          14
751   308350          14
115   115775          14
266     9021          14
263   127242          14


In [19]:
# Average over the per-hotel column; `image_count` is only the scalar left
# over from the last folder visited by the loop above.
image_counts_df['image_count'].astype(int).mean()

14.0

## Creating Final Balanced Folder 

Creating a final folder which has all augmented + downsampled images

In [20]:
# Merge both balanced folders into one tree, keeping each hotel sub-folder.
folder1_path = 'balanced_undersampled_train_images_256x256'
folder2_path = 'balanced_oversampled_train_images_256x256'

new_folder_path = 'final_balanced_train_images_256x256'

if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)

# The same walk-and-copy routine is applied to each source folder in turn.
for source_root in (folder1_path, folder2_path):
    for root, _, files in os.walk(source_root):
        # Recreate the sub-folder at the same relative location in the target
        relative_path = os.path.relpath(root, source_root)
        new_subdirectory = os.path.join(new_folder_path, relative_path)

        if not os.path.exists(new_subdirectory):
            os.makedirs(new_subdirectory)

        for file in files:
            shutil.copy(os.path.join(root, file), new_subdirectory)

Checking final folder stats

In [21]:
# Count the entries in every hotel sub-folder of the merged final folder.
destination_folder = 'final_balanced_train_images_256x256'

# Gather plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the frame each time.
image_count_records = []
for root, dirs, files in os.walk(destination_folder):
    for hotel_id in dirs:
        hotel_path = os.path.join(root, hotel_id)
        image_count = len(os.listdir(hotel_path))
        image_count_records.append({'hotel_id': hotel_id, 'image_count': image_count})

image_counts_df = pd.DataFrame(image_count_records, columns=['hotel_id', 'image_count'])

# Total number of images after Balancing
print('Total number of images after Balancing: ', image_counts_df['image_count'].sum())
# Display the image counts DataFrame (cap the sample at the number of rows
# available so a small folder does not raise)
print(image_counts_df.sample(min(10, len(image_counts_df))))

Total number of images after Balancing:  43621
     hotel_id image_count
2546     3518          14
2996     3096          14
1895   249786          14
657    200039          14
1507    12994          14
1159     3236          14
764     43372          14
578     32635          14
1200    88414          14
2762    94762          14


## Creating new CSVs for (1) the image-to-hotel mapping and (2) the hotel ID and image count after balancing

In [22]:
# Full path of every file found anywhere under the final balanced folder.
hotel_list = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('final_balanced_train_images_256x256')
    for name in names
]

In [23]:
# Start both accumulators empty; they are filled from hotel_list afterwards.
image_id, hotel_id = [], []

In [24]:
# Derive the file name and its parent folder name (the hotel id) from each path.
# os.path handles the platform's separator (the old code split on "/"), and
# rebuilding the lists here means re-running the cell cannot append duplicates.
image_id = [os.path.basename(file_path) for file_path in hotel_list]
hotel_id = [os.path.basename(os.path.dirname(file_path)) for file_path in hotel_list]

In [25]:
# Count images per hotel in one pass. Calling list.count() for every unique id
# rescans the whole list each time, and iterating a set of strings gives an
# order that can change between interpreter runs.
no_of_images_per_hotel = pd.Series(hotel_id, dtype=object).value_counts(sort=False).to_dict()

In [26]:
# Empty frame with the two mapping columns; filled from image_id / hotel_id next
df = pd.DataFrame(columns=['image_id','hotel_id'])

In [28]:
# Fill both mapping columns from the parsed lists, then save without the index.
df = df.assign(image_id=image_id, hotel_id=hotel_id)
df.to_csv('updated_image_to_hotel_mapping.csv', index=False)

In [29]:
# Empty frame for the per-hotel counts.
# NOTE(review): this rebinds `df2`, which earlier held the image_to_hotel_mapping.csv
# data read by the downsampling cell; re-running that cell after this one
# would sample from this frame instead.
df2 = pd.DataFrame(columns=['hotel_id','count'])

In [30]:
# Split the per-hotel count dict into parallel key/value lists, load them into
# df2 and save the result without the index.
lisKey = list(no_of_images_per_hotel.keys())
lisVal = list(no_of_images_per_hotel.values())
df2['hotel_id'] = lisKey
df2['count'] = lisVal
df2.to_csv('updated_hotel_to_imagecount_mapping.csv', index=False)