In [2]:
# Mount Google Drive at /content/drive; the data paths in the cells below all
# live under /content/drive/MyDrive. Requires a Google Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import os
import shutil
import glob


def select_train_labels(
    n: int = 50,
    full_csv_path: str = "",
    downloaded_img_ids: str = "",
    output_csv_path: str = "/content/drive/MyDrive/cs444-final-project/project/data/train/selected_train.csv",
) -> None:
    """Keep the rows of the ``n`` most frequent landmarks and save them as a CSV.

    Only rows whose ``id`` appears in the downloaded-ids CSV are considered.
    Landmarks are ranked by how many of those rows they have, and every
    downloaded row belonging to one of the top ``n`` landmarks is written to
    ``output_csv_path``. The number of rows written is printed.

    Args:
        n: Number of most frequent landmark ids to keep.
        full_csv_path: Path to the label CSV; must have ``id`` and
            ``landmark_id`` columns.
        downloaded_img_ids: Path to a CSV with an ``id`` column listing the
            images that are available.
        output_csv_path: Where the filtered rows are written (without the
            index). Missing parent directories are created.

    Returns:
        None. The result is written to ``output_csv_path``.
    """
    # Read ids as strings so that ids made only of digits keep leading zeros
    # and compare equal across the two files.
    available_ids = pd.read_csv(downloaded_img_ids, dtype={"id": str})["id"]
    full_train_data = pd.read_csv(full_csv_path, dtype={"id": str})

    # Select only downloaded img ids
    downloaded_train_data = full_train_data[full_train_data["id"].isin(available_ids)]

    # Count rows per landmark, most frequent first
    label_counts = (
        downloaded_train_data.groupby("landmark_id").size().sort_values(ascending=False)
    )
    selected_labels = label_counts.head(n).index

    selected_train_data = downloaded_train_data[
        downloaded_train_data["landmark_id"].isin(selected_labels)
    ]

    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    selected_train_data.to_csv(output_csv_path, index=False)

    print(f"Size of selected_train_data: {selected_train_data.shape[0]}")



"""
    Moves selected training images from the src_data_path to dest_data_path
"""
def gather_train_data(selected_train_data="", src_data_path="", dest_data_path=""):
    """Move the selected ``.jpg`` training images into ``dest_data_path``.

    ``src_data_path`` is walked recursively. Every file whose extension is
    ``.jpg`` (case-insensitive) and whose base name appears in the ``id``
    column of the ``selected_train_data`` CSV is moved (not copied) into
    ``dest_data_path``, which is created if it does not exist.

    Args:
        selected_train_data: Path to a CSV with an ``id`` column.
        src_data_path: Root directory to search for images.
        dest_data_path: Directory that receives the matching images.

    Raises:
        shutil.Error: If a file with the same name already exists in
            ``dest_data_path`` (raised by ``shutil.move``).
    """
    os.makedirs(dest_data_path, exist_ok=True)
    dest_abs = os.path.abspath(dest_data_path)

    # Read ids as strings: file names are strings, so an id column that pandas
    # would otherwise parse as integers could never match.
    train_data = pd.read_csv(selected_train_data, dtype={"id": str})
    image_ids = set(train_data["id"])
    print(f"Length of selected train data img ids: {len(image_ids)}")

    image_file_paths = []
    for root, dirs, files in os.walk(src_data_path):
        # Never descend into the destination: files already there would be
        # "moved" onto themselves and make shutil.move raise on a re-run.
        dirs[:] = [
            d for d in dirs if os.path.abspath(os.path.join(root, d)) != dest_abs
        ]
        if os.path.abspath(root) == dest_abs:
            continue
        for file in files:
            file_name, extension = os.path.splitext(file)
            # Ensures that img is a .jpg within the selected pool
            if extension.lower() == '.jpg' and file_name in image_ids:
                image_file_paths.append(os.path.join(root, file))

    print(f"Length of selected train data files: {len(image_file_paths)}")
    print(f"Moving these data to {dest_data_path}...")

    # Paths are collected before anything is moved so the directory tree is
    # not modified while it is being walked.
    for image_file_path in image_file_paths:
        shutil.move(image_file_path, dest_data_path)



def info_downloaded_train_data(
    full_csv_path: str = "",
    downloaded_train: str = "",
    step: int = 500,
    target_coverage: float = 0.5,
    ids_csv_path=None,
) -> None:
    """Print how many of the most frequent landmarks cover the downloaded images.

    Walks ``downloaded_train`` recursively, treats the base name of every
    ``.jpg`` file (case-insensitive) as an image id, and keeps the rows of the
    label CSV whose ``id`` is among them. It then prints, for the top
    0, ``step``, 2*``step``, ... landmarks, the share of those rows they
    account for, stopping after the first line whose coverage exceeds
    ``target_coverage`` or once the top-k size exceeds the number of landmarks.

    Args:
        full_csv_path: Path to the label CSV; must have ``id`` and
            ``landmark_id`` columns.
        downloaded_train: Root directory containing the downloaded images.
        step: Increment of the number of top landmarks between report lines.
        target_coverage: Fraction (0-1) of images after which reporting stops.
        ids_csv_path: Optional path; when given, the discovered image ids are
            saved there as a one-column (``id``) CSV.

    Raises:
        ValueError: If ``step`` is not positive (the loop would never end).
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    downloaded_img_ids = [
        os.path.splitext(file)[0]
        for _root, _dirs, files in os.walk(downloaded_train)
        for file in files
        if file.lower().endswith('.jpg')
    ]

    if ids_csv_path is not None:
        pd.DataFrame(downloaded_img_ids, columns=['id']).to_csv(ids_csv_path, index=False)
    print(f"Number of imgs present in training data: {len(downloaded_img_ids)}")

    # Read ids as strings so they compare equal to the file-name ids even when
    # an id consists only of digits.
    full_train_data = pd.read_csv(full_csv_path, dtype={"id": str})

    # Select only downloaded img ids
    downloaded_train_data = full_train_data[full_train_data['id'].isin(downloaded_img_ids)]

    # Count rows per landmark, most frequent first
    label_counts = downloaded_train_data.groupby("landmark_id").size().sort_values(ascending=False)

    print(f"Number of unique landmarks in downloaded data: {len(label_counts)}")

    total_samples = downloaded_train_data.shape[0]
    if total_samples == 0:
        # Nothing matched: coverage is undefined (would divide by zero).
        print("No downloaded images matched the label file; nothing to report.")
        return

    min_labels = 0  # The first report line covers the top 0 landmarks
    while min_labels <= len(label_counts):
        # The number of rows belonging to the top-k landmarks is the sum of
        # their counts, so the full frame does not need to be filtered again.
        samples = int(label_counts.head(min_labels).sum())
        coverage = samples / total_samples
        print(f"Top {min_labels} classes, coverage: {round(coverage * 100, 2)}%, with {samples} images")
        if coverage > target_coverage:
            break
        min_labels += step


In [36]:
# Print how many .jpg images are present under the train folder and how much of
# them the most frequent landmark classes cover.
# NOTE(review): this cell reads the labels from data/labels/train.csv, while the
# select_train_labels cell reads data/train/train.csv — confirm which is canonical.
full_train_data = "/content/drive/MyDrive/cs444-final-project/project/data/labels/train.csv"
train_folder = "/content/drive/MyDrive/cs444-final-project/project/data/train"
info_downloaded_train_data(full_csv_path=full_train_data, downloaded_train=train_folder)


Number of imgs present in training data: 537499
Number of unique landmarks in downloaded data: 130173
Top 0 classes, coverage: 0.0%, with 0 images
Top 500 classes, coverage: 7.67%, with 41233 images
Top 1000 classes, coverage: 11.35%, with 60989 images
Top 1500 classes, coverage: 14.22%, with 76442 images
Top 2000 classes, coverage: 16.66%, with 89565 images
Top 2500 classes, coverage: 18.84%, with 101283 images
Top 3000 classes, coverage: 20.83%, with 111971 images
Top 3500 classes, coverage: 22.65%, with 121742 images
Top 4000 classes, coverage: 24.35%, with 130894 images
Top 4500 classes, coverage: 25.94%, with 139453 images
Top 5000 classes, coverage: 27.44%, with 147508 images
Top 5500 classes, coverage: 28.87%, with 155151 images
Top 6000 classes, coverage: 30.22%, with 162455 images
Top 6500 classes, coverage: 31.53%, with 169455 images
Top 7000 classes, coverage: 32.75%, with 176013 images
Top 7500 classes, coverage: 33.95%, with 182496 images
Top 8000 classes, coverage: 35.07%

In [4]:
# Keep the rows of the 2000 most frequent landmarks among the downloaded images
# and write them to selected_train.csv.
# NOTE(review): downloaded_img_ids.csv must already exist on Drive; no cell in
# this notebook calls code that writes it.
downloaded_img_ids = "/content/drive/MyDrive/cs444-final-project/project/data/downloaded_img_ids.csv"
full_train_data = "/content/drive/MyDrive/cs444-final-project/project/data/train/train.csv"
select_train_labels(n=2000, full_csv_path=full_train_data, downloaded_img_ids=downloaded_img_ids)


Size of selected_train_data: 89565


In [8]:
# Move (not copy) the images listed in selected_train.csv from data/train into
# data/selected_train. This changes the contents of both folders on Drive.
selected_train_data = "/content/drive/MyDrive/cs444-final-project/project/data/train/selected_train.csv"
src_data_path = "/content/drive/MyDrive/cs444-final-project/project/data/train"
dest_data_path = "/content/drive/MyDrive/cs444-final-project/project/data/selected_train"
gather_train_data(selected_train_data=selected_train_data, src_data_path=src_data_path, dest_data_path=dest_data_path)


Length of selected train data img ids: 89565
Length of selected train data files: 89565
Moving these data to /content/drive/MyDrive/cs444-final-project/project/data/selected_train...


In [5]:
# Count the .jpg files directly inside the selected_train folder (the glob
# pattern is not recursive and is case-sensitive on most platforms).
# NOTE(review): the pattern is an absolute path on a local machine, not the
# /content/drive location used by the other cells — confirm which copy of the
# data this count is meant to describe before re-running.
import glob
num_jpg_files = len(glob.glob('/Users/ruipenghan/Desktop/Academics/11. SP 2024/CS 444/project/data/selected_train/*.jpg'))
print(f"Number of JPG files: {num_jpg_files}")


Number of JPG files: 7083
