In [24]:
import pandas as pd
import os
import shutil
import sys
import json
from tqdm.notebook import tqdm

# Process NUS-UAL Data

In [25]:
DATASET_FOLDER_INPUT = "./Dataset_OG"
OUTPUT_FOLDER = "./Dataset"
VALID_CSV_FILE_HEADER_REQUIREMENTS = ['img_path','city', 'country', 'continent', 'lat', 'lon']

In [26]:
def find_files(extension, folder):
    files_found = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith(extension):
                files_found.append(os.path.join(root, file))

    print(f"Found {len(files_found)} files with extension {extension}")
    return files_found


# Call the function for csv and jpg files
all_csv_files_in_folder = find_files('.csv', DATASET_FOLDER_INPUT)
all_images_in_folder = find_files('.jpg', DATASET_FOLDER_INPUT) + \
                       find_files('.jpeg', DATASET_FOLDER_INPUT) + \
                       find_files('.png', DATASET_FOLDER_INPUT)

Found 39 files with extension .csv
Found 0 files with extension .jpg
Found 60146 files with extension .jpeg
Found 0 files with extension .png


In [36]:
all_images_in_folder[:5]

['./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/bbaee6bb-e060-4a66-bba3-c8b9d388717f.jpeg',
 './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/70c4f1d8-4dcb-4bde-9211-db7e698c124c.jpeg',
 './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/ea945e24-af6a-411e-be6b-172745e53174.jpeg',
 './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/9de405bc-7def-4404-8a52-e3f09d2b16be.jpeg',
 './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/c0cde45d-d6eb-4817-a011-d83c7b2e0514.jpeg']

In [41]:
relative_path_images = ["img/" + file.split("/")[-2]+ "/" + file.split("/")[-1] for file in all_images_in_folder]
print(f"Trimmed {len(relative_path_images)}/{len(all_images_in_folder)} images")

rel_to_full_mapping = {rel: full for rel, full in zip(relative_path_images, all_images_in_folder)}

print(f"Mapping from relative to full path for images {len(rel_to_full_mapping)}")
rel_to_full_mapping

Trimmed 60146/60146 images
Mapping from relative to full path for images 60146


{'img/6/bbaee6bb-e060-4a66-bba3-c8b9d388717f.jpeg': './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/bbaee6bb-e060-4a66-bba3-c8b9d388717f.jpeg',
 'img/6/70c4f1d8-4dcb-4bde-9211-db7e698c124c.jpeg': './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/70c4f1d8-4dcb-4bde-9211-db7e698c124c.jpeg',
 'img/6/ea945e24-af6a-411e-be6b-172745e53174.jpeg': './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/ea945e24-af6a-411e-be6b-172745e53174.jpeg',
 'img/6/9de405bc-7def-4404-8a52-e3f09d2b16be.jpeg': './Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/img/data 7/yujun/repo/global-streetscapes-internal/data/TRAIN_TEST/img_split/6/9de405bc-7def-4404-8a52-e3f09d2b16be.jpeg',
 'im

In [28]:
removed_doublicates = list(set(relative_path_images))
print(f"Removed {len(relative_path_images) - len(removed_doublicates)} duplicates")

Removed 0 duplicates


In [29]:
valid_csv_files = []
invalid_csv_files = []
for file in all_csv_files_in_folder:
    with open(file, 'r') as f:
        header = f.readline().strip().split(',')
        for requirement in VALID_CSV_FILE_HEADER_REQUIREMENTS:
            if requirement not in header:
                invalid_csv_files.append(file)
                break
        else:
            valid_csv_files.append(file)

print(f"Found {len(valid_csv_files)} valid csv files")
print(f"Found {len(invalid_csv_files)} invalid csv files")

Found 16 valid csv files
Found 23 invalid csv files


In [30]:
print(f"Invalid Files:")
for file in invalid_csv_files:
    print(file)

Invalid Files:
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/cities688.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/info.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/simplemaps.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/perception.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/metadata_mly5.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/metadata_mly4.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/metadata_mly1.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/metadata_mly3.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/osm.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/metadata_mly2.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/places365.csv
./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/data/segmentation.csv
./Dataset_OG/Training/Download_NUS-UAL/global

In [37]:
def process_img_data(img_path, city, country, continent, lat, lon):
    json_data = {
        "img_path": img_path,
        "city": city,
        "country": country,
        "continent": continent,
        "lat": lat,
        "lon": lon
    }

    # move image to output folder
    img_name = img_path.split("/")[-1]
    img_folder = "/".join(img_path.split("/")[:-2])
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    shutil.copyfile(img_path, os.path.join(OUTPUT_FOLDER, img_name))

    # save json data
    json_name = img_name.split(".")[0] + ".json"
    with open(os.path.join(OUTPUT_FOLDER,json_name), 'w') as f:
        json.dump(json_data, f)



In [42]:
processed_images = []
for csv_file in valid_csv_files:
    df = pd.read_csv(csv_file)
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {csv_file}"):
        rel_image_path = row['img_path']
        full_image_path = rel_to_full_mapping[rel_image_path]
        processed_images.append(full_image_path)
        process_img_data(full_image_path, row['city'], row['country'], row['continent'], row['lat'], row['lon'])

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/glare.csv:   0%|     …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/lighting_condition.cs…

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/pano_status.csv:   0%…

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/quality.csv:   0%|   …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/view_direction.csv:  …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/weather.csv:   0%|   …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/platform.csv:   0%|  …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/test/reflection.csv:   0%|…

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/glare.csv:   0%|    …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/lighting_condition.c…

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/pano_status.csv:   0…

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/quality.csv:   0%|  …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/view_direction.csv: …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/weather.csv:   0%|  …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/platform.csv:   0%| …

Processing ./Dataset_OG/Training/Download_NUS-UAL/global-streetscapes/manual_labels/train/reflection.csv:   0%…

In [43]:
print(f"Processed {len(processed_images)} images")
print(f"Could not process {len(all_images_in_folder) - len(processed_images)} images")

Processed 131945 images
Could not process -71799 images
