In [45]:
import pandas as pd
import os
import shutil
import sys
import json
from tqdm.notebook import tqdm

In [46]:
# Root directory that is walked recursively when looking for CSV and JPG files.
FOLDER_PATH = "./"

# Directory that receives the exported files. Note that with these values it
# lies inside FOLDER_PATH, so anything written here sits within the tree that
# gets scanned. Keep the trailing slash: code in this notebook may build file
# paths from this value by plain string concatenation.
OUTPUT_PATH = "./output/"

In [47]:
def find_files(extension, folder):
    """Recursively collect the files below ``folder`` whose name ends with ``extension``.

    Args:
        extension: Suffix to match against each file name (e.g. ``".csv"``).
            The comparison is case-sensitive.
        folder: Directory at which the recursive walk starts.

    Returns:
        A list of paths (directory joined with file name), in the order in
        which ``os.walk`` yields them.
    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.endswith(extension)
    ]

    # Report the hit count so the notebook output documents the dataset size.
    print(f"Found {len(matches)} files with extension {extension}")
    return matches

In [48]:
# Make sure the export directory exists. ``exist_ok=True`` keeps the cell safe
# to re-run and avoids the gap between a separate existence check and the
# creation call.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [49]:
# OUTPUT_PATH may be located inside FOLDER_PATH (it is with the defaults
# "./output/" and "./"). Without filtering, a second run of the notebook would
# pick up the previously exported copies as if they were source files.
# ``os.path.join(..., "")`` appends a trailing separator so that a sibling
# directory such as "output_old" is not mistaken for the output directory.
_output_root = os.path.join(os.path.abspath(OUTPUT_PATH), "")


def _outside_output_dir(path):
    """Return True when ``path`` is not located below OUTPUT_PATH."""
    return not os.path.abspath(path).startswith(_output_root)


# The counts printed by find_files include files under OUTPUT_PATH; the lists
# kept here do not.
all_csv_files = [p for p in find_files(".csv", FOLDER_PATH) if _outside_output_dir(p)]
all_jpg_files = [p for p in find_files(".jpg", FOLDER_PATH) if _outside_output_dir(p)]

Found 216 files with extension .csv
Found 1530160 files with extension .jpg


In [50]:
# Keep every CSV whose path contains the substring "raw" (the match is made
# against the whole path, not only the file name).
all_raw_csv_files = list(filter(lambda csv_path: "raw" in csv_path, all_csv_files))
print(len(all_raw_csv_files))

# Preview the first few matches of each kind as a quick sanity check.
for sample_csv in all_raw_csv_files[:5]:
    print(sample_csv)

print("-" * 100)

for sample_jpg in all_jpg_files[:5]:
    print(sample_jpg)

48
./MetaData/train_val/zurich/database/raw.csv
./MetaData/train_val/zurich/query/raw.csv
./MetaData/train_val/paris/database/raw.csv
./MetaData/train_val/paris/query/raw.csv
./MetaData/train_val/austin/database/raw.csv
----------------------------------------------------------------------------------------------------
./Images_2/test/buenosaires/database/images/5r3GmJocWqaYyB0Nzb18gQ.jpg
./Images_2/test/buenosaires/database/images/i48_o76TcCSyNVtMGQDzgQ.jpg
./Images_2/test/buenosaires/database/images/zKrhEYJIE6H_8NgsCKhFSw.jpg
./Images_2/test/buenosaires/database/images/pkcbr32M4hCwFytTU5gt3w.jpg
./Images_2/test/buenosaires/database/images/4r_UbfAceGkdGor_wxOjJg.jpg


In [51]:
# Source paths of every image that was matched to a CSV row, across all calls.
processes_jpg = []

# Lazily built lookup "file stem -> JPG path", together with a fingerprint of
# the list it was built from so that re-running the discovery cell (which
# rebinds all_jpg_files) triggers a rebuild.
_jpg_index = {}
_jpg_index_source = None


def _get_jpg_index():
    """Return a dict mapping each JPG's file stem to its path, building it on demand."""
    global _jpg_index, _jpg_index_source

    source = (id(all_jpg_files), len(all_jpg_files))
    if _jpg_index_source != source:
        index = {}
        for path in all_jpg_files:
            stem = os.path.splitext(os.path.basename(path))[0]
            # setdefault keeps the first occurrence, mirroring the previous
            # "first match in list order wins" behaviour for duplicate names.
            index.setdefault(stem, path)
        _jpg_index = index
        _jpg_index_source = source
    return _jpg_index


def process_csv_file(file):
    """Export one JSON metadata file and one JPG copy per row of a CSV file.

    The CSV is expected to provide the columns ``key``, ``lat`` and ``lon``.
    For every row, the image whose file name (without extension) equals
    ``key`` is looked up among ``all_jpg_files``. When found, a
    ``<key>.json`` file holding the coordinates (plus empty placeholder
    fields) and a ``<key>.jpg`` copy of the image are written to
    ``OUTPUT_PATH``; targets that already exist are reported and left
    untouched. The source path of every matched image is appended to
    ``processes_jpg``.

    The lookup uses a dictionary instead of scanning the full JPG list for
    every row, and matches the exact file stem rather than any substring of
    the path, so a key can no longer match an unrelated file by accident.

    Args:
        file: Path of the CSV file to process.

    Raises:
        KeyError: If the CSV lacks one of the ``key``, ``lat`` or ``lon``
            columns.
    """
    df = pd.read_csv(file)
    jpg_index = _get_jpg_index()

    # Iterating the three needed columns in lockstep avoids building a Series
    # object per row, which is what DataFrame.iterrows does.
    rows = zip(df["key"], df["lat"], df["lon"])
    for key, lat, lon in tqdm(rows, total=len(df), desc=f"Processing {file}"):
        # Keys that pandas parsed as non-strings are normalised so they can be
        # used for the lookup and for building file names.
        image_name = str(key)

        matched_image = jpg_index.get(image_name)
        if matched_image is None:
            print(f"Could not find image for {image_name}")
            continue

        json_data = {
            "img_path": "",
            "city": "",
            "country": "",
            "continent": "",
            "lat": lat,
            "lon": lon,
        }

        json_path = os.path.join(OUTPUT_PATH, image_name + ".json")
        if os.path.exists(json_path):
            print(f"File {json_path} already exists. Skipping")
        else:
            # The context manager guarantees the handle is flushed and closed.
            with open(json_path, "w") as json_file:
                json.dump(json_data, json_file)

        jpg_path = os.path.join(OUTPUT_PATH, image_name + ".jpg")
        if os.path.exists(jpg_path):
            print(f"File {jpg_path} already exists. Skipping")
        else:
            shutil.copy(matched_image, jpg_path)

        processes_jpg.append(matched_image)




In [52]:
# Empty the bookkeeping list in place so that re-running this cell does not
# add to the entries recorded by an earlier run.
processes_jpg.clear()

for file in all_raw_csv_files:
    process_csv_file(file)

print(f"Processed {len(all_raw_csv_files)} files")
print(f"Processed {len(processes_jpg)} images")
# This figure is the number of JPGs discovered on disk, not the number
# processed, so it is labelled accordingly.
print(f"Found {len(all_jpg_files)} images in total")

Processing ./MetaData/train_val/zurich/database/raw.csv:   0%|          | 0/2991 [00:00<?, ?it/s]

Processing ./MetaData/train_val/zurich/query/raw.csv:   0%|          | 0/2193 [00:00<?, ?it/s]

Processing ./MetaData/train_val/paris/database/raw.csv:   0%|          | 0/9503 [00:00<?, ?it/s]

Processing ./MetaData/train_val/paris/query/raw.csv:   0%|          | 0/8480 [00:00<?, ?it/s]

Processing ./MetaData/train_val/austin/database/raw.csv:   0%|          | 0/28462 [00:00<?, ?it/s]

Processing ./MetaData/train_val/austin/query/raw.csv:   0%|          | 0/14222 [00:00<?, ?it/s]

Processing ./MetaData/train_val/melbourne/database/raw.csv:   0%|          | 0/101827 [00:00<?, ?it/s]

Processing ./MetaData/train_val/melbourne/query/raw.csv:   0%|          | 0/88118 [00:00<?, ?it/s]

Processing ./MetaData/train_val/moscow/database/raw.csv:   0%|          | 0/171878 [00:00<?, ?it/s]

Processing ./MetaData/train_val/moscow/query/raw.csv:   0%|          | 0/77496 [00:00<?, ?it/s]

Processing ./MetaData/train_val/cph/database/raw.csv:   0%|          | 0/12601 [00:00<?, ?it/s]

Processing ./MetaData/train_val/cph/query/raw.csv:   0%|          | 0/6595 [00:00<?, ?it/s]

Processing ./MetaData/train_val/helsinki/database/raw.csv:   0%|          | 0/33248 [00:00<?, ?it/s]

Processing ./MetaData/train_val/helsinki/query/raw.csv:   0%|          | 0/15228 [00:00<?, ?it/s]

Processing ./MetaData/train_val/amman/database/raw.csv:   0%|          | 0/953 [00:00<?, ?it/s]

Processing ./MetaData/train_val/amman/query/raw.csv:   0%|          | 0/835 [00:00<?, ?it/s]

Processing ./MetaData/train_val/manila/database/raw.csv:   0%|          | 0/6064 [00:00<?, ?it/s]

Processing ./MetaData/train_val/manila/query/raw.csv:   0%|          | 0/5378 [00:00<?, ?it/s]

Processing ./MetaData/train_val/tokyo/database/raw.csv:   0%|          | 0/34823 [00:00<?, ?it/s]

Processing ./MetaData/train_val/tokyo/query/raw.csv:   0%|          | 0/26310 [00:00<?, ?it/s]

Processing ./MetaData/train_val/phoenix/database/raw.csv:   0%|          | 0/106221 [00:00<?, ?it/s]

Processing ./MetaData/train_val/phoenix/query/raw.csv:   0%|          | 0/50243 [00:00<?, ?it/s]

Processing ./MetaData/train_val/goa/database/raw.csv:   0%|          | 0/5722 [00:00<?, ?it/s]

Processing ./MetaData/train_val/goa/query/raw.csv:   0%|          | 0/5362 [00:00<?, ?it/s]

Processing ./MetaData/train_val/nairobi/database/raw.csv:   0%|          | 0/437 [00:00<?, ?it/s]

Processing ./MetaData/train_val/nairobi/query/raw.csv:   0%|          | 0/427 [00:00<?, ?it/s]

Processing ./MetaData/train_val/trondheim/database/raw.csv:   0%|          | 0/5015 [00:00<?, ?it/s]

Processing ./MetaData/train_val/trondheim/query/raw.csv:   0%|          | 0/4136 [00:00<?, ?it/s]

Processing ./MetaData/train_val/sf/database/raw.csv:   0%|          | 0/6315 [00:00<?, ?it/s]

Processing ./MetaData/train_val/sf/query/raw.csv:   0%|          | 0/4525 [00:00<?, ?it/s]

Processing ./MetaData/train_val/boston/database/raw.csv:   0%|          | 0/14024 [00:00<?, ?it/s]

Processing ./MetaData/train_val/boston/query/raw.csv:   0%|          | 0/6724 [00:00<?, ?it/s]

Processing ./MetaData/train_val/budapest/database/raw.csv:   0%|          | 0/153321 [00:00<?, ?it/s]

Processing ./MetaData/train_val/budapest/query/raw.csv:   0%|          | 0/45800 [00:00<?, ?it/s]

Processing ./MetaData/train_val/toronto/database/raw.csv:   0%|          | 0/12789 [00:00<?, ?it/s]

Processing ./MetaData/train_val/toronto/query/raw.csv:   0%|          | 0/7352 [00:00<?, ?it/s]

Processing ./MetaData/train_val/london/database/raw.csv:   0%|          | 0/3291 [00:00<?, ?it/s]

Processing ./MetaData/train_val/london/query/raw.csv:   0%|          | 0/2692 [00:00<?, ?it/s]

Processing ./MetaData/train_val/berlin/database/raw.csv:   0%|          | 0/42965 [00:00<?, ?it/s]

Processing ./MetaData/train_val/berlin/query/raw.csv:   0%|          | 0/28197 [00:00<?, ?it/s]

Processing ./MetaData/train_val/amsterdam/database/raw.csv:   0%|          | 0/11539 [00:00<?, ?it/s]

Processing ./MetaData/train_val/amsterdam/query/raw.csv:   0%|          | 0/7893 [00:00<?, ?it/s]

Processing ./MetaData/train_val/saopaulo/database/raw.csv:   0%|          | 0/35096 [00:00<?, ?it/s]

Processing ./MetaData/train_val/saopaulo/query/raw.csv:   0%|          | 0/18989 [00:00<?, ?it/s]

Processing ./MetaData/train_val/ottawa/database/raw.csv:   0%|          | 0/69756 [00:00<?, ?it/s]

Processing ./MetaData/train_val/ottawa/query/raw.csv:   0%|          | 0/53517 [00:00<?, ?it/s]

Processing ./MetaData/train_val/bangkok/database/raw.csv:   0%|          | 0/74620 [00:00<?, ?it/s]

Processing ./MetaData/train_val/bangkok/query/raw.csv:   0%|          | 0/40125 [00:00<?, ?it/s]

Processed 48 files
Processed 1464298 images
Processed 1530160 images
