In [33]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
import random
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed

#make_subplots 
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt

import os
import sys
import json

from tqdm.notebook import tqdm

In [34]:
# Root folders of the dataset splits used by the cells below.
TRAIN_DATA_FOLDER = r"/Volumes/DATASET/Train"
TEST_DATA_FOLDER = "/Volumes/DATASET/Test"

# The Mapbox token is read from a local file rather than being written in the
# notebook; surrounding whitespace (e.g. a trailing newline) is stripped.
with open("./.apiTokenMapBox", mode="r") as f:
    MAPBOX_TOKEN = f.read().strip()

In [35]:
def getPos(json_path):
    """Read the coordinates stored in a JSON file.

    Args:
        json_path: Path to a JSON file whose top-level object has the keys
            "lat" and "lon".

    Returns:
        A new dict with exactly the keys "lat" and "lon", carrying the values
        found in the file. Any other keys in the file are ignored.

    Raises:
        KeyError: If "lat" or "lon" is absent from the parsed object.
    """
    with open(json_path) as handle:
        record = json.load(handle)

    latitude = record["lat"]
    longitude = record["lon"]
    return dict(lat=latitude, lon=longitude)

In [36]:
def search_files_in_subdirectory(subdirectory, extension):
    """Recursively collect files under `subdirectory` whose name ends with `extension`.

    Args:
        subdirectory: Directory to walk (all nested levels are visited).
        extension: Suffix matched against each file name with `str.endswith`
            (the match is case-sensitive).

    Returns:
        A list of paths, each built by joining the containing directory (as
        reported by `os.walk`) with the file name. Empty when nothing matches.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(subdirectory)
        for name in names
        if name.endswith(extension)
    ]

def search_files_in_directory(directory,extension, num_threads=8):
    """Find every file under `directory` whose name ends with `extension`.

    Each immediate subdirectory is walked recursively in its own thread-pool
    task (via `search_files_in_subdirectory`). Files that sit directly inside
    `directory` are matched as well, so a flat directory without
    subdirectories is handled too.

    Args:
        directory: Root directory to search.
        extension: File-name suffix to match (case-sensitive `str.endswith`).
        num_threads: Maximum number of worker threads used for the
            per-subdirectory walks.

    Returns:
        A list of matching paths: top-level matches first, followed by the
        matches of each subdirectory in directory-listing order. The order
        does not depend on which worker thread finishes first.
    """
    # Single pass over the top level: separate subdirectories (searched in
    # parallel below) from matching files that live directly in `directory`.
    subdirectories = []
    top_level_files = []
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_dir():
                subdirectories.append(entry.path)
            elif entry.name.endswith(extension):
                top_level_files.append(entry.path)

    files_by_subdirectory = {}
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {
            executor.submit(search_files_in_subdirectory, subdirectory, extension): subdirectory
            for subdirectory in subdirectories
        }
        # as_completed drives the progress bar; results are keyed by their
        # subdirectory so they can be re-assembled in a stable order afterwards.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Searching subdirectories"):
            files_by_subdirectory[futures[future]] = future.result()

    all_files = list(top_level_files)
    for subdirectory in subdirectories:
        all_files.extend(files_by_subdirectory[subdirectory])

    print(f"Found {len(all_files)} files in {directory} with extension {extension}")
    return all_files

In [57]:
# Consistency check of the test split: every .jpg should have a .json file
# with the same path stem next to it.
all_test_json_files = search_files_in_directory(TEST_DATA_FOLDER, ".json")
all_test_jpg_files = search_files_in_directory(TEST_DATA_FOLDER, ".jpg")
# 5105 + 474 = 5579

# Set for constant-time membership tests instead of scanning the list once per image.
test_json_lookup = set(all_test_json_files)

missing_json_files = []
for img_path in all_test_jpg_files:
    # Swap only the trailing ".jpg" (every path in this list ends with it);
    # str.replace would also rewrite a ".jpg" occurring elsewhere in the path.
    json_path = img_path[:-len(".jpg")] + ".json"
    if json_path not in test_json_lookup:
        missing_json_files.append(img_path)

# Image paths containing "._" anywhere in the path are counted as meta files.
meta_files = [f for f in all_test_jpg_files if "._" in f]


print(f"Found {len(all_test_json_files)} json files")
print(f"Found {len(all_test_jpg_files)} jpg files")
print(f"Found a total of {len(all_test_json_files)+ len(all_test_jpg_files)} files")
print(f"Found {len(missing_json_files)} missing json files")
print(f"Found {len(meta_files)} meta files")

# Images without a JSON partner that are not explained by being meta files.
meta_lookup = set(meta_files)
unknown_files = [f for f in missing_json_files if f not in meta_lookup]
print(f"Found {len(unknown_files)} unknown files")
unknown_files


Searching subdirectories:   0%|          | 0/2 [00:00<?, ?it/s]

Found 3234 files in /Volumes/DATASET/Test with extension .json


Searching subdirectories:   0%|          | 0/2 [00:00<?, ?it/s]

Found 3234 files in /Volumes/DATASET/Test with extension .jpg
Found 3234 json files
Found 3234 jpg files
Found a total of 6468 files
Found 0 missing json files
Found 0 meta files
Found 0 unknown files


[]

In [60]:
# Scan the training split once per extension; images are stored under both
# the ".jpg" and the ".jpeg" extension, so both are collected.
all_train_json_files = search_files_in_directory(TRAIN_DATA_FOLDER, ".json")
all_train_jpg_files = search_files_in_directory(TRAIN_DATA_FOLDER, ".jpg")
all_train_jpeg_files = search_files_in_directory(TRAIN_DATA_FOLDER, ".jpeg")

print(f"Found {len(all_train_json_files)} json files")
print(f"Found {len(all_train_jpg_files)} jpg files")
# Label corrected from the misspelt "jpge".
print(f"Found {len(all_train_jpeg_files)} jpeg files")
print(f"Found a total of {len(all_train_json_files)+ len(all_train_jpg_files) + len(all_train_jpeg_files)} files")

Searching subdirectories:   0%|          | 0/1525 [00:00<?, ?it/s]

Found 1524444 files in /Volumes/DATASET/Train with extension .json


Searching subdirectories:   0%|          | 0/1525 [00:00<?, ?it/s]

Found 1464298 files in /Volumes/DATASET/Train with extension .jpg


Searching subdirectories:   0%|          | 0/1525 [00:00<?, ?it/s]

Found 60146 files in /Volumes/DATASET/Train with extension .jpeg
Found 1524444 json files
Found 1464298 jpg files
Found 60146 jpge files
Found a total of 3048888 files


In [61]:
# Side-by-side counts: JSON files versus all images (.jpg and .jpeg combined).
print(
    f"Found {len(all_train_json_files)} json files",
    f"Found {len(all_train_jpg_files) + len(all_train_jpeg_files)} image files",
    sep="\n",
)

Found 1524444 json files
Found 1524444 image files
