## Download the Dataset Images to Google Drive

To work with TorchVision Faster R-CNN and YOLO on Google Colab, we first need to download the dataset images to Google Drive.

- Create a folder "**CrackDetection**" in your Google Drive ([My Drive](https://drive.google.com/drive/u/0/my-drive))
- Upload this notebook and the dataset JSON file '**SeamTaping_All_bbox_annotations.json**' or '**WRB_All_bbox_annotations.json**' from your local machine to the Google Drive folder "**CrackDetection**" you just created
- Then run this notebook. 
    * When running it for the first time, you need to allow Colab to access your Google Drive.
    * Before you start running, please check the "project_name" variable setting.

In [None]:
# Select the dataset to download: keep exactly one of the assignments active.
# The value is interpolated into the annotation JSON file name and the image
# folder name used by the following cell.
# project_name = "SeamTaping"
project_name = "WRB"

In [1]:
import json
import os
import requests
from tqdm import tqdm
import time

# Make Google Drive available under /content/gdrive (remounting every run)
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Destination folder for the downloaded images of the selected project
image_folder = f'/content/gdrive/MyDrive/CrackDetection/{project_name}_images'

# Create the destination folder on first use, otherwise just report it
if os.path.exists(image_folder):
    print(f"Directory already exists: {image_folder}")
else:
    os.makedirs(image_folder)
    print(f"Created directory: {image_folder}")

# Read the selected project's annotation JSON into memory
dataset_json_path = f'/content/gdrive/MyDrive/CrackDetection/{project_name}_All_bbox_annotations.json'
with open(dataset_json_path, 'r') as f:
    dataset = json.load(f)

# Function to download image from URL with retries
def download_image(url, save_path, retries=3, retry_delay=2, timeout=30):
    """Download ``url`` to ``save_path``, retrying on failure.

    The body is streamed into a temporary ``<save_path>.part`` file that is
    moved into place only once the whole response has been received. An
    interrupted transfer therefore never leaves a truncated file at
    ``save_path`` that a later "file already exists" check would mistake for
    a finished download.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path.
        retries: Maximum number of attempts.
        retry_delay: Seconds to wait between two consecutive attempts.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled connection fails (and is retried) instead of hanging.

    Returns:
        True if the file was written, False if every attempt failed. Errors
        are reported with ``print`` and never raised to the caller.
    """
    partial_path = f"{save_path}.part"

    for attempt in range(1, retries + 1):
        try:
            # The context manager releases the streamed connection even when
            # the status is not 200 or writing fails midway.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code == 200:
                    with open(partial_path, 'wb') as file:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                file.write(chunk)
                    # Publish the file only after it is complete.
                    os.replace(partial_path, save_path)
                    return True
                print(f"Failed to download image from {url}. Status code: {response.status_code}")
        except Exception as e:
            # Deliberately broad: this is a best-effort batch helper whose
            # contract is to report the problem and return False.
            print(f"Exception occurred while downloading image from {url}: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written yet, or the leftover cannot be removed;
                # either way the next attempt overwrites it.
                pass

        # Waiting only makes sense when another attempt follows.
        if attempt < retries:
            time.sleep(retry_delay)

    print(f"Failed to download image from {url} after {retries} attempts.")
    return False

# Walk through every dataset entry, showing progress with tqdm
for data in tqdm(dataset):
    image_urls = data['image_urls']
    image_file_name = data['image_file_name']
    image_save_path = os.path.join(image_folder, image_file_name)

    if not os.path.exists(image_save_path):
        # Only the first URL of an entry is ever used
        success = download_image(image_urls[0], image_save_path)
        if not success:
            print(f"Failed to download {image_file_name} from {image_urls[0]}")
    else:
        # Already present from an earlier run: nothing to fetch
        print(f"Skipping download of {image_file_name}. File already exists.")

# Tally the regular files in the folder that carry an image extension
num_images = sum(
    1
    for name in os.listdir(image_folder)
    if os.path.isfile(os.path.join(image_folder, name))
    and name.lower().endswith(('.png', '.jpg', '.jpeg'))
)
print(f"\nTotal number of image files in {image_folder}: {num_images}")

Mounted at /content/gdrive
Created directory: /content/gdrive/MyDrive/CrackDetection/images


100%|██████████| 2131/2131 [1:45:07<00:00,  2.96s/it]



Total number of image files in /content/gdrive/MyDrive/CrackDetection/images: 2131


In [2]:
# Re-count the image files (.png / .jpg / .jpeg, case-insensitive) in the folder
image_names = [
    name
    for name in os.listdir(image_folder)
    if os.path.isfile(os.path.join(image_folder, name))
    and name.lower().endswith(('.png', '.jpg', '.jpeg'))
]
num_images = len(image_names)
print(f"\nTotal number of image files in {image_folder}: {num_images}")


Total number of image files in /content/gdrive/MyDrive/CrackDetection/images: 2131
