In [None]:
# Mount Google Drive inside the Colab VM; the download path used later in this
# notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install boto3

Collecting boto3
  Downloading boto3-1.28.57-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.32.0,>=1.31.57 (from boto3)
  Downloading botocore-1.31.57-py3-none-any.whl (11.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.8.0,>=0.7.0 (from boto3)
  Downloading s3transfer-0.7.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.32.0,>=1.31.57->boto3)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m10.6 MB/s[0m eta [36

In [None]:
import os
from datetime import datetime, timedelta
from getpass import getpass

import boto3
from tqdm import tqdm

# Provide AWS credentials through environment variables without hard-coding
# them in the notebook.
# - A value that is already set in the environment is left untouched.
# - A missing value is prompted for with hidden input, so the secret is never
#   stored in the saved notebook or its outputs.
# - Leaving the prompt blank keeps the variable unset.
for _credential_var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    if not os.environ.get(_credential_var):
        _entered_value = getpass(f'{_credential_var}: ')
        if _entered_value:
            os.environ[_credential_var] = _entered_value

In [None]:
class S3DataDownloader:
    """Downloads objects from an S3 bucket into a local folder tree.

    Objects are grouped by the date part of their ``LastModified`` timestamp
    and stored as ``download_path/<date>/<fish name>/<fresh type>/<file>``,
    where the fish name and fresh type are taken from the last two
    ``_``-separated parts of the object's file name.

    Note that creating an instance immediately lists the whole bucket.
    """

    # Canonical folder name -> spellings that are folded into it.
    # NOTE(review): the 'White snapar' entry maps the spelling 'White snapper'
    # onto 'White snapar', which looks inverted. It is kept as-is so existing
    # download folders keep their names - confirm before changing.
    _SPECIES_SPELLINGS = {
        'Basa' : ['Basa', 'Basaa'],
        'Are' : ['Ar', 'Are'],
        'Barracuda' : ['Barcoda', 'Barkoda', 'Barracoda', 'Barracuda'],
        'Bolo' : ['Bolo', 'Bulo'],
        'Sea bass' : ['C boss', 'C boos', 'Siba'],
        'Chara pona' : ['Chara pana'],
        'Emperor' : ['Comprel', 'Emperor', 'Emporwel', 'Empowel',
                     'M perl', 'M preal'],
        'Demo' : ['Demo', 'Demo2', 'Test', 'Trial'],
        'Hilsa' : ['Hilsa', 'Hilis'],
        'Catla' : ['Katala', 'Katalaa', 'Katla'],
        'Croaker' : ['Kokor', 'Croaker'],
        'Lady' : ['Lady', 'Ledi'],
        'Malabar trevally' : ['Mabar tavili', 'Malbhot',
                              'Trvili', 'Trevally'],
        'Needle' : ['Needale', 'Nidal', 'Nidil'],
        'Parsi' : ['Parci'],
        'Pearl spot' : ['(bloch,', 'Bloch,', 'Bloch',
                        'Pearl spot', 'Pearls spot',
                        'Green chromide'],
        'Shol' : ['Sholo'],
        'Snapper' : ['Sinper', 'Sniper'],
        'White snapar' : ['White snapper'],
    }

    # Reverse index (spelling -> canonical name), built once instead of
    # scanning the table for every object. No spelling appears under two
    # canonical names, so the inversion is unambiguous.
    _SPECIES_LOOKUP = {
        spelling: canonical
        for canonical, spellings in _SPECIES_SPELLINGS.items()
        for spelling in spellings
    }

    def __init__(self, bucket_name, download_path, s3_client=None):
        """
        Initializes the S3DataDownloader instance and indexes the bucket.

        Args:
            bucket_name (str): The name of the S3 bucket.
            download_path (str): The local directory to download files to.
            s3_client: Optional pre-built S3 client. When omitted, a client
                is created with ``boto3.client('s3')``.
        """
        self.bucket_name = bucket_name
        self.download_path = download_path
        if s3_client is None:
            s3_client = boto3.client('s3')
        self.s3_client = s3_client
        # Lists the whole bucket up front; every download method reads this.
        self.grouped_objects = self.group_objects_by_date()

    def list_s3_objects(self, bucket_name=None):
        """
        Lists all objects in the specified S3 bucket.

        Args:
            bucket_name (str): The name of the S3 bucket. Defaults to the
                bucket this instance was created for.

        Returns:
            list: The ``Contents`` entries of every result page, in order.
        """
        if bucket_name is None:
            bucket_name = self.bucket_name
        object_list = []
        # A paginator is used so that every result page is visited.
        paginator = self.s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name):
            # A page without objects has no 'Contents' entry.
            object_list.extend(page.get('Contents', []))
        print("\nlist_s3_objects done")
        return object_list

    def group_objects_by_date(self):
        """
        Lists all objects in the S3 bucket & groups them by their last modified date.

        Returns:
            dict: Maps ``str(LastModified.date())`` to the list of object
            keys modified on that date, ordered by ascending date string.
        """
        grouped_objects = {}
        for obj in self.list_s3_objects(bucket_name=self.bucket_name):
            date = str(obj['LastModified'].date())
            grouped_objects.setdefault(date, []).append(obj['Key'])

        # Rebuild the dict in sorted key order so iteration is chronological.
        grouped_objects = {date: grouped_objects[date]
                           for date in sorted(grouped_objects)}
        print("group_objects_by_date done\n")
        return grouped_objects

    def replace_misspelled_folder_names(self, species_name):
        """
        Maps a known spelling of a species name onto its canonical folder name.

        Args:
            species_name (str): Species name as extracted from a file name.

        Returns:
            str: The canonical name if the spelling is known, otherwise
            ``species_name`` unchanged.
        """
        return self._SPECIES_LOOKUP.get(species_name, species_name)

    @staticmethod
    def _parse_image_name(image_name):
        """
        Extracts the fish name and fresh type from a file name.

        The last two ``_``-separated parts are used; the fresh type is cut at
        its first ``.``. Both parts are stripped of surrounding whitespace
        before being capitalized, so stray spaces do not prevent
        capitalization or later spelling lookups.

        Args:
            image_name (str): File name without any directory part.

        Returns:
            tuple | None: ``(fish_name, fresh_type)``, or ``None`` when the
            name contains no ``_`` separator.
        """
        parts = image_name.split('_')[-2:]
        if len(parts) != 2:
            return None
        fish_name, fresh_type = parts
        fresh_type = fresh_type.split('.')[0]
        return fish_name.strip().capitalize(), fresh_type.strip().capitalize()

    def download_data(self, date, keys):
        """
        Downloads the given objects into the folder tree for one date.

        Files land in ``download_path/date/fish_name/fresh_type/``. Objects
        whose file name cannot be parsed are skipped and counted; files that
        already exist locally are not downloaded again.

        Args:
            date (str): Date label used as the top-level folder name.
            keys (list): S3 object keys to download.
        """
        # Create a directory for the date if it doesn't exist
        date_directory = os.path.join(self.download_path, date)
        os.makedirs(date_directory, exist_ok=True)

        skipped = 0
        for key in tqdm(keys, desc=f'Downloading {date} data'):
            image_name = key.split('/')[-1]
            parsed = self._parse_image_name(image_name)
            if parsed is None:
                skipped += 1
                continue
            fish_name, fresh_type = parsed
            fish_name = self.replace_misspelled_folder_names(fish_name)

            # Directory structure: date_folder/fish_name_folder/fresh_type_folder
            fresh_directory = os.path.join(date_directory, fish_name, fresh_type)
            os.makedirs(fresh_directory, exist_ok=True)

            file_name = os.path.join(fresh_directory, image_name)

            # Download the object only if it doesn't already exist locally
            if not os.path.exists(file_name):
                self.s3_client.download_file(self.bucket_name, key, file_name)

        if skipped:
            # Report skipped objects instead of dropping them silently.
            print(f"Skipped {skipped} object(s) for {date}: "
                  "file name has no '_'-separated fish name / fresh type")

    def download_daily_data(self):
        """Downloads every object in the bucket, one date group at a time."""
        for date, keys in self.grouped_objects.items():
            self.download_data(date, keys)

    def download_specific_date_data(self, specific_date):
        """
        Downloads the objects last modified on a single date.

        Args:
            specific_date (str): Date string in the same form as the keys of
                ``grouped_objects`` (e.g. '2023-09-09'). A message is printed
                and nothing is downloaded if no objects exist for that date.
        """
        if specific_date not in self.grouped_objects:
            print(f"Data is not collected on : {specific_date}")
            return
        self.download_data(specific_date, self.grouped_objects[specific_date])

    def download_weekly_data(self, start_date, end_date):
        """
        Downloads the objects last modified within an inclusive date range.

        Args:
            start_date (str): First date to include, as a 'YYYY-MM-DD' string.
            end_date (str): Last date to include, as a 'YYYY-MM-DD' string.
        """
        # Dates are compared as strings, which orders correctly only for the
        # zero-padded YYYY-MM-DD form.
        for date, keys in self.grouped_objects.items():
            if start_date <= date <= end_date:
                self.download_data(date, keys)


In [None]:
# Example usage: point the downloader at the bucket and at a Drive folder.
bucket_name = 'fish-data-collection'
download_path = "/content/drive/MyDrive/Sowmya /qZense Dataset/S3 Data/Daily Data"

data_downloader = S3DataDownloader(bucket_name, download_path)

# All dates below are strings in YYYY-MM-DD format.
#
# Alternative 1 - download every date found in the bucket:
#     data_downloader.download_daily_data()
#
# Alternative 2 - download a single date:
#     specific_date = '2023-09-09'
#     data_downloader.download_specific_date_data(specific_date)

# Download everything between two dates (both ends included).
start_date = '2023-09-24'
end_date = '2023-10-02'
data_downloader.download_weekly_data(start_date, end_date)


list_s3_objects done
group_objects_by_date done



Downloading 2023-09-24 data: 100%|██████████| 197/197 [00:02<00:00, 84.01it/s] 
Downloading 2023-09-26 data: 100%|██████████| 34/34 [00:19<00:00,  1.73it/s]
Downloading 2023-09-28 data: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]
Downloading 2023-09-29 data: 100%|██████████| 211/211 [02:13<00:00,  1.58it/s]
Downloading 2023-09-30 data: 100%|██████████| 222/222 [02:20<00:00,  1.58it/s]
Downloading 2023-10-02 data: 100%|██████████| 114/114 [01:12<00:00,  1.58it/s]
