In [1]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

Collecting google-api-python-client
  Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl.metadata (6.7 kB)
Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl (12.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-api-python-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.137.0
    Uninstalling google-api-python-client-2.137.0:
      Successfully uninstalled google-api-python-client-2.137.0
Successfully installed google-api-python-client-2.144.0


In [3]:
import os
import pickle
import google.auth
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# If modifying these SCOPES, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/drive']

def authenticate_drive():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        # else:
        #     flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        #     creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds

creds = authenticate_drive()
drive_service = build('drive', 'v3', credentials=creds)

In [4]:
def empty_trash(drive_service):
    try:
        # Call the Drive API to delete all files in Trash
        drive_service.files().emptyTrash().execute()
        print("Trash emptied successfully.")
    except Exception as e:
        print(f"An error occurred: {e}")

empty_trash(drive_service)

An error occurred: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google_auth_httplib2._Response object at 0x7a4012bae620>)


Clearing trash from drive is above

In [None]:
import os
import uuid
import random
from tqdm.auto import tqdm
import copy
import shutil
import zipfile

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
base_path = "/content/drive/MyDrive/trustworthyml"
data_path = os.path.join(base_path, "data")
zips_path = os.path.join(base_path, "zips")

local_path = "/content/dataset"
local_zips_path = "/content/zips"

In [67]:
total, used, free = shutil.disk_usage("/content/sample_data")
print(f"Total: {total // (2**30)} GiB")
print(f"Used: {used // (2**30)} GiB")
print(f"Free: {free // (2**30)} GiB")

Total: 225 GiB
Used: 33 GiB
Free: 192 GiB


In [68]:
def zip_folder(folder_path, output_zip_path):
    """Zips the contents of a folder into a zip file."""
    print(f"Zipping folder {folder_path} to {output_zip_path}")
    with zipfile.ZipFile(output_zip_path, "w") as zip_file:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zip_file.write(file_path)
    print(f"Created zip in {output_zip_path}")

def process_batch(batch_files, local_path, local_zips_path, batch_num, zips_path):
    """Processes a single batch of files: copies, zips, and cleans up."""
    # Copy files to local
    for file in batch_files:
        shutil.copy(file, local_path)
    print(f"Copied {len(batch_files)} files to local")

    # Zip the batch
    local_zip_path = os.path.join(local_zips_path, f"batch_{batch_num + 1}.zip")
    zip_folder(local_path, local_zip_path)
    print(f"Zipped batch {batch_num + 1} to {local_zip_path}")

    # Copy zip to Google Drive
    shutil.copy(local_zip_path, zips_path)
    print(f"Copied zip from {local_zip_path} to {zips_path}")

    # Clean up: delete local zip and copied files
    os.remove(local_zip_path)
    for file in batch_files:
        os.remove(os.path.join(local_path, os.path.basename(file)))
    print(f"Cleaned up batch {batch_num + 1}")

def copy_and_zip_batches(data_path, local_path, local_zips_path, zips_path, batch_size, limit=False):
    """Copies files in batches, zips them, and saves to Google Drive."""
    os.makedirs(local_path, exist_ok=True)
    os.makedirs(local_zips_path, exist_ok=True)
    os.makedirs(zips_path, exist_ok=True)

    if limit:
        print(f"Limiting to first 100 files only")
        all_files = [os.path.join(data_path, f) for f in os.listdir(data_path)][:100]
    else:
        print(f"No limit")
        all_files = [os.path.join(data_path, f) for f in os.listdir(data_path)]

    print(f"Finished listing directory, total is {len(all_files)}")
    total_batches = len(all_files) // batch_size + (len(all_files) % batch_size > 0)

    print(f"Total batches to process: {total_batches}")

    for batch_num in tqdm(range(total_batches), desc="Processing Batches"):
        print("-" * 50)
        print(f"Processing batch {batch_num + 1}/{total_batches}")

        # Determine the start and end index for the current batch
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, len(all_files))
        batch_files = all_files[start_index:end_index]

        # Process the current batch
        process_batch(batch_files, local_path, local_zips_path, batch_num, zips_path)

    print(f"Finished processing all batches!")

In [None]:
batch = 200
copy_and_zip_batches(data_path, local_path, local_zips_path, zips_path, batch, False)
print(f"Finished copying to drive")

No limit
Finished listing directory, total is 32880
Total batches to process: 165


Processing Batches:   0%|          | 0/165 [00:00<?, ?it/s]

--------------------------------------------------
Processing batch 1/165
Copied 200 files to local
Zipping folder /content/dataset to /content/zips/batch_1.zip
Created zip in /content/zips/batch_1.zip
Zipped batch 1 to /content/zips/batch_1.zip
Copied zip from /content/zips/batch_1.zip to /content/drive/MyDrive/trustworthyml/zips
Cleaned up batch 1
--------------------------------------------------
Processing batch 2/165
Copied 200 files to local
Zipping folder /content/dataset to /content/zips/batch_2.zip
Created zip in /content/zips/batch_2.zip
Zipped batch 2 to /content/zips/batch_2.zip
Copied zip from /content/zips/batch_2.zip to /content/drive/MyDrive/trustworthyml/zips
Cleaned up batch 2
--------------------------------------------------
Processing batch 3/165
Copied 200 files to local
Zipping folder /content/dataset to /content/zips/batch_3.zip
Created zip in /content/zips/batch_3.zip
Zipped batch 3 to /content/zips/batch_3.zip
Copied zip from /content/zips/batch_3.zip to /cont

Code to get all zips into local instance and unzip it

In [None]:
def unzip_files(zip_folder_path, extract_to_path):
    """Unzips all zip files in the specified folder to the given extraction path."""
    zip_files = [os.path.join(zip_folder_path, zip_file) for zip_file in os.listdir(zip_folder_path) if zip_file.endswith('.zip')]
    print(f"Found {len(zip_files)} zip files to unzip.")

    for zip_file_path in tqdm(zip_files, desc="Unzipping Files"):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Extract each file individually to avoid creating nested folders
            for file_info in zip_ref.infolist():
                # Only extract files, not directories
                if not file_info.is_dir():
                    extracted_file_path = os.path.join(extract_to_path, os.path.basename(file_info.filename))
                    zip_ref.extract(file_info, extract_to_path)
                    # Move the extracted file to the target location without the directory structure
                    shutil.move(os.path.join(extract_to_path, file_info.filename), extracted_file_path)
                    # Remove the original extracted file (with directory structure)
        print(f"Unzipped {zip_file_path} to {extract_to_path}")
    toremove = os.path.join(extract_to_path, "content")
    if os.path.exists(toremove):
      shutil.rmtree(toremove)
      # os.rmdir(toremove)
      print(f"Deleted empty directory {toremove}")
    else:
      print(f"No empty directory found at {toremove}")

def copy_and_unzip_from_drive(drive_zips_path, local_zips_path, extract_to_path):
    """Copies zip files from Google Drive to local and unzips them."""
    print(f"Copying files from {drive_zips_path} to {local_zips_path} and then unzipping to {extract_to_path}")

    # Create local zip directory if it doesn't exist
    os.makedirs(local_zips_path, exist_ok=True)
    os.makedirs(extract_to_path, exist_ok=True)

    # List all zip files in the Google Drive zips directory
    zip_files = [f for f in os.listdir(drive_zips_path) if f.endswith('.zip')]
    print(f"Found {len(zip_files)} zip files in Google Drive.")

    for zip_file in tqdm(zip_files, desc="Copying and Unzipping"):
        zip_file_path = os.path.join(drive_zips_path, zip_file)
        local_zip_path = os.path.join(local_zips_path, zip_file)

        # Copy zip file to local
        shutil.copy(zip_file_path, local_zip_path)
        print(f"Copied {zip_file} to local.")

        # Unzip the file
        unzip_files(local_zips_path, extract_to_path)



In [None]:
new_local_path = "/content/new_dataset"
new_local_zips_path = "/content/new_zips"
copy_and_unzip_from_drive(zips_path, new_local_zips_path, new_local_path)

Copying files from /content/drive/MyDrive/trustworthyml/zips to /content/new_zips and then unzipping to /content/new_dataset
Found 3 zip files in Google Drive.


Copying and Unzipping:   0%|          | 0/3 [00:00<?, ?it/s]

Copied batch_1.zip to local.
Found 1 zip files to unzip.


Unzipping Files:   0%|          | 0/1 [00:00<?, ?it/s]

Unzipped /content/new_zips/batch_1.zip to /content/new_dataset
Deleted empty directory /content/new_dataset/content
Copied batch_2.zip to local.
Found 2 zip files to unzip.


Unzipping Files:   0%|          | 0/2 [00:00<?, ?it/s]

Unzipped /content/new_zips/batch_1.zip to /content/new_dataset
Unzipped /content/new_zips/batch_2.zip to /content/new_dataset
Deleted empty directory /content/new_dataset/content
Copied batch_3.zip to local.
Found 3 zip files to unzip.


Unzipping Files:   0%|          | 0/3 [00:00<?, ?it/s]

Unzipped /content/new_zips/batch_1.zip to /content/new_dataset
Unzipped /content/new_zips/batch_3.zip to /content/new_dataset
Unzipped /content/new_zips/batch_2.zip to /content/new_dataset
Deleted empty directory /content/new_dataset/content


In [None]:
!rm -r /content/new_dataset
!rm -r /content/new_zips

In [None]:
len(os.listdir(new_local_path))

30