In [None]:
from datetime import datetime
import io
import os

from dotenv import load_dotenv
import google.auth
from google.cloud import storage
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


load_dotenv()

In [None]:
# Required settings are read from the process environment; a missing variable
# raises KeyError right here rather than failing later during the backup.
BUCKET_NAME = os.environ['BUCKET_NAME']
SOURCE_FOLDER_ID = os.environ['FOLDER_ID']

# Echo both values (one per line) so the active configuration is visible
print(BUCKET_NAME, SOURCE_FOLDER_ID, sep='\n')

# Helper Functions

In [None]:
def get_drive_service():
    """Build a Google Drive v3 API client restricted to read-only access.

    Credentials are resolved by ``google.auth.default`` (Application Default
    Credentials), e.g. the attached service account when running on Cloud Run.

    Returns:
        The client object produced by ``build('drive', 'v3', ...)``.
    """
    readonly_scopes = ['https://www.googleapis.com/auth/drive.readonly']

    # default() returns a (credentials, project_id) pair; only the first is needed
    credentials, _project_id = google.auth.default(scopes=readonly_scopes)
    return build('drive', 'v3', credentials=credentials)

In [None]:
# Drive API client (built with the read-only scope in get_drive_service)
drive_service = get_drive_service()
# Cloud Storage client and a handle to the destination bucket named by BUCKET_NAME
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

In [None]:
# Show the client objects as the cell output to confirm they were created
drive_service, storage_client, bucket

In [None]:
# Build the name of a timestamped backup folder (local time, minute precision).
# Nothing is created in the bucket here; this is only the name.
timestamp = f'{datetime.now():%Y-%m-%d_%H-%M}'
backup_folder = 'backup_' + timestamp

In [None]:
# Exploratory listing of the non-trashed direct children of the source folder.
# No pageToken is passed and nextPageToken is not requested, so only a single
# page of results is fetched here.
query = f"'{SOURCE_FOLDER_ID}' in parents and trashed = false"
results = (
    drive_service.files().list(q=query, fields='files(id, name, mimeType)').execute()
)
# Fall back to an empty list when the response carries no 'files' key
files = results.get('files', [])

In [None]:
# files

In [None]:
def backup_recursive(drive_service, bucket, folder_id, current_path):
    """Walk one Drive folder page by page and hand every child to process_item.

    Args:
        drive_service: Drive v3 client used to list the folder's children.
        bucket: Destination bucket object, passed through to process_item.
        folder_id: Drive ID of the folder whose direct children are listed.
        current_path: Path prefix inside the bucket that corresponds to this
            folder; passed through to process_item for each child.
    """
    print(f'Scan folder: {current_path} ...')

    # The filter is identical for every page, so build it once up front:
    # all non-trashed entries whose parent is folder_id.
    children_filter = f"'{folder_id}' in parents and trashed = false"

    next_token = None
    more_pages = True
    while more_pages:
        response = drive_service.files().list(
            q=children_filter,
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=next_token,
        ).execute()

        for child in response.get('files', []):
            process_item(drive_service, bucket, child, current_path)

        # A missing or empty nextPageToken ends the pagination loop
        next_token = response.get('nextPageToken')
        more_pages = bool(next_token)


def upload_file(drive_service, bucket, file_id, mime_type, blob_path):
    """Copy one Drive file into the GCS bucket, skipping it if already present.

    Google-native files (Docs, Sheets, ...; MIME type containing
    ``application/vnd.google-apps``) are exported as PDF and stored under
    ``blob_path + '.pdf'``. Every other file is downloaded unchanged and
    stored under ``blob_path``.

    Args:
        drive_service: Drive v3 client used to download or export the file.
        bucket: Destination bucket object providing ``blob(name)``.
        file_id: Drive ID of the file to copy.
        mime_type: Drive MIME type of the file.
        blob_path: Target object name in the bucket (without the ``.pdf``
            suffix that is appended for Google-native files).
    """
    is_google_file = 'application/vnd.google-apps' in mime_type

    # Resolve the final object name first, so that the "already exists" check
    # and the log messages refer to the object that is actually written.
    # (Checking the un-suffixed name would never match an exported PDF.)
    target_path = f'{blob_path}.pdf' if is_google_file else blob_path
    blob = bucket.blob(target_path)

    # Skip if the target object already exists
    if blob.exists():
        print(f'Skip file (already exists): {target_path}')
        return

    if is_google_file:
        # Google-native files have no binary content; export them as PDF
        print(f'Convert to PDF: {target_path}')
        request = drive_service.files().export_media(
            fileId=file_id, mimeType='application/pdf'
        )
        content_type = 'application/pdf'
    else:
        # Regular files are downloaded as-is
        print(f'Download: {target_path}')
        request = drive_service.files().get_media(fileId=file_id)
        content_type = mime_type or None

    # The file is downloaded chunk by chunk into an in-memory buffer and only
    # then uploaded, so the whole file is held in memory at once.
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)

    done = False
    while not done:
        _, done = downloader.next_chunk()

    # Rewind so the upload reads the buffer from the start
    fh.seek(0)
    blob.upload_from_file(fh, content_type=content_type)
    print(f'Saved: {target_path}')


def process_item(drive_service, bucket, item, current_path):
    """Route one Drive listing entry: recurse into folders, upload anything else.

    Args:
        drive_service: Drive v3 client, passed through unchanged.
        bucket: Destination bucket object, passed through unchanged.
        item: Mapping with the keys 'id', 'name' and 'mimeType'.
        current_path: Path prefix inside the bucket under which the entry is placed.
    """
    drive_id, display_name, kind = item['id'], item['name'], item['mimeType']

    # A '/' in the Drive name would add an extra path level to the object
    # name, so it is replaced before the name is appended to the prefix.
    target_path = f"{current_path}/{display_name.replace('/', '_')}"

    # Anything that is not a folder is uploaded as a file
    if kind != 'application/vnd.google-apps.folder':
        upload_file(drive_service, bucket, drive_id, kind, target_path)
        return

    # Folders: descend, using the new path as the prefix for their children
    backup_recursive(drive_service, bucket, drive_id, target_path)

In [None]:
# Each run writes under its own root folder, named after the local start time
# (minute precision).
timestamp = f'{datetime.now():%Y-%m-%d_%H-%M}'
root_backup_folder = 'backup_' + timestamp

# Start the backup at the configured source folder
backup_recursive(drive_service, bucket, SOURCE_FOLDER_ID, root_backup_folder)