In [1]:
import os
from datetime import datetime
from io import BytesIO

import pandas as pd
import requests
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
from heic2png import HEIC2PNG
from PIL import Image
from tqdm import tqdm
    
# Build an authenticated Google Drive v3 client from a service-account key file.
# The requested scope is read-only.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
SERVICE_ACCOUNT_FILE = 'babert-274310-c1924be40791.json'

credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
drive_service = build(
    'drive',
    'v3',
    credentials=credentials,
)

In [2]:
def get_image_data(
    image_url, text_cap_en=None, text_cap_native=None, text_culture_loc=None,
    text_image_loc=None, text_submitter_native_lang=None
):
    """Build one flat record describing a migrated image.

    Args:
        image_url: Value stored under the ``image`` key.
        text_cap_en: Value stored under ``text_cap_en``.
        text_cap_native: Value stored under ``text_cap_native``.
        text_culture_loc: Value stored under ``text_culture_loc``.
        text_image_loc: Value stored under ``text_image_loc``.
        text_submitter_native_lang: Value stored under
            ``text_submitter_native_lang``.

    Returns:
        dict: A 14-key record. ``created_at`` and ``timestamp`` both hold
        today's date formatted as ``dd/mm/YYYY 00:00:00``; ``email`` is a
        fixed address; the remaining bookkeeping keys are ``None``.
    """
    # Only the date is taken from the clock; the time part is the literal 00:00:00.
    stamp = datetime.now().strftime("%d/%m/%Y 00:00:00")

    # Key order is kept stable because it becomes the column order downstream.
    fields = [
        ('annotation_id', None),
        ('annotator', None),
        ('email', 'seacrowd.research@gmail.com'),
        ('created_at', stamp),
        ('timestamp', stamp),
        ('id', None),
        ('image', image_url),
        ('lead_time', None),
        ('text_cap_en', text_cap_en),
        ('text_cap_native', text_cap_native),
        ('text_culture_loc', text_culture_loc),
        ('text_image_loc', text_image_loc),
        ('text_submitter_native_lang', text_submitter_native_lang),
        ('updated_at', None),
    ]
    return dict(fields)

def download_file_from_google_drive(drive_service, file_id, destination):
    """Download the content of a Drive file and write it to ``destination``.

    Args:
        drive_service: Client object exposing ``files().get_media(fileId=...)``
            (presumably the Drive v3 client built in this notebook).
        file_id: Identifier of the file to fetch.
        destination: Local path the bytes are written to (opened in ``'wb'`` mode,
            so an existing file is overwritten).

    Returns:
        None. The only effect is the file written at ``destination``.
    """
    request = drive_service.files().get_media(fileId=file_id)
    buffer = BytesIO()
    downloader = MediaIoBaseDownload(fd=buffer, request=request)

    # next_chunk() yields (status, done); the progress status is not needed here.
    done = False
    while not done:
        _, done = downloader.next_chunk()

    # The file is only opened after the download loop finished, so an error
    # raised while fetching chunks does not leave a truncated file behind.
    with open(destination, 'wb') as f:
        f.write(buffer.getvalue())

def get_file_name_from_id(drive_service, file_id):
    """Look up a Drive file's name and return it with all quote characters removed.

    Args:
        drive_service: Client object exposing ``files().get(fileId=..., fields=...)``.
        file_id: Identifier of the file whose name is requested.

    Returns:
        str: The ``name`` field of the metadata response, with every ``"`` and
        ``'`` character stripped out.
    """
    metadata = drive_service.files().get(fileId=file_id, fields='name').execute()
    raw_name = metadata['name']
    # Drop both double and single quotes in a single pass.
    quote_remover = str.maketrans('', '', '"\'')
    return raw_name.translate(quote_remover)

In [3]:
BASE_PATH = "./sea-vl-image-collection/data"
GH_BASE_PATH = "https://raw.githubusercontent.com/SEACrowd/sea-vl-image-collection/refs/heads/main/data"

# Largest width / height (in pixels) a migrated image may keep; bigger images
# are scaled down with their aspect ratio preserved.
MAX_IMAGE_WIDTH = 2200
MAX_IMAGE_HEIGHT = 2400

df = pd.read_csv('migrate_24112024.csv')
migration_data = []

# Only rows whose migrate_date is empty are processed.
pending_rows = df.loc[df['migrate_date'].isna(), :]

for idx, row in tqdm(pending_rows.iterrows(), total=len(pending_rows)):
    image_path = row['Image Upload']
    text_cap_en = row['In English, what is this image about?']
    text_cap_native = row['In your native language, what is this image about?']
    text_culture_loc = row['(Choose at least 1) This image portrays culturally-relevant information in...']
    text_image_loc = row['Where was this image taken? (City, Country)']
    text_submitter_native_lang = row["What's your native language?"]

    # Download Image: the Drive file id is whatever follows '?id=' in the upload link.
    file_id = image_path.split('?id=')[-1]
    file_name = get_file_name_from_id(drive_service, file_id)
    # Prefixing with the row index keeps files with identical names apart.
    destination = f'{BASE_PATH}/{idx}_{file_name}'
    download_file_from_google_drive(drive_service, file_id, destination)

    if destination.lower().endswith('heic'):
        heic_img = HEIC2PNG(destination, quality=90)  # Specify the quality of the converted image
        # Note: the whole path is lower-cased here, not just the extension.
        destination = destination.lower().replace('.heic', '.png')

        # Save as PNG, replacing any stale file left at the target path
        if os.path.exists(destination):
            os.remove(destination)
        heic_img.save(destination)

        # Update file_name the same way so the URL built below matches the file on disk
        file_name = file_name.lower().replace('.heic', '.png')

    # Resize Image if too big. The context manager closes the file handle
    # before the result is written back to the same path.
    with Image.open(destination) as source_image:
        (w, h) = source_image.size
        if w > MAX_IMAGE_WIDTH:
            h = max(1, int(h * MAX_IMAGE_WIDTH / w))
            w = MAX_IMAGE_WIDTH
        if h > MAX_IMAGE_HEIGHT:
            # Cap the height by shrinking the width proportionally. (The
            # previous code repeated the width formula here, which enlarged
            # tall, narrow images instead of shrinking them.)
            w = max(1, int(w * MAX_IMAGE_HEIGHT / h))
            h = MAX_IMAGE_HEIGHT
        image = source_image.resize((w, h), Image.BILINEAR)
    # Every image is re-saved, including those whose size did not change.
    image.save(destination)

    # Nothing is uploaded here: only the URL the file has under GH_BASE_PATH is recorded.
    img_url = f'{GH_BASE_PATH}/{idx}_{file_name}'
    migration_data.append(
        get_image_data(img_url, text_cap_en, text_cap_native, text_culture_loc, text_image_loc, text_submitter_native_lang)
    )

# One CSV per run, named after today's date.
migrate_df = pd.DataFrame(migration_data)
migrate_df.to_csv(f'ready_to_upload_{datetime.now().strftime("%Y%m%d")}.csv', index=False)

1004it [05:24,  3.09it/s]


In [4]:
# Display (rows, columns) of the records produced by the migration cell above.
migrate_df.shape

(1004, 14)