In [1]:
from google.colab import auth
from googleapiclient.discovery import build
import os
import subprocess
import time
import datetime
import json
import io
from googleapiclient.http import MediaIoBaseDownload

In [2]:
auth.authenticate_user()

In [3]:
service = build('drive', 'v3')

In [4]:
SPLIT_FILES_SOURCE = {'BETA': '187KTNwJ2LZXf5d8WAFO1qnP6g8Yqderw',
                      'BITECA': '',
                      'BITAGAP': ''}

SPLIT_FILES_PROCESSED= {'BETA': '1lc-XTCm9lSAfO6xIxxQ5K_4LzHhj3nPl',
                           'BITECA': '',
                           'BITAGAP': ''}
WB_CONFIGS = {
    'pb.cloud': {
        'MEDIAWIKI_API_URL': 'https://pbsandbox.wikibase.cloud/qs/api.php',
        'WB_USER': 'pb.cloud.user',
        'WB_PASSWORD': 'pb.cloud.password',
        'WB_TOKEN': 'pb.cloud.token'
    },
    'pb.cog': {
        'MEDIAWIKI_API_URL': "https://philobiblon.cog.berkeley.edu/qs/api.php",
        'WB_USER': 'pb.cog.user',
        'WB_PASSWORD': 'pb.cog.password',
        'WB_TOKEN': 'pb.cog.token',
    },
        'factgrid': {
        'MEDIAWIKI_API_URL': "https://database.factgrid.de/qs/api.php",
        'WB_USER': 'factgrid.user',
        'WB_PASSWORD': 'factgrid.password',
        'WB_TOKEN': 'factgrid.token',
    }
}

# Manually update bibliography, table and instance that is to be updated
bibliography = 'BETA' # BETA BITECA BITAGAP
table = 'subject' # 'geography' 'analytic' 'library' 'ms_ed' 'biographies' 'copies' 'institutions' 'subject' 'uniform_title'
instance = "pb.cog" # pb.cloud pb.cog factgrid
batch_id = ''
start_time = time.time()
elapsed_time = 0
max_seconds = 36000
complete_status = False

In [5]:
from google.colab import userdata
URL = WB_CONFIGS[instance]['MEDIAWIKI_API_URL']
WB_USER = userdata.get(WB_CONFIGS[instance]['WB_USER'])
WB_PASSWORD = userdata.get(WB_CONFIGS[instance]['WB_PASSWORD'])
WB_TOKEN = userdata.get(WB_CONFIGS[instance]['WB_TOKEN'])

In [6]:
def time_check():
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    return elapsed_time

In [7]:
def get_batch_status(batch_id):
    batch_status_command = f"curl {URL} -d action=get_batch_info -d batch={batch_id}"
    batch_status = subprocess.run(batch_status_command, capture_output=True, text=True, shell=True)
    try:
        data = json.loads(batch_status.stdout)
        batch_status = data["data"][str(batch_id)]["batch"]["status"]
    except json.JSONDecodeError:
        print("Error parsing JSON output:", batch_status.stdout)
    try:
        error_count = data["data"][str(batch_id)]["commands"]['ERROR']
    except:
        error_count = 0
    return batch_status, error_count

In [8]:
def move_file(source_id, destination_id, file_name, file_id):
    file = service.files().get(fileId=file_id, fields='parents').execute()
    previous_parents = ",".join(file.get('parents'))
    file = service.files().update(fileId=file_id,
                                  addParents=destination_id,
                                  removeParents=previous_parents,
                                  fields='id, parents').execute()

    print(f"File '{file_name}' moved to folder with ID '{destination_id}'.")

In [9]:
path_id = SPLIT_FILES_SOURCE[str(bibliography)]
destination_id = SPLIT_FILES_PROCESSED[str(bibliography)]

# Check for files in source directory
results = service.files().list(q=f"'{path_id}' in parents and trashed=false", pageSize=1000, fields="nextPageToken, files(id, name)").execute()
items = results.get('files', [])
if not items:
    print('No items found.')
else:
    print('Items:')
    for item in items:
        print(u'{0} ({1})'.format(item['name'], item['id']))

item_dict = [item for item in items if isinstance(item, dict)][0]
table_items = [item for item in items if item['name'].endswith('.qs')]
print(table_items)

# Sort table items by split number
table_items.sort(key=lambda x: int(x['name'].split('_')[4].split('.')[0]) if len(x['name'].split('_')) >= 4 else 0, reverse=False)
names = [item['name'] for item in table_items]
print(f'List of all files to be processed: {names}')

# Loop through sorted items and process
elapsed_time = time_check() # Get elapsed time so far to start
while elapsed_time < max_seconds and not complete_status:
    for table_item in table_items:
        date = datetime.datetime.now().strftime("%Y%m%d_%H%M")
        batchname = f'{bibliography}_{table}_{date}'
        print(u'{0} ({1})'.format(table_item['name'], table_item['id']))
        file_id = table_item['id']
        file_name = table_item['name']
        # Download the file
        print(f'Downloading file: {file_name}')
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))

        # Save the file to your Colab environment
        with open(file_name, 'wb') as f:
            f.write(fh.getbuffer())
            print(f"File '{file_name}' downloaded successfully.")

        # Run curl command submitting batch
        curl_command = f'curl {URL} -d action=import -d submit=1 -d format=v1 -d username={WB_USER} -d batchname={batchname} --data-raw token=\'{WB_TOKEN}\' --data-urlencode data@{file_name}'
        post_qs = subprocess.run(curl_command, capture_output=True, text=True, shell=True)
        time.sleep(10) # Wait for batch to initiate

        # move qs file to completed folder after submitted batch
        move_file(path_id, destination_id, file_name, file_id)

        if post_qs.returncode != 0:
            print("Error executing curl command:", post_qs.stderr)
            exit(1)
        try:
            data = json.loads(post_qs.stdout)
            batch_id = data["batch_id"]
        except json.JSONDecodeError:
            print(f"Error parsing JSON output for {batch_id}:", post_qs.stdout)
            continue
        print(f"Batch ID: {batch_id}")
        status = get_batch_status(batch_id)
        while status != "DONE":
            print(f'Batch import {batch_id} still running, sleeping for 10 minutes')
            time.sleep(600)
            print("Checking batch status")
            status, error_count = get_batch_status(batch_id)
        print(f'Batch import {batch_id} complete with status {status}')
        print(f'Batch {batch_id} had {error_count} errors')

        # update elapsed time
        elapsed_time = time_check()

    complete_status = True

print(f'All files processed.  Completed in: {elapsed_time} seconds')

Items:
split_beta_subject_qs_0.qs (193RiZ5jlBNhCrwiB-t_fZwyJbh1QYDiH)
Split Files Done (1lc-XTCm9lSAfO6xIxxQ5K_4LzHhj3nPl)
[{'id': '193RiZ5jlBNhCrwiB-t_fZwyJbh1QYDiH', 'name': 'split_beta_subject_qs_0.qs'}]
List of all files to be processed: ['split_beta_subject_qs_0.qs']
Elapsed time: 1.9307582378387451 seconds
split_beta_subject_qs_0.qs (193RiZ5jlBNhCrwiB-t_fZwyJbh1QYDiH)
Downloading file: split_beta_subject_qs_0.qs
Download 100%.
File 'split_beta_subject_qs_0.qs' downloaded successfully.
Batch ID: 57
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running, sleeping for 10 minutes
Checking batch status
Batch import 57 still running