In [1]:
# !apt-get install git

from google.colab import auth
auth.authenticate_user()
print('Authenticated')

import os
!git clone https://github.com/PhiloBiblon/philobiblon-to-wikibase.git
os.chdir('philobiblon-to-wikibase/pb2wb')

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io
from postprocess.postprocessor.generic import GenericPostprocessor


Authenticated
Cloning into 'philobiblon-to-wikibase'...
remote: Enumerating objects: 1214, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 1214 (delta 62), reused 64 (delta 35), pack-reused 1089 (from 1)[K
Receiving objects: 100% (1214/1214), 2.90 MiB | 28.24 MiB/s, done.
Resolving deltas: 100% (708/708), done.


In [2]:
service = build('drive', 'v3')

In [3]:
# Set variables for folder id's, bib, tables and force statements

OR_FILES_SOURCE = {'beta': '115pNT9ue480HAr996XbGRwWCLHFJK_V_',
                   'biteca': '',
                   'bitagap': ''}

POST_FILES_DESTINATION = {'beta': '1efJDT_HJoIsrRBw1bySuIke6n3xaSTyt',
                          'biteca': '',
                          'bitagap': ''}

# Set bibliography and tables to be post processed
bibliography = ['beta'] #['beta', 'bitagap', 'biteca']
tablenames = ['subject', 'biography'] #['uniform_title','analytic', 'biography', 'library', 'copies', 'ms_ed', 'institutions', 'geography', 'bibliography', 'subject']
force_new_statements = True


In [4]:
def find_file_id(folder_id, table):
    query = f"'{folder_id}' in parents and not mimeType='application/vnd.google-apps.folder'"
    results = service.files().list(q=query,fields="nextPageToken, files(id, name)").execute()
    file_name = [item['name'] for item in results['files'] if item['name'].endswith('.qs') and table in item['name']]
    file_id = [item['id'] for item in results['files'] if item['name'].endswith('.qs') and table in item['name']]
    if len(file_id) > 0:
        return file_name[0], file_id[0]

In [5]:
def create_folders(bib):
  directory_path = f"data/post/{bib}"
  if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created successfully.")
  else:
    print(f"Directory '{directory_path}' already exists.")
  return directory_path

In [6]:
def download_file(file_name, file_id):
    # Download the file
    if len(file_id) > 0: # Check if file_id list contains any elements
        print(f'Downloading file: {file_name} with id: {file_id}')
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))
    else:
        print(f'No file found for {file_name}') # Handle the case of an empty file_id list

    # Get the current working directory
    cwd = os.getcwd()

    # Construct the full path to the downloaded file
    file_path = os.path.join(cwd, file_name)

    # Save the downloaded file to the current working directory
    with open(file_path, 'wb') as f:
        fh.seek(0)
        f.write(fh.read())

    print(f'File downloaded to: {file_path}')

In [7]:
def move_file(destination_id, file_name, file_id):
    results = service.files().list(q=f"name='{file_name}'", fields="nextPageToken, files(id, name)").execute()

    # Find the file ID
    for file in results.get('files', []):
      print(file)
      if file['name'] == f'{file_name}':
        file_id = file['id']
        break

    # Get the metadata of the file to be copied
    file_metadata = service.files().get(fileId=file_id).execute()

    # Create a new file object with the desired destination folder
    new_file_metadata = {
        'name': file_metadata['name'],
        'parents': [destination_id]
    }

    # Copy the file
    copied_file = service.files().copy(fileId=file_id, body=new_file_metadata).execute()

    print(f'File {file_name} copied successfully! New file ID:', copied_file['id'])

In [8]:
for bib in bibliography:
  print(f'Processing for {bib} bibliography')
  directory_path = create_folders(bib)
  for item in tablenames:
    print(f'Starting processing {item} table')
    folder_id = OR_FILES_SOURCE[bib]
    post_folder_id = POST_FILES_DESTINATION[bib]

    try:
      file_name, file_id = find_file_id(folder_id, item)
    except:
      print(f'No file found for {item}')
      continue

    download_file(file_name, file_id)
    GenericPostprocessor().postprocess(file_name, directory_path, force_new_statements)
    print(f'postprocess complete for {item}')
    print('................................')

    # Move processed file to Drive
    move_file(post_folder_id, directory_path + '/' + file_name, file_id)

  print('Post processing complete')



Processing for beta bibliography
Directory 'data/post/beta' created successfully.
Starting processing subject table
Downloading file: beta_subject.qs with id: 1YLRQFJPVJCZ3WnaBGDeKc1cmJXoJXv0m
Download 100%.
File downloaded to: /content/philobiblon-to-wikibase/pb2wb/beta_subject.qs
postprocess complete for subject
................................
File data/post/beta/beta_subject.qs copied successfully! New file ID: 1qcgLyXhbAXLQDAqAqEm8jXgFQBFgnD2d
Starting processing biography table
Downloading file: beta_biography.qs with id: 1fvwZJFMzwdn1_vvRwxUcloV_ayVgvGI-
Download 100%.
File downloaded to: /content/philobiblon-to-wikibase/pb2wb/beta_biography.qs
postprocess complete for biography
................................
File data/post/beta/beta_biography.qs copied successfully! New file ID: 1otxrZqMBEJdKzztAKWLwIeXF0-FRWJ9y
Post processing complete
