Item bucket to Bronze

In [3]:
from google.cloud import storage, bigquery
import pandas as pd
import json
import os
import uuid

# Source: objects under `prefix` in the Cloud Storage bucket `bucket_name`.
project_id = "big-data-lol"
bucket_name = "batch-images"
prefix = 'item/14.12/'

# Destination: BigQuery table `dataset_id.table_id`.
dataset_id = 'Bronze_LOL_Dataset'
table_id = 'bronze_batch_img_item'

# Both clients are bound to the same project.
storage_client = storage.Client(project=project_id)
bigquery_client = bigquery.Client(project=project_id)

# Objects in the bucket whose names start with `prefix`.
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=prefix)

# Download every object listed in `blobs` into /tmp so it can be parsed
# from the local filesystem. `local_files` collects the local paths in
# listing order.
local_files = []

for blob in blobs:
    file_name = os.path.basename(blob.name)
    # "Folder" placeholder objects (names ending in "/") have an empty
    # basename; downloading one would target the /tmp directory itself
    # and fail, so they are skipped.
    if not file_name:
        continue
    local_path = '/tmp/' + file_name
    blob.download_to_filename(local_path)
    local_files.append(local_path)

print(f"Downloaded files: {local_files}")

# Flatten each downloaded JSON document into one row per top-level key.
rows = []

for file_path in local_files:
    source_name = os.path.basename(file_path)
    with open(file_path, 'r', encoding='utf-8') as handle:
        data_json = json.load(handle)

    rows.extend(
        {
            'UID': str(uuid.uuid4()),  # fresh random identifier per row
            'Key': entry_key,
            'Value': json.dumps(entry_value),  # value re-serialized as a JSON string
            'FileName': source_name,
        }
        for entry_key, entry_value in data_json.items()
    )

# Refuse to run the load when nothing was parsed: the job below uses
# WRITE_TRUNCATE, and a DataFrame built from an empty list has none of the
# REQUIRED columns declared in the schema.
if not rows:
    raise ValueError("No rows were parsed from the downloaded files; nothing to load.")

# Build a pandas DataFrame from the collected rows.
df = pd.DataFrame(rows)

# Declare the schema explicitly: four required STRING columns.
schema = [
    bigquery.SchemaField('UID', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('Key', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('Value', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('FileName', 'STRING', mode='REQUIRED'),
]

# Load job configuration: replace the table contents on every run.
job_config = bigquery.LoadJobConfig(
    write_disposition='WRITE_TRUNCATE',
    schema=schema,
)

# Load the DataFrame into BigQuery. `Client.dataset()` is deprecated, so the
# table reference is built from a DatasetReference in the client's project.
dataset_ref = bigquery.DatasetReference(bigquery_client.project, dataset_id)
table_ref = dataset_ref.table(table_id)
job = bigquery_client.load_table_from_dataframe(df, table_ref, job_config=job_config)

# Block until the load job finishes (raises if the job failed).
job.result()

print(f"DataFrame cargado en BigQuery: {dataset_id}.{table_id}")


Downloaded files: ['/tmp/1001.json', '/tmp/1004.json', '/tmp/1006.json', '/tmp/1011.json', '/tmp/1018.json', '/tmp/1026.json', '/tmp/1027.json', '/tmp/1028.json', '/tmp/1029.json', '/tmp/1031.json', '/tmp/1033.json', '/tmp/1035.json', '/tmp/1036.json', '/tmp/1037.json', '/tmp/1038.json', '/tmp/1039.json', '/tmp/1040.json', '/tmp/1042.json', '/tmp/1043.json', '/tmp/1052.json', '/tmp/1053.json', '/tmp/1054.json', '/tmp/1055.json', '/tmp/1056.json', '/tmp/1057.json', '/tmp/1058.json', '/tmp/1082.json', '/tmp/1083.json', '/tmp/1101.json', '/tmp/1102.json', '/tmp/1103.json', '/tmp/1104.json', '/tmp/126697.json', '/tmp/127008.json', '/tmp/1500.json', '/tmp/1501.json', '/tmp/1502.json', '/tmp/1503.json', '/tmp/1504.json', '/tmp/1506.json', '/tmp/1507.json', '/tmp/1508.json', '/tmp/1509.json', '/tmp/1510.json', '/tmp/1511.json', '/tmp/1512.json', '/tmp/1515.json', '/tmp/1516.json', '/tmp/1517.json', '/tmp/1518.json', '/tmp/1519.json', '/tmp/1520.json', '/tmp/1521.json', '/tmp/1522.json', '/tmp