# BigQuery Cleanup

Reverts the rows added by `add_data_to_bq.ipynb` by deleting newly inserted products and removing shipment taxonomy records for the updated shipment IDs.

In [3]:
!uv pip install google-cloud-bigquery

[2K[2mResolved [1m24 packages[0m [2min 186ms[0m[0m                                        [0m
[2K[2mInstalled [1m4 packages[0m [2min 6ms[0m[0my==3.38.0                        [0m
 [32m+[39m [1mgoogle-cloud-bigquery[0m[2m==3.38.0[0m
 [32m+[39m [1mgoogle-cloud-core[0m[2m==2.5.0[0m
 [32m+[39m [1mgoogle-crc32c[0m[2m==1.7.1[0m
 [32m+[39m [1mgoogle-resumable-media[0m[2m==2.8.0[0m


In [1]:
import os
os.environ['GOOGLE_CLOUD_PROJECT'] = 'dev-tradyon-data'
print('Using project:', os.environ['GOOGLE_CLOUD_PROJECT'])


Using project: dev-tradyon-data


## Remove newly added products
Deletes rows in `tradyon.product_master` whose `product_id` values come from `output/auto_run/product_schema_master.csv`.

In [4]:
from google.cloud import bigquery
import csv

client = bigquery.Client()

csv_path = '/home/parshav-potato/Work/tradyon/generic_pipeline/output/auto_run/product_schema_master.csv'

product_ids = []
with open(csv_path, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        pid = (row.get('product_id') or '').strip()
        if pid:
            product_ids.append(pid)

if not product_ids:
    raise ValueError('No product_ids found in CSV; aborting delete.')

unique_ids = sorted(set(product_ids))

query = '''
DELETE FROM `dev-tradyon-data.tradyon.product_master`
WHERE product_id IN UNNEST(@product_ids)
'''

job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ArrayQueryParameter('product_ids', 'STRING', unique_ids)]
)

result = client.query(query, job_config=job_config)
result.result()

print(f'Deleted rows for {len(unique_ids)} product_ids from product_master')


FileNotFoundError: [Errno 2] No such file or directory: '/home/parshav-potato/Work/tradyon/generic_pipeline/output/auto_run/product_schema_master.csv'

## Delete shipment taxonomy rows
Deletes rows in `tradyon.shipment_master_taxonomy` whose `shipment_id` values appear in `output/auto_run/shipment_id_to_attr.csv`. Duplicate IDs in the CSV are deduplicated before deletion.

In [6]:
from google.cloud import bigquery
import csv

client = bigquery.Client()

csv_path = '/home/parshav-potato/Work/tradyon/generic_pipeline/sample_onion.csv'

shipment_ids = []
with open(csv_path, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        sid = (row.get('shipment_id') or '').strip()
        if sid:
            shipment_ids.append(sid)

if not shipment_ids:
    raise ValueError('No shipment_ids found in CSV; aborting delete.')

unique_ids = sorted(set(shipment_ids))

query = '''
DELETE FROM `dev-tradyon-data.tradyon.shipment_master_taxonomy`
WHERE shipment_id IN UNNEST(@shipment_ids)
'''

job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ArrayQueryParameter('shipment_ids', 'STRING', unique_ids)]
)

result = client.query(query, job_config=job_config)
result.result()

print(f'Deleted rows for {len(unique_ids)} shipment_ids in shipment_master_taxonomy')


Deleted rows for 25118 shipment_ids in shipment_master_taxonomy
