In [23]:
import os
import pandas as pd
import math
import threading
from google.cloud import bigquery

In [6]:
# BigQuery destination: project, dataset and table the rows are loaded into.
PROJECT_ID = "integrated-bit-312717"
DATASET_NAME = 'customs'
TABLE_NAME = "customs_test"

# Fully-qualified table id in "project.dataset.table" form.
table_id = ".".join([PROJECT_ID, DATASET_NAME, TABLE_NAME])

In [3]:
# BigQuery client for PROJECT_ID in the "US" location; the upload function
# defined further down uses this same instance for every insert.
client = bigquery.Client(
    location="US",
    project=PROJECT_ID,
)

In [None]:
# Create the dataset. exists_ok=True keeps this cell re-runnable: without it
# the call raises a Conflict error once the dataset already exists.
client.create_dataset(DATASET_NAME, exists_ok=True)

In [7]:
# Create the destination table with an explicit schema so that the streamed
# row dicts (keys: country, hs_code, year_month, value) have columns to land
# in. Column types are inferred from how those dicts are built in this
# notebook -- NOTE(review): adjust if the table was designed differently.
# exists_ok=True returns the existing table instead of raising on a re-run.
client.create_table(
    bigquery.Table(
        table_id,
        schema=[
            bigquery.SchemaField("country", "STRING"),
            bigquery.SchemaField("hs_code", "STRING"),
            bigquery.SchemaField("year_month", "STRING"),
            bigquery.SchemaField("value", "INTEGER"),
        ],
    ),
    exists_ok=True,
)

Table(TableReference(DatasetReference('integrated-bit-312717', 'customs'), 'customs_test'))

In [13]:
# Root directory whose immediate sub-directories are scanned for files to upload.
base = "bucket/merge_dataset_customs"

In [15]:
# One path per immediate sub-directory of `base`. os.listdir returns names in
# arbitrary order, so sort to get a reproducible listing (and a stable mapping
# between thread number and directory in the upload cell).
candidate_paths = (os.path.join(base, name) for name in os.listdir(base))
main_dirs = sorted(path for path in candidate_paths if os.path.isdir(path))
main_dirs

['bucket/merge_dataset_customs/2001',
 'bucket/merge_dataset_customs/2002',
 'bucket/merge_dataset_customs/2003',
 'bucket/merge_dataset_customs/2004',
 'bucket/merge_dataset_customs/2005',
 'bucket/merge_dataset_customs/2006',
 'bucket/merge_dataset_customs/2007',
 'bucket/merge_dataset_customs/2008',
 'bucket/merge_dataset_customs/2009',
 'bucket/merge_dataset_customs/2010',
 'bucket/merge_dataset_customs/2011',
 'bucket/merge_dataset_customs/2012',
 'bucket/merge_dataset_customs/2013',
 'bucket/merge_dataset_customs/2014',
 'bucket/merge_dataset_customs/2015',
 'bucket/merge_dataset_customs/2016',
 'bucket/merge_dataset_customs/2017',
 'bucket/merge_dataset_customs/2018',
 'bucket/merge_dataset_customs/2019',
 'bucket/merge_dataset_customs/2020',
 'bucket/merge_dataset_customs/2021']

In [24]:
# Display the fully-qualified destination table id as a sanity check.
table_id

'integrated-bit-312717.customs.customs_test'

In [30]:
def check_value(value):
    """Convert a numeric cell to an int, mapping NaN to 0.

    Any non-NaN value is passed through ``int()``, which truncates toward zero.
    """
    return 0 if math.isnan(value) else int(value)

def tranform_merge_df2jsonBQ(file_path):
    """Turn one wide CSV into a list of row dicts for a BigQuery JSON insert.

    The CSV must contain a ``Country`` column; every other column header is
    used as a ``year_month`` label. Each (CSV row, non-Country column) cell
    becomes one dict with the keys ``country``, ``hs_code``, ``year_month``
    and ``value`` (the cell passed through ``check_value``).

    Args:
        file_path: Path to the CSV file. The name of the directory that
            contains the file is stored as ``hs_code``.

    Returns:
        A list of dicts in row-major order (every value column of the first
        CSV row, then the second row, ...). Empty when the CSV has no data
        rows.

    Raises:
        ValueError: If the CSV has no ``Country`` column.
    """
    df = pd.read_csv(file_path)

    # Name of the containing directory. os.path is used instead of splitting
    # on '/' so the parsing agrees with the os.path.join calls that build
    # these paths, whatever the platform separator is.
    # NOTE(review): in the paths this notebook logs (e.g.
    # bucket/merge_dataset_customs/2021/0101.csv) the directory looks like a
    # year and the file stem like an HS code -- confirm that hs_code should
    # not come from the file name instead.
    hs_code = os.path.basename(os.path.dirname(file_path))

    # The value columns are the same for every row, so compute them once.
    value_columns = list(df.columns)
    value_columns.remove('Country')

    datas = []
    for _, row in df.iterrows():
        country = row['Country']
        datas.extend(
            {
                'country': country,
                'hs_code': hs_code,
                'year_month': column,
                'value': check_value(row[column]),
            }
            for column in value_columns
        )
    return datas

def upload2bq_thread(main_path):
    """Upload the rows of every file directly inside ``main_path`` to BigQuery.

    Each regular file is converted with ``tranform_merge_df2jsonBQ`` and the
    resulting rows are sent to ``table_id`` through the module-level
    ``client``. The function is used as a thread target, so a failure on one
    file must not stop the rest: any exception raised while reading or
    inserting a file is appended to the module-level ``bq_errs`` list,
    reported on stdout together with the file path, and processing continues
    with the next file.

    Args:
        main_path: Directory whose files are uploaded.
    """
    for name in os.listdir(main_path):
        path = os.path.join(main_path, name)
        # Skip anything that is not a regular file (for example a
        # .ipynb_checkpoints folder); a directory is not a readable CSV.
        if not os.path.isfile(path):
            continue
        try:
            rows_to_insert = tranform_merge_df2jsonBQ(path)
            if not rows_to_insert:
                # Nothing to send for a file without data rows.
                continue
            errors = client.insert_rows_json(table_id, rows_to_insert)
        except Exception as e:
            # Keep the exception for the caller and say which file failed;
            # previously the failure was recorded without any visible trace.
            bq_errs.append(e)
            print("Failed to upload {}: {!r}".format(path, e))
            continue
        if errors:
            # insert_rows_json reports per-row problems in its return value.
            print("Encountered errors while inserting rows: {}".format(errors))
        else:
            print("New rows have been added. | ", path)

# parallel upload to BQ

In [31]:
# Start one upload thread per directory in main_dirs, then wait for all of them.
t = []
bq_errs = []  # exceptions caught inside upload2bq_thread, shared by all threads
for main_path in main_dirs:
    worker = threading.Thread(target=upload2bq_thread, args=(main_path,))
    worker.start()
    t.append(worker)
for th_id, thread in enumerate(t):
    print("Main    : before joining thread {}".format(th_id))
    thread.join()
    print("Main.   : thread {} done".format(th_id))

# Surface whatever the workers caught; otherwise failed uploads go unnoticed
# because the threads never re-raise.
if bq_errs:
    print("{} upload error(s) were caught:".format(len(bq_errs)))
    for err in bq_errs:
        print("  {!r}".format(err))

Main    : before joining thread 0
New rows have been added. |  bucket/merge_dataset_customs/2021/0101.csv
New rows have been added. |  New rows have been added. |  bucket/merge_dataset_customs/2006/0101.csvbucket/merge_dataset_customs/2014/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2007/0101.csv
New rows have been added. | New rows have been added. | 
  bucket/merge_dataset_customs/2004/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2019/0101.csv
bucket/merge_dataset_customs/2005/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2010/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2015/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2002/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2009/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2008/0101.csv
New rows have been added. |  bucket/merge_dataset_customs/2013/0101.csv
New rows have been added. |  b