In [None]:
from google.cloud import storage
from google.cloud import bigquery

# Project that owns the destination dataset; first component of every table ID built below.
project_id = "automated-style-411721"
# Bucket and folder that locate the source files: gs://<bucket_name>/<folder_name>/<file>.
bucket_name = "arcs329e_data"
folder_name = "initial_load"
# Dataset that receives the loaded tables and is listed by the verification query.
dataset_name = "retails_raw"
# Location passed to the verification query job.
region = "US"

# Both clients are constructed without explicit project or credential arguments.
# NOTE(review): storage_client is not referenced by any cell visible in this notebook.
storage_client = storage.Client()
bq_client = bigquery.Client()

In [None]:
def create_load_table(file_name, table_name, schema, delimiter=","):
  """Create a BigQuery table if it does not exist and load a CSV from GCS into it.

  The table is created with the full ``schema``. The load job is given the
  same schema minus its last field, and replaces the table contents
  (``WRITE_TRUNCATE``). The caller's ``schema`` sequence is left unmodified,
  so the same object can be passed again safely.

  Args:
    file_name: Object name under ``gs://<bucket_name>/<folder_name>/``.
    table_name: Table to create/load inside ``project_id.dataset_name``.
    schema: Sequence of ``bigquery.SchemaField``. The last field is used for
      table creation only and is excluded from the load-job schema. The cells
      in this notebook put a TIMESTAMP field with a CURRENT_TIMESTAMP default
      there (presumably a column that is not present in the file).
    delimiter: Field delimiter of the CSV file.

  Raises:
    Whatever ``create_table``, ``load_table_from_uri`` or ``result()`` raise;
    nothing is caught here.
  """
  uri = "gs://{}/{}/{}".format(bucket_name, folder_name, file_name)
  table_id = "{}.{}.{}".format(project_id, dataset_name, table_name)

  table = bigquery.Table(table_id, schema=schema)
  # exists_ok=True: an already existing table is returned instead of raising,
  # so the message below is printed in both cases.
  table = bq_client.create_table(table, exists_ok=True)
  print("Created table {}".format(table.table_id))

  # Build the load schema from a copy instead of `del schema[-1]`: deleting in
  # place altered the caller's list, so re-running a load with the same list
  # would silently drop one more column each time (and fail on tuples).
  load_schema = list(schema)[:-1]

  job_config = bigquery.LoadJobConfig(
        schema=load_schema,
        skip_leading_rows=1,  # first line of the file is treated as a header
        source_format=bigquery.SourceFormat.CSV,
        write_disposition="WRITE_TRUNCATE",
        field_delimiter=delimiter,
        autodetect=True,
        allow_jagged_rows=True,
        allow_quoted_newlines=True,
        ignore_unknown_values=True,
        preserve_ascii_control_characters=True,
        use_avro_logical_types=True
  )

  load_job = bq_client.load_table_from_uri(uri, table_id, job_config=job_config)
  # Block until the load job finishes; a failed job raises here.
  load_job.result()

  destination_table = bq_client.get_table(table_id)
  print("Loaded {} rows.".format(destination_table.num_rows))


## mrds Table

In [None]:
file_name = 'mrds.csv'
table_name = 'mrds'

# All fields except the last are nullable STRINGs, so they are generated from their names.
schema = [
    bigquery.SchemaField(column, "STRING", mode="NULLABLE")
    for column in (
        "dep_id",
        "mrds_id",
        "mas_id",
        "site_name",
        "latitude",
        "longitude",
        "country",
        "state",
        "county",
        "com_type",
        "commod1",
        "commod2",
        "commod3",
    )
]
# load_time must stay last: create_load_table leaves the final field out of the load-job schema.
schema.append(
    bigquery.SchemaField("load_time", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table(file_name, table_name, schema)

Created table mrds
Loaded 304632 rows.


## mineral_ores_around_the_world Table

In [None]:
# Source file and destination table; note the table name is shorter than the file stem.
file_name = 'mineral_ores_around_the_world.csv'
table_name = 'mineral'

# Nullable STRING fields followed by load_time, which must stay last because
# create_load_table leaves the final field out of the load-job schema.
# (A commented-out, identical copy of this schema used to follow; it was dead code.)
schema = [
  bigquery.SchemaField("site_name", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("latitude", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("longitude", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("state", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("country", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("county", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("com_type", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("commod1", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("commod2", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("commod3", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("load_time", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP"),
]

create_load_table(file_name, table_name, schema)

Created table mineral
Loaded 304621 rows.


## sales_data_set Table


In [None]:
file_name = 'sales_data_set.csv'
table_name = 'sales_data_set'

# The nullable STRING fields are generated from their names, in declaration order.
schema = [
  bigquery.SchemaField(column, "STRING", mode="NULLABLE")
  for column in ("Store", "Dept", "Weekly_Sales", "IsHoliday")
]
# "Date" is the REQUIRED timestamp field with a CURRENT_TIMESTAMP default; it stays
# last because create_load_table leaves the final field out of the load-job schema.
schema.append(
  bigquery.SchemaField("Date", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table(file_name, table_name, schema)

Created table sales_data_set
Loaded 421570 rows.


## train Table

In [None]:
file_name = 'train.csv'
table_name = 'train'

# Nullable STRING fields, generated from their names.
# An INTEGER "onpromotion" field between "family" and "sales" is not part of this schema:
#   bigquery.SchemaField("onpromotion", "INTEGER", mode="NULLABLE")
schema = [
  bigquery.SchemaField(column, "STRING", mode="NULLABLE")
  for column in ("store_nbr", "family", "sales")
]
# The timestamp field stays last: create_load_table leaves the final field
# out of the load-job schema.
schema.append(
  bigquery.SchemaField("date", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table(file_name, table_name, schema)

Created table train
Loaded 3000888 rows.


## supplier Table

In [None]:
file_name = 'supplier.csv'
table_name = 'supplier'

# Nullable STRING fields, generated from their names in declaration order.
schema = [
  bigquery.SchemaField(column, "STRING", mode="NULLABLE")
  for column in (
    "supply_key",
    "supplier_nation_key",
    "supplier_comment",
    "supplier_name",
    "supplier_address",
    "supplier_phone",
    "supplier_account_balance",
  )
]
# The timestamp field stays last: create_load_table leaves the final field
# out of the load-job schema.
schema.append(
  bigquery.SchemaField("Date", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table(file_name, table_name, schema)

Created table supplier
Loaded 10000 rows.


## parts Table

In [None]:
# This cell previously still pointed at 'supplier.csv' / 'supplier' (copy-paste
# from the supplier cell), which truncated and reloaded the supplier table with
# the part_* column names below and never loaded parts at all.
# NOTE(review): the object name is assumed to follow the <table>.csv convention
# used by the other cells — confirm it exists in the bucket folder.
file_name = 'parts.csv'
table_name = 'parts'

# Nullable STRING fields followed by the timestamp field, which must stay last
# because create_load_table leaves the final field out of the load-job schema.
schema = [
  bigquery.SchemaField("part_key", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_type", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_size", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_brand", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_name", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_container", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_manufacturer", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_retail_price", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_comment", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("Date", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP"),
]

create_load_table(file_name, table_name, schema)

Created table supplier
Loaded 10000 rows.


## partsupp Table

In [None]:
# This cell previously still pointed at 'supplier.csv' / 'supplier' (copy-paste
# from the supplier cell), which truncated and reloaded the supplier table with
# the part_supply_* column names below and never loaded partsupp at all.
# NOTE(review): the object name is assumed to follow the <table>.csv convention
# used by the other cells — confirm it exists in the bucket folder.
file_name = 'partsupp.csv'
table_name = 'partsupp'

# Nullable STRING fields followed by the timestamp field, which must stay last
# because create_load_table leaves the final field out of the load-job schema.
schema = [
  bigquery.SchemaField("part_supply_part_key", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_supply_supply_key", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_supply_cost", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_supply_available_quantity", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("part_supply_comment", "STRING", mode="NULLABLE"),
  bigquery.SchemaField("Date", "TIMESTAMP", mode="NULLABLE", default_value_expression="CURRENT_TIMESTAMP"),
]

create_load_table(file_name, table_name, schema)

Created table supplier
Loaded 10000 rows.


## Verify Loads

In [None]:
# List every table in the dataset, alphabetically, to confirm the loads above.
sql = "select table_name from {}.INFORMATION_SCHEMA.TABLES order by table_name".format(dataset_name)

query_job = bq_client.query(
    sql,
    location=region,
)

# result() waits for the query to finish and returns the rows to iterate.
results = query_job.result()

# The loop variable is `row`, not `table_name`: the old loop rebound the
# module-level `table_name` that the load cells use, leaving it pointing at the
# last listed table after this cell ran.
for row in results:
    print("table:", row["table_name"])

table: mineral
table: mineral_ores_around_the_world
table: mrds
table: parts
table: partsupp
table: sales_data_set
table: store_data_set
table: supplier
table: train
