In [None]:
# @title Inspect the schema of bigquery-public-data.thelook_ecommerce.users
from google.cloud import bigquery

# Client built with no explicit arguments, then used to fetch the table's metadata.
client = bigquery.Client()
table = client.get_table('bigquery-public-data.thelook_ecommerce.users')

# Assemble the report first (row count, then one line per schema field)
# and emit it with a single print call.
schema_report = ["{} rows".format(table.num_rows)]
schema_report.extend(
  "Column {}: {}".format(field.name, field.field_type)
  for field in table.schema
)
print("\n".join(schema_report))


In [None]:
# @title Inspect the schema of bigquery-public-data.thelook_ecommerce.products
from google.cloud import bigquery

# Client built with no explicit arguments, then used to fetch the table's metadata.
client = bigquery.Client()
table = client.get_table('bigquery-public-data.thelook_ecommerce.products')

# Assemble the report first (row count, then one line per schema field)
# and emit it with a single print call.
schema_report = ["{} rows".format(table.num_rows)]
schema_report.extend(
  "Column {}: {}".format(field.name, field.field_type)
  for field in table.schema
)
print("\n".join(schema_report))


In [None]:
# @title Inspect the schema of bigquery-public-data.thelook_ecommerce.order_items
from google.cloud import bigquery

# Client built with no explicit arguments, then used to fetch the table's metadata.
client = bigquery.Client()
table = client.get_table('bigquery-public-data.thelook_ecommerce.order_items')

# Assemble the report first (row count, then one line per schema field)
# and emit it with a single print call.
schema_report = ["{} rows".format(table.num_rows)]
schema_report.extend(
  "Column {}: {}".format(field.name, field.field_type)
  for field in table.schema
)
print("\n".join(schema_report))


In [None]:
# @title Setup bigquery client and formatting
from google.colab import data_table
from google.cloud import bigquery

# Switch on Colab's DataFrame formatter for the result frames shown below.
data_table.enable_dataframe_formatter()

# Project ID inserted based on the query results selected to explore
project = 'airflow-demo-437509'

# Every query cell below submits its SQL through this client.
client = bigquery.Client(project=project)

In [None]:
# @title Executes the query

# Join every order item to its product and to the purchasing user.
#
# Three of the selected columns are called `id` and two are called
# `created_at`. They are aliased explicitly so the result schema does not
# depend on implicit de-duplication of column names: `created_at_1` is the
# name the follow-up queries in this notebook already expect; `id_1` / `id_2`
# follow the same suffix scheme (NOTE(review): confirm these match the names
# previously shown in `df` if anything downstream reads them).
sql = '''# prompt: Join these data sources

SELECT
  order_items.id,
  order_items.order_id,
  order_items.user_id,
  order_items.product_id,
  order_items.inventory_item_id,
  order_items.status,
  order_items.created_at,
  order_items.shipped_at,
  order_items.delivered_at,
  order_items.returned_at,
  order_items.sale_price,
  products.id AS id_1,
  products.cost,
  products.category,
  products.name,
  products.brand,
  products.retail_price,
  products.department,
  products.sku,
  products.distribution_center_id,
  users.id AS id_2,
  users.first_name,
  users.last_name,
  users.email,
  users.age,
  users.gender,
  users.state,
  users.street_address,
  users.postal_code,
  users.city,
  users.country,
  users.latitude,
  users.longitude,
  users.traffic_source,
  users.created_at AS created_at_1,
  users.user_geom
FROM
  `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.products` AS products
ON
  order_items.product_id = products.id
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.users` AS users
ON
  order_items.user_id = users.id;'''
query = client.query(sql)

# @title Render the query results

# client.query() already returned the job object, so it is read directly
# instead of being looked up again by id. `job` is kept as a name in case a
# later cell still refers to it.
job = query
df = job.to_dataframe()
df


In [None]:
# @title Executes the query

# Joined order/product/user data without any of the `*id` columns.
#
# The exported query selected from `SQL`, which is not a dataset-qualified
# table (presumably it was the label of the previous query step in the tool
# that generated this notebook) and cannot be resolved when the statement is
# submitted through client.query(). The join is therefore spelled out here so
# the cell runs on its own. The output column list is unchanged, including
# `created_at_1` for the user's creation timestamp.
sql1 = '''# prompt: remove column that end with id

SELECT
  order_items.status,
  order_items.created_at,
  order_items.shipped_at,
  order_items.delivered_at,
  order_items.returned_at,
  order_items.sale_price,
  products.cost,
  products.category,
  products.name,
  products.brand,
  products.retail_price,
  products.department,
  products.sku,
  users.first_name,
  users.last_name,
  users.email,
  users.age,
  users.gender,
  users.state,
  users.street_address,
  users.postal_code,
  users.city,
  users.country,
  users.latitude,
  users.longitude,
  users.traffic_source,
  users.created_at AS created_at_1
FROM
  `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.products` AS products
ON
  order_items.product_id = products.id
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.users` AS users
ON
  order_items.user_id = users.id;'''
query1 = client.query(sql1)

# @title Render the query results

# The job returned by client.query() is read directly; `job1` is kept as a
# name in case a later cell still refers to it.
job1 = query1
df1 = job1.to_dataframe()
df1


In [None]:
# @title Executes the query

# Same non-id column set as the previous step, restricted to order items
# whose status is 'Shipped'.
#
# The exported query selected from `SQL 1`, which is not a dataset-qualified
# table (presumably the label of the previous query step in the generating
# tool) and cannot be resolved by client.query(). The underlying join is
# written out so the cell is self-contained.
sql2 = '''# prompt: get all data but filter the status of order to be Shipped

SELECT
  order_items.status,
  order_items.created_at,
  order_items.shipped_at,
  order_items.delivered_at,
  order_items.returned_at,
  order_items.sale_price,
  products.cost,
  products.category,
  products.name,
  products.brand,
  products.retail_price,
  products.department,
  products.sku,
  users.first_name,
  users.last_name,
  users.email,
  users.age,
  users.gender,
  users.state,
  users.street_address,
  users.postal_code,
  users.city,
  users.country,
  users.latitude,
  users.longitude,
  users.traffic_source,
  users.created_at AS created_at_1
FROM
  `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.products` AS products
ON
  order_items.product_id = products.id
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.users` AS users
ON
  order_items.user_id = users.id
WHERE
  order_items.status = 'Shipped';'''
query2 = client.query(sql2)

# @title Render the query results

# The job returned by client.query() is read directly; `job2` is kept as a
# name in case a later cell still refers to it.
job2 = query2
df2 = job2.to_dataframe()
df2


In [None]:
# @title Executes the query

# Train a boosted-tree classifier that predicts the product category from
# the buyer's age, gender, city, country and traffic source.
#
# The exported statement trained on `SQL 2`, which is not a dataset-qualified
# table (presumably the label of the Shipped-orders query step in the
# generating tool) and cannot be resolved by client.query(). The training
# SELECT now reads the Shipped order items straight from the public tables.
# Model options and feature expressions are unchanged.
#
# NOTE(review): the target dataset name starts with an underscore and looks
# auto-generated; confirm it still exists in the project before re-running.
sql3 = '''# prompt:  Train a boosted tree classifier model in BigQuery to predict product categories based on age, gender, city, country, and traffic source by creating a model with CREATE OR REPLACE MODEL, specifying the target column as category, splitting data randomly with 20% for evaluation, and selecting relevant features from the dataset while ensuring data types are consistent.

CREATE OR REPLACE MODEL
  `airflow-demo-437509._b9e0c16f3d4adbfc14369851e36362f2e91e8a7a.boosted_tree_model` OPTIONS ( model_type = 'BOOSTED_TREE_CLASSIFIER',
    data_split_method = 'RANDOM',
    data_split_eval_fraction = 0.2,
    input_label_cols = ['category'],
    max_iterations = 100) AS
SELECT
  CAST(users.age AS FLOAT64) AS age,
  CASE WHEN users.gender = 'F' THEN 1 ELSE 0 END AS gender,
  users.city,
  users.country,
  users.traffic_source,
  products.category
FROM
  `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.products` AS products
ON
  order_items.product_id = products.id
INNER JOIN
  `bigquery-public-data.thelook_ecommerce.users` AS users
ON
  order_items.user_id = users.id
WHERE
  order_items.status = 'Shipped';'''
query3 = client.query(sql3)

# @title Render the query results

# The job returned by client.query() is read directly; `job3` is kept as a
# name in case a later cell still refers to it.
job3 = query3
df3 = job3.to_dataframe()
df3


In [None]:
# @title Executes the query

# Evaluate the boosted-tree model on Shipped order items with a known category.
#
# The exported statement fed ML.EVALUATE from `SQL 2`, which is not a
# dataset-qualified table (presumably the label of the Shipped-orders query
# step in the generating tool) and cannot be resolved by client.query(). The
# input SELECT now reads the same rows straight from the public tables, with
# the same feature expressions as the training query.
#
# NOTE(review): this evaluates on the same Shipped-orders selection the model
# was trained from rather than on a separate hold-out set; confirm that is
# intended.
sql4 = '''-- Evaluate Model Performance
SELECT
  *
FROM
  ML.EVALUATE(
    MODEL `airflow-demo-437509._b9e0c16f3d4adbfc14369851e36362f2e91e8a7a.boosted_tree_model`,
    (
      SELECT
        CAST(users.age AS FLOAT64) AS age,
        CASE WHEN users.gender = 'F' THEN 1 ELSE 0 END AS gender,
        users.city,
        users.country,
        users.traffic_source,
        products.category
      FROM
        `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
      INNER JOIN
        `bigquery-public-data.thelook_ecommerce.products` AS products
      ON
        order_items.product_id = products.id
      INNER JOIN
        `bigquery-public-data.thelook_ecommerce.users` AS users
      ON
        order_items.user_id = users.id
      WHERE
        order_items.status = 'Shipped'
        AND products.category IS NOT NULL
    )
  );
'''
query4 = client.query(sql4)

# @title Render the query results

# The job returned by client.query() is read directly; `job4` is kept as a
# name in case a later cell still refers to it.
job4 = query4
df4 = job4.to_dataframe()
df4


In [None]:
# @title Executes the query

# Run the boosted-tree model over Shipped order items and return the
# predicted category, its probability distribution, and the input features.
#
# The exported statement fed ML.PREDICT from `SQL 2`, which is not a
# dataset-qualified table (presumably the label of the Shipped-orders query
# step in the generating tool) and cannot be resolved by client.query(). The
# input SELECT now reads the same rows straight from the public tables, with
# the same feature expressions as the training query.
sql5 = '''-- Inference Query for Boosted Tree Model
SELECT
  predicted_category,          -- The predicted category
  predicted_category_probs,    -- Probability distribution over all categories
  age,                         -- Input features for reference
  gender,
  city,
  country,
  traffic_source
FROM
  ML.PREDICT(
    MODEL `airflow-demo-437509._b9e0c16f3d4adbfc14369851e36362f2e91e8a7a.boosted_tree_model`,
    (
      -- Replace this SELECT with your own input rows if needed
      SELECT
        CAST(users.age AS FLOAT64) AS age,
        CASE WHEN users.gender = 'F' THEN 1 ELSE 0 END AS gender,
        users.city,
        users.country,
        users.traffic_source
      FROM
        `bigquery-public-data.thelook_ecommerce.order_items` AS order_items
      INNER JOIN
        `bigquery-public-data.thelook_ecommerce.products` AS products
      ON
        order_items.product_id = products.id
      INNER JOIN
        `bigquery-public-data.thelook_ecommerce.users` AS users
      ON
        order_items.user_id = users.id
      WHERE
        order_items.status = 'Shipped'
    )
  );
'''
query5 = client.query(sql5)

# @title Render the query results

# The job returned by client.query() is read directly; `job5` is kept as a
# name in case a later cell still refers to it.
job5 = query5
df5 = job5.to_dataframe()
df5
