#Final Project - Option 2: Data Enrichment at scale

##Setup

#####Create a dataset for storing the AI tables and another for storing the AI models

In [None]:
%%bigquery
CREATE SCHEMA `automated-style-411721`.retails_stg_ai

Executing query with job ID: 447c8fe6-c2d7-4550-ab5c-d415eedd0955
Query executing: 0.55s


ERROR:
 409 Already Exists: Dataset automated-style-411721:retails_stg_ai

Location: US
Job ID: 447c8fe6-c2d7-4550-ab5c-d415eedd0955



In [None]:
%%bigquery
CREATE SCHEMA `automated-style-411721`.remote_models;

Executing query with job ID: b1ec8eb0-9d56-40c5-8a43-ac9c5198ff74
Query executing: 0.42s


ERROR:
 409 Already Exists: Dataset automated-style-411721:remote_models

Location: US
Job ID: b1ec8eb0-9d56-40c5-8a43-ac9c5198ff74



#####Before running this cell, create the remote connection and assign the IAM role `Vertex AI User` to the service account associated with the connection.

In [None]:
%%bigquery
create or replace model remote_models.gemini_pro
  remote with connection `projects/automated-style-411721/locations/us/connections/vertex_connection`
  options (endpoint = 'gemini-pro');

Query is running:   0%|          |

##Part 1: Fill null values for MRDS Com_Types

####Observe the data we are working with

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, latitude, longitude, state, county, data_source, load_time, county_predictions, mineral_type, type)
from retails_stg.Mrds
where commod1 is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,site_name,region,country,com_type,commod1,commod2,commod3
0,10400538,Nichols Ranch ISL,,United States,,Uranium,,
1,10400618,Lisbon Valley Uranium Project,,United States,,Uranium,,
2,10305956,Michigan Au Mining Company,,United States,M,Gold,,
3,10307397,Unnamed (Don Miller Hills area),,United States,,Zinc,"Lead, Molybdenum, Silver, Iron, Copper, Barium...",
4,10307907,Johnston Creek,,United States,,Gold,,


In [None]:
%%bigquery
select count(*)
from retails_stg.Mrds
where com_type is null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,3817


#### Test the generate_text function

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a commodity type for each site. Return the output as json, include the Dep_ID and com_type in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
                  "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
    from retails_stg.Mrds
    order by Dep_ID
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{""Dep_ID"":10000001,""com_type"":""M"",""co...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
1,"```json\n[\n {""Dep_ID"":10000002,""com_type"":""M...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
2,"```json\n[\n {\n ""Dep_ID"": 10000003,\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
3,"```json\n{""Dep_ID"":10000004,""com_type"":""M"",""co...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
4,"```json\n{""Dep_ID"":10000005,""com_type"":""M"",""co...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
5,"```json\n{""Dep_ID"":10000006,""com_type"":""M"",""co...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
6,"```json\n[\n {\n ""Dep_ID"": 10000007,\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
7,"```json\n[\n {\n ""Dep_ID"": 10000008,\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
8,"```json\n{""Dep_ID"":10000009,""com_type"":""M"",""co...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...
9,"```json\n[\n {\n ""Dep_ID"": 10000010,\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a commodity type for each site. Return...


####Tweak the prompt and save the output

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one commodity type for one dep_id. Return the output as json, include the Dep_ID and com_type in the output";
create or replace table retails_stg_ai.com_type_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
              "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
      from retails_stg.Mrds
      where com_type is null
      order by Dep_ID
      limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.com_type_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{""Dep_ID"":10000003,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
1,"```json\n{""Dep_ID"":10000007,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
2,"```json\n{""Dep_ID"":10000001,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
3,"```json\n{""Dep_ID"":10000005,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
4,"```json\n{""Dep_ID"":10000009,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
5,"```json\n{""Dep_ID"":10000010,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
6,"```json\n{""Dep_ID"":10000008,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
7,"```json\n{""Dep_ID"":10000002,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
8,"```json\n{""Dep_ID"":10000004,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...
9,"```json\n{""Dep_ID"":10000006,""com_type"":""M"",""co...",Suggest one commodity type for one dep_id. Ret...


####Format the output to proper json

In [None]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.com_type_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{""Dep_ID"":10000003,""com_type"":""M"",""co...","{""Dep_ID"":10000003,""com_type"":""M"",""commod1"":""C..."
1,"```json\n{""Dep_ID"":10000007,""com_type"":""M"",""co...","{""Dep_ID"":10000007,""com_type"":""M"",""commod1"":""L..."
2,"```json\n{""Dep_ID"":10000001,""com_type"":""M"",""co...","{""Dep_ID"":10000001,""com_type"":""M"",""commod1"":""C..."
3,"```json\n{""Dep_ID"":10000005,""com_type"":""M"",""co...","{""Dep_ID"":10000005,""com_type"":""M"",""commod1"":""G..."
4,"```json\n{""Dep_ID"":10000009,""com_type"":""M"",""co...","{""Dep_ID"":10000009,""com_type"":""M"",""commod1"":""S..."
5,"```json\n{""Dep_ID"":10000010,""com_type"":""M"",""co...","{""Dep_ID"":10000010,""com_type"":""M"",""commod1"":""C..."
6,"```json\n{""Dep_ID"":10000008,""com_type"":""M"",""co...","{""Dep_ID"":10000008,""com_type"":""M"",""commod1"":""G..."
7,"```json\n{""Dep_ID"":10000002,""com_type"":""M"",""co...","{""Dep_ID"":10000002,""com_type"":""M"",""commod1"":""C..."
8,"```json\n{""Dep_ID"":10000004,""com_type"":""M"",""co...","{""Dep_ID"":10000004,""com_type"":""M"",""commod1"":""G..."
9,"```json\n{""Dep_ID"":10000006,""com_type"":""M"",""co...","{""Dep_ID"":10000006,""com_type"":""M"",""commod1"":""C..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.com_type_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.com_type_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.com_type') as com_type
from retails_stg_ai.com_type_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,com_type
0,10000001,M
1,10000008,M
2,10000009,M
3,10000003,M
4,10000004,M
5,10000002,M
6,10000006,M
7,10000005,M
8,10000010,M
9,10000007,M


####Add the com_types field to the Mrds table

In [None]:
%%bigquery
alter table retails_stg.Mrds add column com_types_predictions string;

Query is running:   0%|          |

####Update the Mrds records with the predicted com_type

In [None]:
%%bigquery
update retails_stg.Mrds set com_types_predictions =
  (select json_value(ml_generate_text_llm_result, '$.com_type')
   from retails_stg_ai.com_type_predictions_formatted_10
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

Query is running:   0%|          |

####Inspect the output

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, latitude, longitude, state, county, data_source, load_time, county_predictions, mineral_type, type)
from retails_stg.Mrds
where com_types_predictions is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,site_name,region,country,com_type,commod1,commod2,commod3,com_types_predictions
0,10000001,Lookout Prospect,,United States,M,Copper,"Gold, Silver",,M
1,10000002,Lucky Find Prospect,,United States,M,Copper,Gold,,M
2,10000003,Mccullough Prospect,,United States,M,Copper,,"Zinc, Gold",M
3,10000004,Lucky Jim Claim,,United States,M,Gold,,"Copper, Lead",M
4,10000005,Matilda Occurrence,,United States,M,Gold,,,M
5,10000006,Marion Prospect,,United States,M,Copper,,Lead,M
6,10000007,Marble Heart Prospect,,United States,M,Lead,,,M
7,10000008,Morning Star Prospect,,United States,M,Gold,Copper,Iron,M
8,10000009,Monday Prospect,,United States,M,Silver,Gold,Lead,M
9,10000010,Miller Brothers Claim,,United States,M,"Copper, Gold",,,M


####Apply at larger scale

In [None]:
%%bigquery
select count(*)
from retails_stg.Mrds_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,10000


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one commodity type for one dep_id. Return the output as json, include the Dep_ID and com_type in the output";
create or replace table retails_stg_ai.com_type_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
              "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
      from retails_stg.Mrds
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Executing query with job ID: dee894c8-22c2-4ca4-8056-6caab26f8158
Query executing: 3809.96s

In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = 'a3dff4b9-1a00-4d16-8f65-26c160f1e3e7'

In [None]:
%%bigquery
create or replace table retails_stg_ai.com_type_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.com_type_predictions_raw_10k

In [None]:
%%bigquery
select count(*) as county_count
from retails_stg_ai.com_type_predictions_formatted_10k

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.com_type') as com_types_predictions
from retails_stg_ai.com_type_predictions_formatted_10k

In [None]:
%%bigquery
update retails_stg.Mrds set com_types_predictions =
  (select json_value(ml_generate_text_llm_result, '$.com_type')
   from retails_stg_ai.com_type_predictions_formatted_10k
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

In [None]:
%%bigquery
select *
from retails_stg.Mrds
where com_types_predictions is not null

In [None]:
%%bigquery
select commod1, com_types_predictions, count(*) as count
from retails_stg.Mrds
where com_types_predictions is not null
group by commod1, com_types_predictions
order by count(*) desc

In [None]:
%%bigquery
update retails_stg.Mrds
  set data_source = 'usgs' where com_types_predictions is not null

##Part 2: Fill null values for Mrds commod1

####Observe the data we are working with

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, latitude, longitude, state, county, data_source, load_time, county_predictions, mineral_type, type, com_types_predictions)
from retails_stg.Mrds
where com_type is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,site_name,region,country,com_type,commod1,commod2,commod3
0,10306770,Daggett Hill Iron Pit,,United States,M,Iron,Manganese,
1,10306923,Columbus Area-Sand and Gravel,,United States,N,"Sand and Gravel, Construction",,
2,10400284,Silver King prospect,,United States,M,,Gold,
3,10310660,Original Amador Mine,,United States,M,Gold,,
4,10400312,"Rundle, Mills, and Casler Property",,United States,M,Tungsten,,


In [None]:
%%bigquery
select count(*)
from retails_stg.Mrds
where commod1 is null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,21031


#### Test the generate_text function

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a primary commodity for each site. Return the output as json, include the Dep_ID and commod1 in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
                  "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
    from retails_stg.Mrds
    where commod1 is null
    order by Dep_ID
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n[\n {\n ""Dep_ID"": 10000620,\n ...",,,Suggest a primary commodity for each site. Ret...
1,"```json\n[\n {\n ""Dep_ID"": 10000634,\n ...",,,Suggest a primary commodity for each site. Ret...
2,"```json\n[\n {\n ""Dep_ID"": 10001351,\n ...",,,Suggest a primary commodity for each site. Ret...
3,"```json\n{""Dep_ID"":10001971,""com_type"":""M"",""co...",,,Suggest a primary commodity for each site. Ret...
4,"```json\n{""Dep_ID"":10001973,""com_type"":""M"",""co...",,,Suggest a primary commodity for each site. Ret...
5,"```json\n[\n {\n ""Dep_ID"": 10002009,\n ...",,,Suggest a primary commodity for each site. Ret...
6,"```json\n[\n {\n ""Dep_ID"": 10002033,\n ...",,,Suggest a primary commodity for each site. Ret...
7,"```json\n[\n {\n ""Dep_ID"": 10002115,\n ...",,,Suggest a primary commodity for each site. Ret...
8,"```json\n[\n {\n ""Dep_ID"": 10002383,\n ...",,,Suggest a primary commodity for each site. Ret...
9,"```json\n[\n {\n ""Dep_ID"": 10002406,\n ...",,,Suggest a primary commodity for each site. Ret...


####Tweak the prompt and save the output

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one primary commodity for one site. Return the output as json, include the Dep_ID and commod1 in the output";
create or replace table retails_stg_ai.commod1_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
                    "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
      from retails_stg.Mrds
      where commod1 is null
      order by Dep_ID
      limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.commod1_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{""Dep_ID"":10001351,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
1,"```json\n{""Dep_ID"":10002033,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
2,"```json\n{""Dep_ID"":10000620,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
3,"```json\n{""Dep_ID"":10002383,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
4,"```json\n{""Dep_ID"":10001973,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
5,"```json\n{""Dep_ID"":10002406,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
6,"```json\n{""Dep_ID"":10002115,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
7,"```json\n{""Dep_ID"":10000634,""com_type"":""M"",""co...",Suggest one primary commodity for one site. Re...
8,"{""Dep_ID"":10001971,""com_type"":""M"",""commod1"":nu...",Suggest one primary commodity for one site. Re...
9,"```json\n{""Dep_ID"":10002009,""com_type"":""B"",""co...",Suggest one primary commodity for one site. Re...


####Format the output to proper json

In [None]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.commod1_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{""Dep_ID"":10001351,""com_type"":""M"",""co...","{""Dep_ID"":10001351,""com_type"":""M"",""commod1"":""C..."
1,"```json\n{""Dep_ID"":10002033,""com_type"":""M"",""co...","{""Dep_ID"":10002033,""com_type"":""M"",""commod1"":""T..."
2,"```json\n{""Dep_ID"":10000620,""com_type"":""M"",""co...","{""Dep_ID"":10000620,""com_type"":""M"",""commod1"":""C..."
3,"```json\n{""Dep_ID"":10002383,""com_type"":""M"",""co...","{""Dep_ID"":10002383,""com_type"":""M"",""commod1"":""C..."
4,"```json\n{""Dep_ID"":10001973,""com_type"":""M"",""co...","{""Dep_ID"":10001973,""com_type"":""M"",""commod1"":""T..."
5,"```json\n{""Dep_ID"":10002406,""com_type"":""M"",""co...","{""Dep_ID"":10002406,""com_type"":""M"",""commod1"":""C..."
6,"```json\n{""Dep_ID"":10002115,""com_type"":""M"",""co...","{""Dep_ID"":10002115,""com_type"":""M"",""commod1"":""T..."
7,"```json\n{""Dep_ID"":10000634,""com_type"":""M"",""co...","{""Dep_ID"":10000634,""com_type"":""M"",""commod1"":""L..."
8,"{""Dep_ID"":10001971,""com_type"":""M"",""commod1"":nu...","{""Dep_ID"":10001971,""com_type"":""M"",""commod1"":nu..."
9,"```json\n{""Dep_ID"":10002009,""com_type"":""B"",""co...","{""Dep_ID"":10002009,""com_type"":""B"",""commod1"":""C..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.commod1_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.commod1_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.commod1') as commod1
from retails_stg_ai.commod1_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,commod1
0,10001351,Copper
1,10001971,
2,10001973,Tungsten
3,10000620,Copper
4,10002115,"Tungsten, Tin"
5,10002009,Chromium
6,10000634,"Lead, Copper"
7,10002033,Tungsten
8,10002383,Copper
9,10002406,"Copper, Silver"


####Add the commod1 field to the Mrds table

In [None]:
%%bigquery
alter table retails_stg.Mrds add column commod1_predictions string;

Query is running:   0%|          |

####Update the Mrds records with the predicted commod1

In [None]:
%%bigquery
update retails_stg.Mrds set commod1_predictions =
  (select json_value(ml_generate_text_llm_result, '$.commod1')
   from retails_stg_ai.commod1_predictions_formatted_10
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

Query is running:   0%|          |

####Inspect the output

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, latitude, longitude, state, county, data_source, load_time, county_predictions, mineral_type, type, com_types_predictions)
from retails_stg.Mrds
where commod1_predictions is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,site_name,region,country,com_type,commod1,commod2,commod3,commod1_predictions
0,10000620,Little Caribou Occurrence,,United States,M,,"Copper, Antimony, Zinc",,Copper
1,10000634,Ridgetop Prospect,,United States,M,,"Lead, Copper",,"Lead, Copper"
2,10001351,Unnamed Occurrence,,United States,M,,,Copper,Copper
3,10001973,Unnamed Occurrence,,United States,M,,,Tungsten,Tungsten
4,10002009,Cape Nome,,United States,B,,"Chromium, Gold, Tungsten, Fluorine-Fluorite","Chromium, Fluorine-Fluorite",Chromium
5,10002033,Thompson Creek Placer,,United States,M,,,Tungsten,Tungsten
6,10002115,Upper Bonanza Creek Occurrence,,United States,M,,,"Tungsten, Tin","Tungsten, Tin"
7,10002383,Cascade Copper Prospect,,United States,M,,,Copper,Copper
8,10002406,Arms Copper Claim,,United States,M,,,"Copper, Silver","Copper, Silver"


####Apply at larger scale

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one primary commodity for one site. Return the output as json, include the Dep_ID and commod1 in the output";
create or replace table retails_stg_ai.comomd1_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "region", region,
                    "country", country, "com_type", com_type, "commod1", commod1, "commod2", commod2, "commod3", commod3))) as prompt
      from retails_stg.Mrds
      where commod1 is null
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = ''

In [None]:
%%bigquery
create or replace table retails_stg_ai.commod1_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.comomd1_predictions_raw_10k

In [None]:
%%bigquery
select count(*) as county_count
from retails_stg_ai.commod1_predictions_formatted_10k

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.commod1') as commod1_predictions
from retails_stg_ai.commod1_predictions_formatted_10k

In [None]:
%%bigquery
update retails_stg.Mrds set commod1_predictions =
  (select json_value(ml_generate_text_llm_result, '$.commod1')
   from retails_stg_ai.commod1_predictions_formatted_10k
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

In [None]:
%%bigquery
select *
from retails_stg.Mrds
where commod1_predictions is not null

In [None]:
%%bigquery
select com_type, commod1_predictions, count(*) as count
from retails_stg.Mrds
where commod1_predictions is not null
group by com_type, commod1_predictions
order by count(*) desc

In [None]:
%%bigquery
update retails_stg.Mrds
  set data_source = 'usgs' where commod1_predictions is not null

##Part 3: Fill null values for MRDS state

####Observe the data we are working with

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, com_type, commod1, commod2, commod3, data_source, load_time, county_predictions, mineral_type, type, com_types_predictions, commod1_predictions)
from retails_stg.Mrds
where country is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,site_name,latitude,longitude,region,country,state,county
0,10307418,Unnamed (Muklung Hills area),59.29933,-158.30266,,United States,Alaska,
1,10311019,Blanca District,37.57169,-105.48779,,United States,Colorado,Costilla
2,10400147,Eliana,13.76692,75.71714,SA,Peru,,
3,10306988,Tri-State Brick and Tile Clay Mine-Hinds County,32.36768,-90.21675,,United States,Mississippi,Hinds
4,10308367,Baggage,63.66976,-141.10262,,United States,Alaska,


In [None]:
%%bigquery
select count(*)
from retails_stg.Mrds
where state is null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,12484


#### Test the generate_text function

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a state name for each site. Return the output as json, include the Dep_ID and state in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                  "longitude", longitude, "region", region, "country", country, "state", state, "county", county))) as prompt
    from retails_stg.Mrds
    where state is null
    and country = 'United States' or country = 'Mexico' or country = 'Canada'
    order by Dep_ID
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,,,,Suggest a state name for each site. Return the...
1,,,,Suggest a state name for each site. Return the...
2,"```json\n{""Dep_ID"":10008157,""country"":""Mexico""...",,,Suggest a state name for each site. Return the...
3,,,,Suggest a state name for each site. Return the...
4,"```json\n[\n {\n ""Dep_ID"": 10008159,\n ...",,,Suggest a state name for each site. Return the...
5,"```json\n[\n {\n ""Dep_ID"": 10008160,\n ...",,,Suggest a state name for each site. Return the...
6,,,,Suggest a state name for each site. Return the...
7,,,,Suggest a state name for each site. Return the...
8,,,,Suggest a state name for each site. Return the...
9,,,,Suggest a state name for each site. Return the...


####Tweak the prompt and save the output

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one state or province for one site. Return the output as json, include the Dep_ID and state in the output";
create or replace table retails_stg_ai.state_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
        select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                    "longitude", longitude, "region", region, "country", country, "state", state, "county", county))) as prompt
      from retails_stg.Mrds
      where state is null
      order by Dep_ID
      limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.state_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,,Suggest one state or province for one site. Re...
1,"```json\n{""Dep_ID"":10003683,""country"":""Botswan...",Suggest one state or province for one site. Re...
2,"```json\n{""Dep_ID"":10003687,""country"":""Botswan...",Suggest one state or province for one site. Re...
3,"```json\n{""Dep_ID"":10003689,""country"":""Botswan...",Suggest one state or province for one site. Re...
4,"```json\n{""Dep_ID"":10003681,""country"":""Algeria...",Suggest one state or province for one site. Re...
5,,Suggest one state or province for one site. Re...
6,,Suggest one state or province for one site. Re...
7,"```json\n{""Dep_ID"":10003688,""country"":""Botswan...",Suggest one state or province for one site. Re...
8,"```json\n{""Dep_ID"":10003682,""country"":""Burma"",...",Suggest one state or province for one site. Re...
9,"```json\n{""Dep_ID"":10003686,""country"":""Botswan...",Suggest one state or province for one site. Re...


####Format the output to proper json

In [None]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.state_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,,
1,"```json\n{""Dep_ID"":10003683,""country"":""Botswan...","{""Dep_ID"":10003683,""country"":""Botswana"",""count..."
2,"```json\n{""Dep_ID"":10003687,""country"":""Botswan...","{""Dep_ID"":10003687,""country"":""Botswana"",""count..."
3,"```json\n{""Dep_ID"":10003689,""country"":""Botswan...","{""Dep_ID"":10003689,""country"":""Botswana"",""count..."
4,"```json\n{""Dep_ID"":10003681,""country"":""Algeria...","{""Dep_ID"":10003681,""country"":""Algeria"",""county..."
5,,
6,,
7,"```json\n{""Dep_ID"":10003688,""country"":""Botswan...","{""Dep_ID"":10003688,""country"":""Botswana"",""count..."
8,"```json\n{""Dep_ID"":10003682,""country"":""Burma"",...","{""Dep_ID"":10003682,""country"":""Burma"",""county"":..."
9,"```json\n{""Dep_ID"":10003686,""country"":""Botswan...","{""Dep_ID"":10003686,""country"":""Botswana"",""count..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.state_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.state_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.state') as state_predictions
from retails_stg_ai.state_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,state_predictions
0,,
1,,
2,,
3,10003682.0,Shan
4,10003681.0,Adrar
5,10003687.0,Central
6,10003688.0,Gaborone
7,10003683.0,Central
8,10003686.0,Gaborone
9,10003689.0,Gaborone


####Add the state field to the Mrds table

In [None]:
%%bigquery
alter table retails_stg.Mrds add column state_predictions string;

####Update the Mrds records with the predicted state

In [None]:
%%bigquery
update retails_stg.Mrds set state_predictions =
  (select json_value(ml_generate_text_llm_result, '$.state')
   from retails_stg_ai.state_predictions_formatted_10
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

####Inspect the output

In [None]:
%%bigquery
select * except (url, mrds_id, mas_id, com_type, commod1, commod2, commod3, data_source, load_time, county_predictions, mineral_type, type, com_types_predictions, commod1_predictions)
from retails_stg.Mrds
where state_predictions is not null

####Apply at larger scale

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one state or province for one site. Return the output as json, include the Dep_ID and state in the output";
create or replace table retails_stg_ai.state_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
        select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                    "longitude", longitude, "region", region, "country", country, "state", state, "county", county))) as prompt
      from retails_stg.Mrds
      where state is null
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = ''

In [None]:
%%bigquery
create or replace table retails_stg_ai.state_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.state_predictions_raw_10k

In [None]:
%%bigquery
select count(*) as county_count
from retails_stg_ai.state_predictions_formatted_10k

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.state') as state_predictions
from retails_stg_ai.state_predictions_formatted_10k

In [None]:
%%bigquery
update retails_stg.Mrds set state_predictions =
  (select json_value(ml_generate_text_llm_result, '$.state')
   from retails_stg_ai.state_predictions_formatted_10k
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

In [None]:
%%bigquery
select *
from retails_stg.Mrds
where state_predictions is not null

In [None]:
%%bigquery
select country, state_predictions, count(*) as count
from retails_stg.Mrds
where state_predictions is not null
group by country, state_predictions
order by count(*) desc

In [None]:
%%bigquery
update retails_stg.Mrds
  set data_source = 'usgs' where state_predictions is not null

## Part 4: Parts.material


### Observe the data we are working with

In [5]:
%%bigquery
select * except (size_description, p_size, p_brand, p_name, p_container, p_mfgr, p_retailprice, data_source, load_time, p_comment)
from retails_stg.Parts
where material is not null
limit 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,p_partkey,material,materials_predictions
0,84151,NICKEL,
1,71794,TIN,
2,77224,COPPER,
3,82043,STEEL,
4,129293,BRASS,
5,196804,COPPER,
6,82788,STEEL,
7,199530,TIN,
8,158424,TIN,
9,37120,TIN,


In [10]:
%%bigquery
SELECT *
FROM retails_stg.Parts
WHERE
p_partkey IS NULL OR

p_size IS NULL OR
p_brand IS NULL OR
p_name IS NULL OR
p_container IS NULL OR
p_mfgr IS NULL OR
p_retailprice IS NULL OR
size_description IS NULL OR
material IS NULL OR

data_source IS NULL OR
load_time IS NULL OR
p_comment IS NULL;



Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,p_partkey,size_description,material,p_size,p_brand,p_name,p_container,p_mfgr,p_retailprice,p_comment,data_source,load_time,materials_predictions


In [11]:
%%bigquery

DECLARE prompt_query STRING DEFAULT "Suggest a material and size description for each p_partkey. Also, generate an additional attribute for this part. Return the output as JSON, including the partkey, material, and the generated attribute.";

SELECT
  p.p_partkey,
  p.size_description,
  p.p_size,
  p.p_brand,
  p.p_mfgr,
  p.p_retailprice,
  p.p_comment,
  ml.generated_text
FROM
  retails_stg.Parts p
LEFT JOIN UNNEST(ML.GENERATE_TEXT(
  MODEL `your-project-id.your_dataset.your_model`,
  (
    SELECT
      CONCAT(
        prompt_query,
        TO_JSON_STRING(
          JSON_OBJECT(
            "p_partkey", p_partkey,
            "size_description", size_description,
            "p_size", p_size,
            "p_brand", p_brand,
            "p_mfgr", p_mfgr,
            "p_brand", p_brand,
            "p_retailprice", p_retailprice,
            "p_comment", p_comment
          )
        )
      ) AS prompt
    FROM
      retails_stg.Parts
    WHERE
      p.p_partkey = p_partkey
    LIMIT 1
  ),
  STRUCT(TRUE AS flatten_json_output)
)) ml
ORDER BY
  p.p_partkey
LIMIT 10;


Executing query with job ID: b631b537-d544-452f-a46d-04f77dd906b4
Query executing: 0.27s


ERROR:
 400 Syntax error: Expected ")" but got identifier "remote_models" at [14:7]

Location: US
Job ID: b631b537-d544-452f-a46d-04f77dd906b4



### Tweak the generate_text function

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a rare material for one part. Return the output as json, include the p_partkey and material in the output";
create or replace table retails_stg_ai.materials_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("p_partkey", p_partkey, "size_description", size_description, "p_size", p_size,
            "p_brand", p_brand, "p_mfgr", p_mfgr, "p_brand", p_brand, "p_retailprice", p_retailprice, "p_comment", p_comment))) as prompt
      from retails_stg.Parts
      where material is null and size_description is not null
      order by p_partkey
      limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.materials_predictions_raw_10

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,ml_generate_text_llm_result,prompt


### Format the output to prosper json

In [12]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.materials_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{""p_partkey"":6,""material"":""Osmium""}\n```","{""p_partkey"":6,""material"":""Osmium""}"
1,"```json\n{""p_partkey"":5,""material"":""Bamboo""}\n...","{""p_partkey"":5,""material"":""Bamboo""} This respo..."
2,"```json\n{""p_partkey"": 9, ""material"": ""Unobtai...","{""p_partkey"": 9, ""material"": ""Unobtainium""} Un..."
3,"```json\n{""p_partkey"": 3, ""material"": ""Unobtai...","{""p_partkey"": 3, ""material"": ""Unobtainium""} Un..."
4,"```json\n{""p_partkey"":7,""material"":""Iridium""}\...","{""p_partkey"":7,""material"":""Iridium""}"
5,"```json\n{""p_partkey"": 1, ""material"": ""Unobtai...","{""p_partkey"": 1, ""material"": ""Unobtainium""}"
6,"```json\n{""p_partkey"":2, ""material"":""Unobtaini...","{""p_partkey"":2, ""material"":""Unobtainium""} Unob..."
7,"```json\n{""p_partkey"": 8, ""material"": ""Unobtai...","{""p_partkey"": 8, ""material"": ""Unobtainium""} Un..."
8,"```json\n{""p_partkey"":4,""material"":""Unobtainiu...","{""p_partkey"":4,""material"":""Unobtainium""}"
9,"```json\n{""p_partkey"": 10, ""material"": ""Unobta...","{""p_partkey"": 10, ""material"": ""Unobtainium""} U..."


In [19]:
%%bigquery
SELECT
ml_generate_text_llm_result
FROM
retails_stg_ai.materials_predictions_raw_10
WHERE
ml_generate_text_llm_result IS NOT NULL
LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"```json\n{""p_partkey"":6,""material"":""Osmium""}\n```"
1,"```json\n{""p_partkey"":5,""material"":""Bamboo""}\n..."
2,"```json\n{""p_partkey"": 9, ""material"": ""Unobtai..."
3,"```json\n{""p_partkey"": 3, ""material"": ""Unobtai..."
4,"```json\n{""p_partkey"":7,""material"":""Iridium""}\..."
5,"```json\n{""p_partkey"": 1, ""material"": ""Unobtai..."
6,"```json\n{""p_partkey"":2, ""material"":""Unobtaini..."
7,"```json\n{""p_partkey"": 8, ""material"": ""Unobtai..."
8,"```json\n{""p_partkey"":4,""material"":""Unobtainiu..."
9,"```json\n{""p_partkey"": 10, ""material"": ""Unobta..."


In [23]:
%%bigquery
SELECT
CAST(REGEXP_EXTRACT(ml_generate_text_llm_result, r'"p_partkey": (\d+)') AS INT64) AS p_partkey,
TRIM(REGEXP_EXTRACT(ml_generate_text_llm_result, r'"material": "([^"]*)"')) AS material
FROM
retails_stg_ai.materials_predictions_raw_10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,p_partkey,material
0,3.0,Unobtainium
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,8.0,Iridium
9,,


###Add material field to Parts table

In [24]:
%%bigquery
alter table retails_stg.Parts add column materials_predictions string;

Executing query with job ID: 4fa0957d-b44a-4583-b886-3f14618169bd
Query executing: 0.50s


ERROR:
 400 Column already exists: materials_predictions at [1:42]

Location: US
Job ID: 4fa0957d-b44a-4583-b886-3f14618169bd



### Update the Parts records with the predicted materials

In [27]:
%%bigquery
UPDATE retails_stg.Parts p
SET materials_predictions = (
SELECT json_value(ml_generate_text_llm_result, '$.material')
FROM retails_stg_ai.com_type_predictions_formatted_10 ai
WHERE cast(json_value(ml_generate_text_llm_result, '$.p_partkey') as int64) = p.p_partkey
)
WHERE EXISTS (
SELECT 1
FROM retails_stg_ai.com_type_predictions_formatted_10 ai
WHERE cast(json_value(ml_generate_text_llm_result, '$.p_partkey') as int64) = p.p_partkey
);

Query is running:   0%|          |

### Insepect the output

In [29]:
%%bigquery
select * except (size_description, p_size, p_brand, p_name, p_container, p_mfgr, p_retailprice, data_source, load_time, p_comment, materials_predictions)
from retails_stg.Parts
where materials_predictions is null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,p_partkey,material
0,141449,TIN
1,88272,TIN
2,199424,TIN
3,158424,TIN
4,55758,TIN
...,...,...
199995,93978,NICKEL
199996,50773,NICKEL
199997,29036,NICKEL
199998,108798,NICKEL


### Apply at a larger scale

In [31]:
%%bigquery
SELECT count(*) as total_rows
FROM retails_stg.Parts;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_rows
0,200000


## Part 5 : supplier.area_code

### observe the data we are working with
fill the null values for areacode for merged supplier  

In [40]:
%%bigquery
select * except (s_nationkey, load_time, s_name)
from retails_stg.Supplier
where area_code is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,s_suppkey,s_comment,s_address,area_code,phone_number,s_acctbal,data_source,area_code_predictions
0,57,blithely regular excuses haggle of the careful...,oMv1GPNlQ873mLt3G6TbaTiBn,100,510-7176,9200.46,bird,
1,2101,bold excuses nag among the fluffy packages. bl...,b9nbtxAqjlxeOZu8QcBHijQhXQM7zfKl,100,309-3812,3389.46,bird,
2,3790,dependencies haggle fluffily slyly regular,"UH1,ReFsf8e",100,811-5774,7109.91,bird,
3,1820,fluffily final excuses use above the furiously...,FuRNFCTOX2md01qjXYFBMFRSxCMaAQh9W4fj,100,383-1466,9912.52,bird,
4,9883,slyly silent excuses within,iEILVgbCREGmcdVQ6rvKfpFwCeToFXGi,100,133-5786,6914.76,bird,


Executing query with job ID: b49ea0b1-2aba-4551-8724-519c50f8d88a
Query executing: 0.22s


ERROR:
 400 Syntax error: Unexpected ")" at [1:109]

Location: US
Job ID: b49ea0b1-2aba-4551-8724-519c50f8d88a



### Test the generate_text function

In [42]:
%%bigquery
declare prompt_query STRING default "Suggest a new area_code for each supply key. Return the output as json, include the s_suppkey and area_code in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("s_suppkey", s_suppkey, "s_nationkey", s_nationkey, "s_comment", s_comment,
                  "s_name", s_name, "s_address", s_address, "area_code", area_code, "phone_number", phone_number, "s_acctbal", s_acctbal))) as prompt
    from retails_stg.Supplier

    order by s_suppkey
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{""area_code"":""800"",""phone_number"":""80...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
1,"```json\n{""area_code"":""348"",""s_suppkey"":2},\n{...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
2,"```json\n[\n {\n ""area_code"": ""471"",\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
3,"```json\n[\n {\n ""area_code"": ""893"",\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
4,"```json\n[\n {\n ""area_code"": ""752"",\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
5,"```json\n[\n {\n ""s_suppkey"": 6,\n ""are...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
6,"```json\n[\n {\n ""area_code"": ""400"",\n ...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
7,"```json\n{""area_code"":""893"",""s_suppkey"":8},\n{...","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
8,"```json\n{""area_code"":""277"",""s_suppkey"":9}\n```","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...
9,"```json\n{""area_code"":""454"",""s_suppkey"":10}\n```","[{""category"":1,""probability"":1,""probability_sc...",,Suggest a new area_code for each supply key. R...


###Tweak the prompt and save the output

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest an area_code for each supply key. Return the output as json, include the s_suppkey and area_code in the output";
create or replace table retails_stg_ai.area_code_predictions_raw_10 as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("s_suppkey", s_suppkey, "s_nationkey", s_nationkey, "s_comment", s_comment,
                  "s_name", s_name, "s_address", s_address, "area_code", area_code, "phone_number", phone_number, "s_acctbal", s_acctbal))) as prompt
    from retails_stg.Supplier

    order by s_suppkey
    limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Executing query with job ID: a0f34da6-4b67-41d7-b9d2-c6b17b192309
Query executing: 20.22s

###Format the output to proper json

In [None]:
%%bigquery

select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.area_code_predictions_raw_10

###Add the commod type field to the merged_geographical_data table

In [None]:
%%bigquery
alter table retails_stg.Supplier add column area_code_predictions string;

Query is running:   0%|          |

###Update the Supplier table


In [None]:
%%bigquery
update retails_stg.Supplier set area_code_predictions =
  (select json_value(ml_generate_text_llm_result, '$.area_code')
   from retails_stg_ai.com_type_predictions_formatted_10
   where s_suppkey = cast(json_value(ml_generate_text_llm_result, '$.s_suppkey') as int64))
where 1=1

Query is running:   0%|          |

###Inspect the output

In [None]:
%%bigquery
select * except (s_nationkey, load_time, s_name)
from retails_stg.Supplier
where area_code_predictions is not null

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,s_suppkey,s_comment,s_address,area_code,phone_number,s_acctbal,data_source,area_code_predictions


###Apply at a larger scale


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest one address or phone_number for one supplier. Return the output as json, include the s_suppkey and area_code in the output";
create or replace table retails_stg_ai.state_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
        select concat(prompt_query, to_json_string(json_object("s_suppkey", s_suppkey, "s_nationkey", s_nationkey, "s_comment", s_comment,
                  "s_name", s_name, "s_address", s_address, "area_code", area_code, "phone_number", phone_number, "s_acctbal", s_acctbal))) as prompt
      from retails_stg.Supplier
      where area_code is null
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select s_name, area_code_predictions, count(*) as count
from retails_stg.Supplier
where area_code_predictions is not null
group by s_name, area_code_predictions
order by count(*) desc

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,s_name,area_code_predictions,count


In [None]:
%%bigquery
select *
from retails_stg.Supplier
where area_code is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,s_suppkey,s_nationkey,s_comment,s_name,s_address,area_code,phone_number,s_acctbal,data_source,load_time,area_code_predictions
0,57,1,blithely regular excuses haggle of the careful...,Supplier#000000057,oMv1GPNlQ873mLt3G6TbaTiBn,100,510-7176,9200.46,bird,2024-02-02 22:02:46.597115+00:00,
1,2101,1,bold excuses nag among the fluffy packages. bl...,Supplier#000002101,b9nbtxAqjlxeOZu8QcBHijQhXQM7zfKl,100,309-3812,3389.46,bird,2024-02-02 22:02:46.597115+00:00,
2,3790,12,dependencies haggle fluffily slyly regular,Supplier#000003790,"UH1,ReFsf8e",100,811-5774,7109.91,bird,2024-02-02 22:02:46.597115+00:00,
3,1820,13,fluffily final excuses use above the furiously...,Supplier#000001820,FuRNFCTOX2md01qjXYFBMFRSxCMaAQh9W4fj,100,383-1466,9912.52,bird,2024-02-02 22:02:46.597115+00:00,
4,9883,13,slyly silent excuses within,Supplier#000009883,iEILVgbCREGmcdVQ6rvKfpFwCeToFXGi,100,133-5786,6914.76,bird,2024-02-02 22:02:46.597115+00:00,
...,...,...,...,...,...,...,...,...,...,...,...
9995,8161,13,carefully special foxes haggle furiously again...,Supplier#000008161,vHXXLZZpW dSWeSzAT2QG l5rP,999,101-6464,9521.51,bird,2024-02-02 22:02:46.597115+00:00,
9996,7134,13,quickly pending grouches against the special d...,Supplier#000007134,"7Qi6cgmKoDZ45wpDbTEQUS2pHXh6wbS0yQi,z9",999,701-6541,3774.98,bird,2024-02-02 22:02:46.597115+00:00,
9997,3803,21,carefully ironic deposits nag above the carefu...,Supplier#000003803,YCCOFfxNYIpQOgy88T,999,529-6934,7089.73,bird,2024-02-02 22:02:46.597115+00:00,
9998,4363,22,quickly silent requests wake furio,Supplier#000004363,duR9ZJrhsEyJQENfaTRaXvbi14lkvY9wmgnXyl,999,420-8940,-846.76,bird,2024-02-02 22:02:46.597115+00:00,


In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = ''

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,creation_time,end_time,query


## Part 6: Merge Changes into target table

####Mrds table

In [None]:
%%bigquery
create or replace table retails_csp.Mrds_copy as select * from retails_csp.Mrds

Query is running:   0%|          |

In [None]:
%%bigquery
alter table retails_csp.Mrds
  add column com_types_predictions string;

Query is running:   0%|          |

In [None]:
%%bigquery
alter table retails_csp.Mrds
  add column commod1_predictions string;

Executing query with job ID: d62909de-d566-411a-bce0-5349b637b38e
Query executing: 0.35s


ERROR:
 400 Column already exists: mineral_type at [2:14]

Location: US
Job ID: d62909de-d566-411a-bce0-5349b637b38e



In [None]:
%%bigquery
alter table retails_csp.Mrds
  add column state_predictions string;

In [None]:
%%bigquery
declare current_ts TIMESTAMP;
set current_ts = current_timestamp();

create temp table updates as
  select s.*
  from retails_csp.Mrds t join retails_stg.Mrds s
  on t.Dep_ID = s.Dep_ID
  where s.com_types_predictions != t.com_types_predictions
  or s.commod1_predictions != t.commod1_predictions
  or s.state_predictions != t.state_predictions;

update retails_csp.Mrds
set discontinue_time = timestamp_sub(current_ts, interval 1 second), status_flag = false
where Dep_ID in (select Dep_ID from updates);

insert into retails_csp.Mrds
  (Dep_id, url, mrds_id, mas_id, site_name, latitude, longitude, region,
    country, state, county, com_type, commod1, commod2, commod3, mineral_type, county_predictions, com_types_predictions, commod1_predictions, state_predictions, data_source, load_time, effective_time, status_flag)
    (select Dep_id, url, mrds_id, mas_id, site_name, latitude, longitude, region,
      country, state, county, com_type, commod1, commod2, commod3, mineral_type, county_predictions, com_types_predictions, commod1_predictions, state_predictions, data_source, load_time, current_ts, true
      from updates);

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*) as num_records
from retails_csp.Mrds

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,304632


#### Parts table

In [32]:
%%bigquery
create or replace table retails_csp.Parts_copy as select * from retails_csp.Parts

Query is running:   0%|          |

In [33]:
%%bigquery
alter table retails_csp.Parts
  add column materials_predictions string;

Executing query with job ID: 8d069ff3-c14e-428a-b033-6d9e59eb84d9
Query executing: 0.38s


ERROR:
 400 Column already exists: materials_predictions at [2:14]

Location: US
Job ID: 8d069ff3-c14e-428a-b033-6d9e59eb84d9



In [34]:
%%bigquery
declare current_ts TIMESTAMP;
set current_ts = current_timestamp();

create temp table updates as
  select s.*
  from retails_csp.Parts t join retails_stg.Parts s
  on t.p_partkey = s.p_partkey
  where s.materials_predictions != t.materials_predictions;

update retails_csp.Parts
set discontinue_time = timestamp_sub(current_ts, interval 1 second), status_flag = false
where p_partkey in (select p_partkey from updates);

insert into retails_csp.Parts
  (p_partkey, size_description, material, p_size, p_brand, p_name, p_container, p_mfgr,
    p_retailprice, p_comment, materials_predictions, data_source, load_time, effective_time, status_flag)
    (select p_partkey, size_description, material, p_size, p_brand, p_name, p_container, p_mfgr,
      p_retailprice, p_comment, materials_predictions, data_source, load_time, current_ts, true
      from updates);

Query is running:   0%|          |

In [35]:
%%bigquery
select count(*) as num_records
from retails_csp.Parts

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,200000


####Supplier Table

In [36]:
%%bigquery
create or replace table retails_csp.Supplier_copy as select * from retails_csp.Supplier

Query is running:   0%|          |

In [37]:
%%bigquery
alter table retails_csp.Supplier
  add column area_code_predictions string;

Executing query with job ID: 00accf4b-f328-4afa-b975-b4693a5d1083
Query executing: 0.40s


ERROR:
 400 Column already exists: area_code_predictions at [2:14]

Location: US
Job ID: 00accf4b-f328-4afa-b975-b4693a5d1083



In [38]:
%%bigquery
declare current_ts TIMESTAMP;
set current_ts = current_timestamp();

create temp table updates as
  select s.*
  from retails_csp.Supplier t join retails_stg.Supplier s
  on t.s_suppkey = s.s_suppkey
  where s.area_code_predictions != t.area_code_predictions;

update retails_csp.Supplier
set discontinue_time = timestamp_sub(current_ts, interval 1 second), status_flag = false
where s_suppkey in (select s_suppkey from updates);

insert into retails_csp.Supplier
  (s_suppkey, s_nationkey, s_comment, s_name, s_address, area_code, phone_number, s_acctbal, area_code_predictions, data_source, load_time, effective_time, status_flag)
    (select s_suppkey, s_nationkey, s_comment, s_name, s_address, area_code, phone_number, s_acctbal, area_code_predictions, data_source, load_time, current_ts, true
      from updates);

Query is running:   0%|          |

In [39]:
%%bigquery
select count(*) as num_records
from retails_csp.Supplier

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,10000
