## Data Enrichment with AI

### Setup

##### Create a dataset for storing the AI tables and another for storing the AI models

In [None]:
%%bigquery
CREATE SCHEMA `automated-style-411721`.retails_stg_ai

Executing query with job ID: 6e8ee8f3-0407-45ba-9a5d-47787e9d76f6
Query executing: 0.59s


ERROR:
 409 Already Exists: Dataset automated-style-411721:retails_stg_ai

Location: US
Job ID: 6e8ee8f3-0407-45ba-9a5d-47787e9d76f6



In [None]:
%%bigquery
CREATE SCHEMA `automated-style-411721`.remote_models;

Executing query with job ID: ba22e544-7f88-40bc-8cb7-9e4e6ac1c6c9
Query executing: 0.36s


ERROR:
 409 Already Exists: Dataset automated-style-411721:remote_models

Location: US
Job ID: ba22e544-7f88-40bc-8cb7-9e4e6ac1c6c9



##### Before running this cell, create the remote connection and assign the IAM role `Vertex AI User` to the service account associated with the connection.

In [None]:
%%bigquery
create or replace model remote_models.gemini_pro
  remote with connection `projects/automated-style-411721/locations/us/connections/vertex_connection`
  options (endpoint = 'gemini-pro');

Query is running:   0%|          |


### Part 1: Predict the county of sites

In [None]:
%%bigquery
select * except (com_type, commod1, commod2, commod3, data_source, load_time)
from retails_stg.Mrds
where latitude is not null
and longitude is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,latitude,longitude,region,country,state,county
0,10306836,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Milton Silver Mine,44.52727,-68.22983,,United States,Maine,Hancock
1,10306848,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,West Bay Mine,44.48659,-68.04614,,United States,Maine,Hancock
2,10306849,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Boss O' the Bay,44.4767,-68.16378,,United States,Maine,Hancock
3,10306852,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Manhattan Mine,44.36453,-68.74084,,United States,Maine,Hancock
4,10307124,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Johnson,34.71234,-81.90956,,United States,South Carolina,Spartanburg


#### Test the generate_text function

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a county for each site. Return the output as json, include the Dep_ID and county in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                  "longitude", longitude, "region", region, "country", country, "state", state))) as prompt
    from retails_stg.Mrds
    order by Dep_ID
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"{""Dep_ID"":10000001,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
1,"{""Dep_ID"":10000002,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
2,"{""Dep_ID"":10000003,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
3,"{""Dep_ID"":10000004,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
4,"{""Dep_ID"":10000005,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
5,"{""Dep_ID"":10000006,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
6,"{""Dep_ID"":10000007,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
7,"{""Dep_ID"":10000008,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
8,"{""Dep_ID"":10000009,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...
9,"{""Dep_ID"":10000010,""country"":""United States"",""...",,,Suggest a county for each site. Return the out...


#### Tweak the prompt and save the output
##### [More details](https://cloud.google.com/bigquery/docs/generate-text#generate_text_from_text_data_by_using_a_prompt_from_a_query) on `ML.generate_text` parameters

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a county for each dep_id. Return the output as json, include the Dep_ID and county in the output";
create or replace table retails_stg_ai.county_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                    "longitude", longitude, "region", region, "country", country, "state", state))) as prompt
      from retails_stg.Mrds
      order by Dep_ID
      limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.county_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""Dep_ID"": 10000002,\n ""county"":...",Suggest a county for each dep_id. Return the o...
1,"```json\n{\n ""Dep_ID"": 10000006,\n ""county"":...",Suggest a county for each dep_id. Return the o...
2,"```json\n{\n ""Dep_ID"": 10000004,\n ""county"":...",Suggest a county for each dep_id. Return the o...
3,"```json\n{\n ""Dep_ID"": 10000001,\n ""county"":...",Suggest a county for each dep_id. Return the o...
4,"{""Dep_ID"":10000003,""county"":""Fairbanks North S...",Suggest a county for each dep_id. Return the o...
5,"{""Dep_ID"":10000007,""country"":""United States"",""...",Suggest a county for each dep_id. Return the o...
6,"{""Dep_ID"":10000009,""country"":""United States"",""...",Suggest a county for each dep_id. Return the o...
7,"{""Dep_ID"":10000008,""county"":""Aleutians East Bo...",Suggest a county for each dep_id. Return the o...
8,"{""Dep_ID"":10000005,""county"":""Matanuska-Susitna...",Suggest a county for each dep_id. Return the o...
9,"{""Dep_ID"":10000010,""country"":""United States"",""...",Suggest a county for each dep_id. Return the o...


#### Format the output to proper json

In [None]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.county_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{\n ""Dep_ID"": 10000002,\n ""county"":...","{ ""Dep_ID"": 10000002, ""county"": ""Anchorage""}"
1,"```json\n{\n ""Dep_ID"": 10000006,\n ""county"":...","{ ""Dep_ID"": 10000006, ""county"": ""Prince of W..."
2,"```json\n{\n ""Dep_ID"": 10000004,\n ""county"":...","{ ""Dep_ID"": 10000004, ""county"": ""Nome""}"
3,"```json\n{\n ""Dep_ID"": 10000001,\n ""county"":...","{ ""Dep_ID"": 10000001, ""county"": ""Anchorage""}"
4,"{""Dep_ID"":10000003,""county"":""Fairbanks North S...","{""Dep_ID"":10000003,""county"":""Fairbanks North S..."
5,"{""Dep_ID"":10000007,""country"":""United States"",""...","{""Dep_ID"":10000007,""country"":""United States"",""..."
6,"{""Dep_ID"":10000009,""country"":""United States"",""...","{""Dep_ID"":10000009,""country"":""United States"",""..."
7,"{""Dep_ID"":10000008,""county"":""Aleutians East Bo...","{""Dep_ID"":10000008,""county"":""Aleutians East Bo..."
8,"{""Dep_ID"":10000005,""county"":""Matanuska-Susitna...","{""Dep_ID"":10000005,""county"":""Matanuska-Susitna..."
9,"{""Dep_ID"":10000010,""country"":""United States"",""...","{""Dep_ID"":10000010,""country"":""United States"",""..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.county_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.county_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.county') as county
from retails_stg_ai.county_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,county
0,10000007,Haines Borough
1,10000010,
2,10000002,Anchorage
3,10000004,Nome
4,10000009,
5,10000005,Matanuska-Susitna Borough
6,10000003,Fairbanks North Star Borough
7,10000001,Anchorage
8,10000006,Prince of Wales-Outer Ketchikan
9,10000008,Aleutians East Borough


#### Add the county field to the Mrds table

In [None]:
%%bigquery
alter table retails_stg.Mrds add column county_predictions string;

Query is running:   0%|          |

#### Update the Mrds records with the predicted county

In [None]:
%%bigquery
update retails_stg.Mrds set county_predictions =
  (select json_value(ml_generate_text_llm_result, '$.county')
   from retails_stg_ai.county_predictions_formatted_10
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

Query is running:   0%|          |

#### Inspect the output

In [None]:
%%bigquery
select * except(com_type, commod1, commod2, commod3, data_source, load_time)
from retails_stg.Mrds
where county_predictions is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,latitude,longitude,region,country,state,county,county_predictions
0,10307293,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (upper Ophir Creek),65.04933,-163.69331,,United States,Alaska,,Yukon-Koyukuk Census Area
1,10307504,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (east of Sugar Loaf Mountain),63.78961,-148.79286,,United States,Alaska,,Denali Borough
2,10307020,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,St. Pierre Pit,43.36227,-72.32379,,United States,New Hampshire,Sullivan,Grafton County
3,10305981,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Mammoth Mine,34.31123,-83.81660,,United States,Georgia,Hall,Fulton County
4,10400891,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unknown Borrow Pits,41.77000,-107.47300,,United States,Wyoming,Carbon,Sweetwater County
...,...,...,...,...,...,...,...,...,...,...,...,...
6909,10161241,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110094,Pipe Dream,58.86222,-136.86882,,United States,Alaska,,Fairbanks North Star
6910,10258319,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110096,Helicopter Pilot's Molybdenum,58.71361,-137.27852,,United States,Alaska,,Haines Borough
6911,10282564,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110107,Dagelet River Beach Sands,58.51750,-137.40631,,United States,Alaska,,Prince of Wales-Outer Ketchikan Census Area
6912,10186027,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110110,Kaknau Creek Beach Sands,58.40141,-137.08070,,United States,Alaska,,Haines Borough


#### Apply at larger scale

###### Create a smaller Mrds table with 10K records in order to finish within 30 minutes

In [None]:
%%bigquery
create or replace table retails_stg.Mrds_10k as
  select *
  from retails_stg.Mrds
  where Dep_ID is not null
  and site_name is not null
  and latitude is not null
  and longitude is not null
  and region is not null
  and state is not null
  limit 10000

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*)
from retails_stg.Mrds_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,10000


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a county for each dep_id. Return the output as json, include the Dep_ID and county in the output";
create or replace table retails_stg_ai.county_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "site_name", site_name, "latitude", latitude,
                    "longitude", longitude, "region", region, "country", country, "state", state))) as prompt
      from retails_stg.Mrds_10k
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = '90f3a532-3af8-4a2d-aba9-962fc313ef6a'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,creation_time,end_time,query
0,2024-04-10 18:41:36.410000+00:00,2024-04-10 19:25:21.814000+00:00,"declare prompt_query STRING default ""Suggest a..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.county_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.county_predictions_raw_10k

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*) as county_count
from retails_stg_ai.county_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,county_count
0,10000


In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Dep_ID') as Dep_ID,
  json_value(ml_generate_text_llm_result, '$.county') as county_predictions
from retails_stg_ai.county_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,county_predictions
0,,
1,,
2,,
3,,
4,,
...,...,...
9995,10257920,Calhoun County
9996,10258319,Haines Borough
9997,10306937,Warren County
9998,10306989,Clay County


Add the county field to the Mrds table


In [None]:
%%bigquery
alter table retails_stg.Mrds add column county_predictions string;

Executing query with job ID: 0a7788e4-faae-4a6c-83d9-a2d8aac53145
Query executing: 0.52s


ERROR:
 400 Column already exists: county_predictions at [1:41]

Location: US
Job ID: 0a7788e4-faae-4a6c-83d9-a2d8aac53145



Update the Mrds records with the predicted county

In [None]:
%%bigquery
update retails_stg.Mrds set county_predictions =
  (select json_value(ml_generate_text_llm_result, '$.county')
   from retails_stg_ai.county_predictions_formatted_10k
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1

Query is running:   0%|          |

In [None]:
%%bigquery
select *
from retails_stg.Mrds
where county_predictions is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,latitude,longitude,region,country,state,county,com_type,commod1,commod2,commod3,data_source,load_time,county_predictions
0,10308191,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (west of Lupine River),68.79977,-148.35348,,United States,Alaska,,,Phosphorus-Phosphates,,,usgs,2024-02-02 22:02:24.030800+00:00,Fairbanks North Star Borough
1,10310446,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Rat Deposit,39.92381,-115.54700,,United States,Nevada,White Pine,M,Gold,,Mercury,usgs,2024-02-02 22:02:24.030800+00:00,Elko County
2,10307204,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,North Pacolet River Mine,35.16641,-82.02874,,United States,South Carolina,Spartanburg,N,Sand,,,usgs,2024-02-02 22:02:24.030800+00:00,Spartanburg
3,10307081,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Masters Kiln,34.43452,-82.17765,,United States,South Carolina,Laurens,N,"Limestone, General",,,usgs,2024-02-02 22:02:24.030800+00:00,Greenville
4,10400525,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Boden Placer,41.41816,-106.45450,,United States,Wyoming,Carbon,N,Diamond,,,usgs,2024-02-02 22:02:24.030800+00:00,Albany County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6909,10161241,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110094,Pipe Dream,58.86222,-136.86882,,United States,Alaska,,M,Gold,,,usgs,2024-02-02 22:02:24.030800+00:00,Fairbanks North Star
6910,10258319,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110096,Helicopter Pilot's Molybdenum,58.71361,-137.27852,,United States,Alaska,,M,Molybdenum,,,usgs,2024-02-02 22:02:24.030800+00:00,Haines Borough
6911,10282564,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110107,Dagelet River Beach Sands,58.51750,-137.40631,,United States,Alaska,,M,"Gold, Titanium",,"Iron, PGE",usgs,2024-02-02 22:02:24.030800+00:00,Prince of Wales-Outer Ketchikan Census Area
6912,10186027,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110110,Kaknau Creek Beach Sands,58.40141,-137.08070,,United States,Alaska,,M,Titanium,,"Gold, Iron, Platinum",usgs,2024-02-02 22:02:24.030800+00:00,Haines Borough


In [None]:
%%bigquery
select count(*) as county_predictions_count
from retails_stg.Mrds
where county_predictions is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,county_predictions_count
0,6914


In [None]:
%%bigquery
select state, county_predictions, count(*) as count
from retails_stg.Mrds
where county_predictions is not null
group by state, county_predictions
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,state,county_predictions,count
0,Alaska,Fairbanks North Star,1030
1,Alabama,Jefferson County,1009
2,Alabama,Montgomery County,502
3,Alaska,Anchorage,294
4,Wyoming,Sweetwater County,268
...,...,...,...
485,Alaska,Frederick,1
486,Alaska,Bay City,1
487,Alaska,Hale-Peel-Lyngholm,1
488,Alaska,Josephine,1


In [None]:
%%bigquery
update retails_stg.Mrds
  set data_source = 'usgs' where county_predictions is not null

Query is running:   0%|          |

### Part 2: Find the mineral types of a mineral site

#### Experiment with prompt

In [None]:
%%bigquery
select *
from retails_stg.Mrds_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,latitude,longitude,region,country,state,county,com_type,commod1,commod2,commod3,data_source,load_time,county_predictions
0,10307950,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (near Ahlfield Creek),64.80379,-165.17020,,United States,Alaska,,,,,,usgs,2024-02-02 22:02:24.030800+00:00,
1,10308100,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (southwest of Portage Valley),55.69924,-160.60244,,United States,Alaska,,,,,,usgs,2024-02-02 22:02:24.030800+00:00,
2,10307951,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (on ridge between Ahlfield and Christi...,64.80729,-165.18610,,United States,Alaska,,,,,,usgs,2024-02-02 22:02:24.030800+00:00,
3,10307952,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Little Gulch,64.80909,-165.23120,,United States,Alaska,,,,,,usgs,2024-02-02 22:02:24.030800+00:00,
4,10308079,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (south end of Inner IIiasik Island),55.03921,-161.93942,,United States,Alaska,,,,,,usgs,2024-02-02 22:02:24.030800+00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10307221,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Illinois - Kentucky Fluorite District,37.41673,-88.35004,,United States,"Kentucky, Illinois","Livingston, Pope, Hardin, Crittenden",B,"Lead, Zinc, Fluorine-Fluorite","Silver, Cadmium",,usgs,2024-02-02 22:02:24.030800+00:00,
9996,10400337,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Viburnum Trend,37.53837,-91.12903,,United States,Missouri,"Washington, Crawford, Reynolds, Shannon, Iron",,"Lead, Zinc","Cobalt, Silver, Copper","Cadmium, Nickel",usgs,2024-02-02 22:02:24.030800+00:00,
9997,10400325,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Olive Hill Fire Clay District,38.31667,-83.03333,,United States,Kentucky,"Lewis, Greenup, Rowan, Elliott, Morgan, Carter",N,Fire Clay (Refractory),,,usgs,2024-02-02 22:02:24.030800+00:00,
9998,10400463,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Central Florida Land Pebble Phosphate Deposit,27.66000,-82.04000,,United States,Florida,"Polk, Hillsborough, Manatee, Hardee, DeSoto, S...",,Phosphorus-Phosphates,,"Uranium, Fluorine-Fluorite",usgs,2024-02-02 22:02:24.030800+00:00,


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a mineral type of each dep_id. Choose between silicates, sulfides, or carbonates. Return the output as json, include the Dep_ID and the mineral_type in the output as well";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "com_type", com_type,
                    "commod1", commod1, "commod2", commod1, "commod3", commod3))) as prompt
    from retails_stg.Mrds_10k
    order by site_name
    limit 10
  ),
  struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"{""Dep_ID"":10282235,""com_type"":""M"",""commod1"":""I...",,,Suggest a mineral type of each dep_id. Choose ...
1,"{""Dep_ID"":10112331,""mineral_type"":""sulfides""}",,,Suggest a mineral type of each dep_id. Choose ...
2,"```json\n{""Dep_ID"":10184883,""mineral_type"":""si...",,,Suggest a mineral type of each dep_id. Choose ...
3,"{""Dep_ID"":10257940,""com_type"":""M"",""commod1"":""G...",,,Suggest a mineral type of each dep_id. Choose ...
4,"{""Dep_ID"":10306857,""com_type"":""M"",""commod1"":""L...",,,Suggest a mineral type of each dep_id. Choose ...
5,"{""Dep_ID"":10234107,""com_type"":""N"",""commod1"":""S...",,,Suggest a mineral type of each dep_id. Choose ...
6,"{""Dep_ID"":10282682,""com_type"":""M"",""commod1"":""G...",,,Suggest a mineral type of each dep_id. Choose ...
7,"{""Dep_ID"":10132620,""com_type"":""E"",""commod1"":""U...",,,Suggest a mineral type of each dep_id. Choose ...
8,"{""Dep_ID"":10233783,""mineral_type"":""sulfides""}",,,Suggest a mineral type of each dep_id. Choose ...
9,"{""Dep_ID"":10112394,""com_type"":""N"",""commod1"":""S...",,,Suggest a mineral type of each dep_id. Choose ...


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a mineral type of each dep_id. Choose between silicates, sulfides, or carbonates. Return the output as json, include the Dep_ID and the mineral_type in the output as well";
create or replace table retails_stg_ai.type_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "com_type", com_type,
                    "commod1", commod1, "commod2", commod1, "commod3", commod3))) as prompt
    from retails_stg.Mrds_10k
    order by site_name
    limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );


Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.type_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"{""Dep_ID"":10112331,""mineral_type"":""sulfides""}",Suggest a mineral type of each dep_id. Choose ...
1,"{""Dep_ID"":10282235,""com_type"":""M"",""commod1"":""I...",Suggest a mineral type of each dep_id. Choose ...
2,"{""Dep_ID"":10306857,""com_type"":""M"",""commod1"":""L...",Suggest a mineral type of each dep_id. Choose ...
3,"```json\n{""Dep_ID"":10184883,""mineral_type"":""si...",Suggest a mineral type of each dep_id. Choose ...
4,"{""Dep_ID"":10282682,""com_type"":""M"",""commod1"":""G...",Suggest a mineral type of each dep_id. Choose ...
5,"{""Dep_ID"":10257940,""com_type"":""M"",""commod1"":""G...",Suggest a mineral type of each dep_id. Choose ...
6,"{""Dep_ID"":10132620,""com_type"":""E"",""commod1"":""U...",Suggest a mineral type of each dep_id. Choose ...
7,"{""Dep_ID"":10234107,""com_type"":""N"",""commod1"":""S...",Suggest a mineral type of each dep_id. Choose ...
8,"{""Dep_ID"":10112394,""com_type"":""N"",""commod1"":""S...",Suggest a mineral type of each dep_id. Choose ...
9,"{""Dep_ID"":10233783,""mineral_type"":""sulfides""}",Suggest a mineral type of each dep_id. Choose ...


In [None]:
%%bigquery
create or replace table retails_stg_ai.type_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.type_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select * from retails_stg_ai.type_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{""Dep_ID"":10282682,""com_type"":""M"",""commod1"":""G..."
1,"{""Dep_ID"":10306857,""com_type"":""M"",""commod1"":""L..."
2,"{""Dep_ID"":10132620,""com_type"":""E"",""commod1"":""U..."
3,"{""Dep_ID"":10112394,""com_type"":""N"",""commod1"":""S..."
4,"{""Dep_ID"":10233783,""mineral_type"":""sulfides""}"
5,"{""Dep_ID"":10234107,""com_type"":""N"",""commod1"":""S..."
6,"{""Dep_ID"":10282235,""com_type"":""M"",""commod1"":""I..."
7,"{""Dep_ID"":10112331,""mineral_type"":""sulfides""}"
8,"{""Dep_ID"":10257940,""com_type"":""M"",""commod1"":""G..."
9,"{""Dep_ID"":10184883,""mineral_type"":""silicates""}"


In [None]:
%%bigquery
alter table retails_stg.Mrds add column mineral_type string;

Query is running:   0%|          |

In [None]:
%%bigquery
update retails_stg.Mrds set mineral_type =
  (select json_value(ml_generate_text_llm_result, '$.mineral_type')
   from retails_stg_ai.type_predictions_formatted_10
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID') as int64))
where 1=1;

Query is running:   0%|          |

In [None]:
%%bigquery
select * except(latitude, longitude, region, country, state, county, data_source, load_time)
from retails_stg.Mrds
where mineral_type is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,com_type,commod1,commod2,commod3,county_predictions,mineral_type
0,10306857,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,1580 Brook Prospet,M,Lithium,,,,silicates
1,10132620,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,560070634.0,2 D Pit,E,Uranium,,,Albany County,silicates
2,10184883,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,10250006.0,02,N,"Sand and Gravel, Construction",,,,silicates
3,10282235,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,10719003.0,(Facility) Bridgeport Jv Smelter,M,"Iron, Silica",,,,silicates
4,10112331,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,11259902.0,(Facility) Tuscaloosa Refinery,N,Sulfur,,,,sulfides
5,10257940,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,20500371.0,101 Mine,M,Gold,,,,sulfides
6,10233783,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,20860176.0,2nd Adit East of Cliff Mine,M,Gold,,Silver,Valdez-Cordova,sulfides
7,10282682,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,20860177.0,1st Adit East of Cliff Mine,M,Gold,,Silver,Valdez-Cordova,sulfides
8,10112394,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,20960086.0,3 Mile Bay Pit,N,"Sand and Gravel, Construction",,,Denali Borough,silicates
9,10234107,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,20960090.0,17 Mile Quarry,N,Stone,,,,silicates


#### Apply at larger scale

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a mineral type of each dep_id. Choose between silicates, sulfides, or carbonates. Return the output as json, include the Dep_ID and the mineral_type in the output as well";
create or replace table retails_stg_ai.type_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
        select concat(prompt_query, to_json_string(json_object("Dep_ID", Dep_ID, "com_type", com_type,
                      "commod1", commod1, "commod2", commod1, "commod3", commod3))) as prompt
      from retails_stg.Mrds_10k
      order by Dep_ID
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Executing query with job ID: 405708fc-b807-4842-85d8-3dac263c39c1
Query executing: 3052.44s

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.type_predictions_raw_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"{""Dep_ID"":10111672,""mineral_type"":""silicates""}",Suggest a mineral type of each dep_id. Choose ...
1,"{""Dep_ID"":10111768,""com_type"":""M"",""commod1"":""G...",Suggest a mineral type of each dep_id. Choose ...
2,"```json\n{\n ""Dep_ID"": 10111798,\n ""mineral_...",Suggest a mineral type of each dep_id. Choose ...
3,"{""Dep_ID"":10111808,""com_type"":""M"",""commod1"":""I...",Suggest a mineral type of each dep_id. Choose ...
4,"```json\n{\n ""Dep_ID"": 10111844,\n ""mineral_...",Suggest a mineral type of each dep_id. Choose ...
...,...,...
9995,,Suggest a mineral type of each dep_id. Choose ...
9996,,Suggest a mineral type of each dep_id. Choose ...
9997,,Suggest a mineral type of each dep_id. Choose ...
9998,,Suggest a mineral type of each dep_id. Choose ...


In [None]:
%%bigquery
select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
from retails_stg_ai.type_predictions_raw_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{""Dep_ID"":10111672,""mineral_type"":""silicates""}"
1,"{""Dep_ID"":10111768,""com_type"":""M"",""commod1"":""G..."
2,"{ ""Dep_ID"": 10111798, ""mineral_type"": ""silic..."
3,"{""Dep_ID"":10111808,""com_type"":""M"",""commod1"":""I..."
4,"{ ""Dep_ID"": 10111844, ""mineral_type"": ""silic..."
...,...
9995,
9996,
9997,
9998,


In [None]:
%%bigquery
create or replace table retails_stg_ai.type_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.type_predictions_raw_10k

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*) as mineraltype_count
from retails_stg_ai.type_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,mineraltype_count
0,10000


In [None]:
%%bigquery
select *
from retails_stg_ai.type_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,
1,
2,
3,
4,
...,...
9995,"{""Dep_ID"":10136855,""com_type"":""B"",""commod1"":""G..."
9996,"{""Dep_ID"":10136550,""com_type"":""B"",""commod1"":""B..."
9997,"{""Dep_ID"":10310452,""com_type"":""B"",""commod1"":""G..."
9998,"{""Dep_ID"":10306815,""com_type"":""B"",""commod1"":""S..."


In [None]:
%%bigquery
alter table retails_stg.Mrds add column type string;

Query is running:   0%|          |

In [None]:
%%bigquery
update retails_stg.Mrds set mineral_type =
  (select json_value(ml_generate_text_llm_result, '$.mineral_type')
   from retails_stg_ai.type_predictions_formatted_10k
   where Dep_ID = cast(json_value(ml_generate_text_llm_result, '$.Dep_ID')as int64))
where 1=1

Query is running:   0%|          |

In [None]:
%%bigquery
select * except(latitude, longitude, region, country, state, county, data_source, load_time,type)
from retails_stg.Mrds
where mineral_type is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Dep_ID,url,mrds_id,mas_id,site_name,com_type,commod1,commod2,commod3,county_predictions,mineral_type
0,10308679,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Butte Creek,M,Gold,,,,sulfides
1,10308595,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Caribou,M,Gold,,,,sulfides
2,10308704,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Red Creek,M,Gold,"Zinc, Lead, Mercury",,,sulfides
3,10306970,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Southern Belt-Sand and Gravel,N,"Sand and Gravel, Construction",,,,silicates
4,10308130,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,,Unnamed (south of Zachary Bay),,"Copper, Gold",Molybdenum,,Aleutians East Borough,sulfides
...,...,...,...,...,...,...,...,...,...,...,...
9838,10185523,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110106,Topsy Creek Beach Sands,M,Gold,,"Iron, Platinum, Titanium",,Sulfides
9839,10282564,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110107,Dagelet River Beach Sands,M,"Gold, Titanium",,"Iron, PGE",Prince of Wales-Outer Ketchikan Census Area,Sulfides
9840,10185682,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110109,La Perouse-Icy Beach Sand,M,Titanium,,"Gold, Iron, Platinum",,silicates
9841,10186027,https://mrdata.usgs.gov/mrds/show-mrds.php?dep...,,0021110110,Kaknau Creek Beach Sands,M,Titanium,,"Gold, Iron, Platinum",Haines Borough,silicates


In [None]:
%%bigquery
select mineral_type, count(*) as count
from retails_stg.Mrds
where mineral_type is not null
group by  mineral_type
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,mineral_type,count
0,silicates,4845
1,sulfides,4387
2,carbonates,254
3,Sulfides,245
4,Carbonates,51
5,Phosphates,23
6,phosphates,13
7,Fluorite,10
8,tungstenite,7
9,sulfates,6


In [None]:
%%bigquery
select count(*) as mineraltype_count
from retails_stg.Mrds
where mineral_type is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,mineraltype_count
0,9843


In [None]:
%%bigquery
update retails_stg.Mrds
  set data_source = 'usgs' where mineral_type is not null

Query is running:   0%|          |

### Part 3: Find what holiday a transaction occured in

####Experiment with prompt

In [None]:
%%bigquery
select * except (Day, Month, Year, store_nbr, family, sales_amount, data_source, load_time)
from retails_stg.Transactions
where date is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Transaction_ID,date,on_promotion,holiday
0,735,2013-01-01,False,
1,636,2013-01-01,False,
2,372,2013-01-01,False,
3,273,2013-01-01,False,
4,801,2013-01-01,False,


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a holiday for each transaction. Return the output as json, include the Transaction_ID and holiday in the output";
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select concat(prompt_query, to_json_string(json_object("Transaction_ID", Transaction_ID, "date", date,
                  "on_promotion", on_promotion))) as prompt
    from retails_stg.Transactions
    order by Transaction_ID
    limit 10
  ),
  struct(TRUE as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{\n ""Transaction_ID"": 0,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
1,"```json\n{\n ""Transaction_ID"": 1,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
2,"```json\n{\n ""Transaction_ID"": 2,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
3,"```json\n{\n ""Transaction_ID"": 3,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
4,"```json\n{\n ""Transaction_ID"": 4,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
5,"```json\n{\n ""Transaction_ID"": 5,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
6,"```json\n{\n ""Transaction_ID"": 6,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
7,"```json\n{\n ""Transaction_ID"": 7,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
8,"```json\n{\n ""Transaction_ID"": 8,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...
9,"```json\n{\n ""Transaction_ID"": 9,\n ""date"": ...",,,Suggest a holiday for each transaction. Return...


####Tweak the prompt and save the output

In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a holiday for each transaction. Return the output as json, include the Transaction_ID and holiday in the output";
create or replace table retails_stg_ai.holiday_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
    select concat(prompt_query, to_json_string(json_object("Transaction_ID", Transaction_ID, "date", date,
                  "on_promotion", on_promotion))) as prompt
    from retails_stg.Transactions
    order by Transaction_ID
    limit 10
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select ml_generate_text_llm_result, prompt
from retails_stg_ai.holiday_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""Transaction_ID"": 4,\n ""date"": ...",Suggest a holiday for each transaction. Return...
1,"```json\n{\n ""Transaction_ID"": 3,\n ""date"": ...",Suggest a holiday for each transaction. Return...
2,"```json\n{\n ""Transaction_ID"": 2,\n ""date"": ...",Suggest a holiday for each transaction. Return...
3,"```json\n{\n ""Transaction_ID"": 6,\n ""date"": ...",Suggest a holiday for each transaction. Return...
4,"```json\n{\n ""Transaction_ID"": 1,\n ""date"": ...",Suggest a holiday for each transaction. Return...
5,"```json\n{\n ""Transaction_ID"": 7,\n ""date"": ...",Suggest a holiday for each transaction. Return...
6,"```json\n{\n ""Transaction_ID"": 9,\n ""date"": ...",Suggest a holiday for each transaction. Return...
7,"```json\n{\n ""Transaction_ID"": 5,\n ""date"": ...",Suggest a holiday for each transaction. Return...
8,"```json\n{\n ""Transaction_ID"": 0,\n ""date"": ...",Suggest a holiday for each transaction. Return...
9,"```json\n{\n ""Transaction_ID"": 8,\n ""date"": ...",Suggest a holiday for each transaction. Return...


####Format the output to proper json


In [None]:
%%bigquery
select ml_generate_text_llm_result, trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as formated_result
from retails_stg_ai.holiday_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{\n ""Transaction_ID"": 4,\n ""date"": ...","{ ""Transaction_ID"": 4, ""date"": ""2013-01-01"",..."
1,"```json\n{\n ""Transaction_ID"": 3,\n ""date"": ...","{ ""Transaction_ID"": 3, ""date"": ""2013-01-01"",..."
2,"```json\n{\n ""Transaction_ID"": 2,\n ""date"": ...","{ ""Transaction_ID"": 2, ""date"": ""2013-01-01"",..."
3,"```json\n{\n ""Transaction_ID"": 6,\n ""date"": ...","{ ""Transaction_ID"": 6, ""date"": ""2013-01-01"",..."
4,"```json\n{\n ""Transaction_ID"": 1,\n ""date"": ...","{ ""Transaction_ID"": 1, ""date"": ""2013-01-01"",..."
5,"```json\n{\n ""Transaction_ID"": 7,\n ""date"": ...","{ ""Transaction_ID"": 7, ""date"": ""2013-01-01"",..."
6,"```json\n{\n ""Transaction_ID"": 9,\n ""date"": ...","{ ""Transaction_ID"": 9, ""date"": ""2013-01-01"",..."
7,"```json\n{\n ""Transaction_ID"": 5,\n ""date"": ...","{ ""Transaction_ID"": 5, ""date"": ""2013-01-01"",..."
8,"```json\n{\n ""Transaction_ID"": 0,\n ""date"": ...","{ ""Transaction_ID"": 0, ""date"": ""2013-01-01"",..."
9,"```json\n{\n ""Transaction_ID"": 8,\n ""date"": ...","{ ""Transaction_ID"": 8, ""date"": ""2013-01-01"",..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.holiday_predictions_formatted_10 as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.holiday_predictions_raw_10

Query is running:   0%|          |

In [None]:
%%bigquery
select json_value(ml_generate_text_llm_result, '$.Transaction_ID') as Transaction_ID,
  json_value(ml_generate_text_llm_result, '$.holiday') as holiday
from retails_stg_ai.holiday_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Transaction_ID,holiday
0,8,New Year's Day
1,0,New Year's Day
2,2,New Year's Day
3,9,New Year's Day
4,5,New Year's Day
5,3,New Year's Day
6,1,New Year's Day
7,4,New Year's Day
8,7,New Year's Day
9,6,New Year's Day


####Add the holiday field to the Transaction table

In [None]:
%%bigquery
alter table retails_stg.Transactions add column holiday string;

Executing query with job ID: 6e2212eb-125c-4eae-9f78-ad08986dfee7
Query executing: 0.41s


ERROR:
 400 Column already exists: holiday at [1:49]

Location: US
Job ID: 6e2212eb-125c-4eae-9f78-ad08986dfee7



####Update the Mineral records with the predicted holiday


In [None]:
%%bigquery
update retails_stg.Transactions set holiday =
  (select json_value(ml_generate_text_llm_result, '$.holiday')
   from retails_stg_ai.holiday_predictions_formatted_10
   where Transaction_ID = cast(json_value(ml_generate_text_llm_result, '$.Transaction_ID') as int64))
where 1=1

Query is running:   0%|          |

####Inspect the output

In [None]:
%%bigquery
select * except (Day, Month, Year, store_nbr, family, sales_amount, data_source, load_time)
from retails_stg.Transactions
where holiday is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Transaction_ID,date,on_promotion,holiday
0,5,2013-01-01,False,New Year's Day
1,4,2013-01-01,False,New Year's Day
2,6,2013-01-01,False,New Year's Day
3,7,2013-01-01,False,New Year's Day
4,3,2013-01-01,False,New Year's Day
5,2,2013-01-01,False,New Year's Day
6,1,2013-01-01,False,New Year's Day
7,8,2013-01-01,False,New Year's Day
8,9,2013-01-01,False,New Year's Day
9,0,2013-01-01,False,New Year's Day


Apply at a larger scale

In [None]:
%%bigquery
create or replace table retails_stg.Transactions_10k as
  select *
  from retails_stg.Transactions
  where date is not null
  and on_promotion is not null
  limit 10000

Query is running:   0%|          |

In [None]:
%%bigquery
select *
from retails_stg.Transactions_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Transaction_ID,date,Day,Month,Year,store_nbr,family,sales_amount,on_promotion,data_source,load_time,holiday
0,1692,2013-01-01,01,01,2013,7,DELI,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
1,9,2013-01-01,01,01,2013,1,DELI,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,New Year's Day
2,273,2013-01-01,01,01,2013,17,DELI,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
3,1263,2013-01-01,01,01,2013,44,DELI,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
4,1230,2013-01-01,01,01,2013,43,DELI,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,96226,2013-02-23,23,02,2013,9,SCHOOL AND OFFICE SUPPLIES,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
9996,96721,2013-02-24,24,02,2013,22,SCHOOL AND OFFICE SUPPLIES,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
9997,97117,2013-02-24,24,02,2013,33,SCHOOL AND OFFICE SUPPLIES,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,
9998,97810,2013-02-24,24,02,2013,52,SCHOOL AND OFFICE SUPPLIES,0.0,False,kaggle,2024-02-10 18:23:57.714487+00:00,


In [None]:
%%bigquery
declare prompt_query STRING default "Suggest a holiday for each transaction. Return the output as json, include the Transaction_ID and holiday in the output";
create or replace table retails_stg_ai.holiday_predictions_raw_10k as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
    select concat(prompt_query, to_json_string(json_object("Transaction_ID", Transaction_ID, "date", date,
                  "on_promotion", on_promotion))) as prompt
    from retails_stg.Transactions_10k
    ),
    struct(0 as temperature, 8192 as max_output_tokens, 0.0 as top_p, 1 as top_k, TRUE as flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
select creation_time, end_time, query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = 'cc8479fa-607b-42ad-b5a9-142dd8b26504'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,creation_time,end_time,query
0,2024-04-10 16:59:02.556000+00:00,2024-04-10 17:43:55.764000+00:00,"declare prompt_query STRING default ""Suggest a..."


In [None]:
%%bigquery
create or replace table retails_stg_ai.holiday_predictions_formatted_10k as
  select trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
  from retails_stg_ai.holiday_predictions_raw_10k;

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*) as holiday_count
from retails_stg_ai.holiday_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,holiday_count
0,10000


In [None]:
%%bigquery
select *
from retails_stg_ai.holiday_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,
1,
2,
3,
4,
...,...
9995,"{ ""Transaction_ID"": 40453, ""date"": ""2013-01-..."
9996,"{ ""Transaction_ID"": 40518, ""date"": ""2013-01-..."
9997,"{ ""Transaction_ID"": 40895, ""date"": ""2013-01-..."
9998,"{ ""Transaction_ID"": 40933, ""date"": ""2013-01-..."


In [None]:
%%bigquery
update retails_stg.Transactions set holiday =
  (select json_value(ml_generate_text_llm_result, '$.holiday')
   from retails_stg_ai.holiday_predictions_formatted_10k
   where Transaction_ID = cast(json_value(ml_generate_text_llm_result, '$.Transaction_ID') as int64))
where 1=1

Query is running:   0%|          |

In [None]:
%%bigquery
select count(*) as holiday_count
from retails_stg.Transactions
where holiday is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,holiday_count
0,9980


In [None]:
%%bigquery
select on_promotion, holiday, count(*) as count
from retails_stg.Transactions
where on_promotion is not null
group by on_promotion, holiday
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,on_promotion,holiday,count
0,False,,2379579
1,True,,611329
2,False,New Year's Day,2883
3,False,Presidents Day,1693
4,False,Martin Luther King Jr. Day,1461
5,False,Chinese New Year,935
6,False,Lincoln's Birthday,740
7,False,Groundhog Day,657
8,False,Valentine's Day,390
9,False,Australia Day,378


In [None]:
%%bigquery
update retails_stg.Transactions
  set data_source = 'kaggle' where holiday is not null

Query is running:   0%|          |

### Part 4: Merge changes into target table

####Mrds table

In [21]:
%%bigquery
create or replace table retails_csp.Mrds_copy as select * from retails_csp.Mrds

Query is running:   0%|          |

In [22]:
%%bigquery
alter table retails_csp.Mrds
  add column county_predictions string;

Query is running:   0%|          |

In [23]:
%%bigquery
alter table retails_csp.Mrds
  add column mineral_type string;

Query is running:   0%|          |

In [26]:
%%bigquery
declare current_ts TIMESTAMP;
set current_ts = current_timestamp();

create temp table updates as
  select s.*
  from retails_csp.Mrds t join retails_stg.Mrds s
  on t.Dep_ID = s.Dep_ID
  where s.county_predictions != t.county_predictions
  or s.mineral_type != t.mineral_type;

update retails_csp.Mrds
set discontinue_time = timestamp_sub(current_ts, interval 1 second), status_flag = false
where Dep_ID in (select Dep_ID from updates);

insert into retails_csp.Mrds
  (Dep_id, url, mrds_id, mas_id, site_name, latitude, longitude, region,
    country, state, county, com_type, commod1, commod2, commod3, mineral_type, county_predictions, data_source, load_time, effective_time, status_flag)
    (select Dep_id, url, mrds_id, mas_id, site_name, latitude, longitude, region,
      country, state, county, com_type, commod1, commod2, commod3, mineral_type, county_predictions, data_source, load_time, current_ts, true
      from updates);

Query is running:   0%|          |

In [27]:
%%bigquery
select count(*) as num_records
from retails_csp.Mrds

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,304632


####Transactions table

In [28]:
%%bigquery
create or replace table retails_csp.Transactions_copy as select * from retails_csp.Transactions

Query is running:   0%|          |

In [29]:
%%bigquery
alter table retails_csp.Transactions
  add column holiday string;

Query is running:   0%|          |

In [35]:
%%bigquery
declare current_ts TIMESTAMP;
set current_ts = current_timestamp();

create temp table updates as
  select s.*
  from retails_csp.Transactions t join retails_stg.Transactions s
  on t.Transaction_ID = s.Transaction_ID
  where t.status_flag = true
  and (s.Transaction_ID is not null and t.Transaction_ID is null
  or s.holiday is not null and t.holiday is null);

update retails_csp.Transactions
set discontinue_time = timestamp_sub(current_ts, interval 1 second), status_flag = false
where Transaction_ID in (select Transaction_ID from updates);

insert into retails_csp.Transactions
  (Transaction_ID, date, Day, Month, Year, store_nbr, family, sales_amount,
    on_promotion, holiday, data_source, load_time, effective_time, status_flag)
    (select Transaction_ID, date, Day, Month, Year, store_nbr, family, sales_amount,
    on_promotion, holiday, data_source, load_time, current_ts, true
      from updates);

Query is running:   0%|          |

In [36]:
%%bigquery
select count(*) as num_records
from retails_csp.Transactions

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,3000888
