### Ingest data from BigQuery → generate statistics → infer schema → detect anomalies. Copy these cells in order.

#### Imports & minimal configs

In [1]:
import os

import numpy as np
import pandas as pd
import tfx
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft
import tensorflow_model_analysis as tfma
import google.auth
from google.cloud import bigquery
from tfx.orchestration.experimental.interactive import interactive_context
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.v1.extensions.google_cloud_big_query import BigQueryExampleGen
from tfx.v1.components import StatisticsGen, SchemaGen, ExampleValidator


# Reaching this line only shows that every import in this cell succeeded;
# no package versions are compared or validated here.
print("Environment is consistent")


2025-11-11 01:02:34.338047: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 01:02:34.343254: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 01:02:34.359526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-11 01:02:34.391989: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-11 01:02:34.392046: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-11 01:02:34.412154: I tensorflow/core/platform/cpu_feature_guard.cc:

Environment is consistent


  from google.cloud.aiplatform.utils import gcs_utils


In [2]:
# --- GCP project / region -----------------------------------------------
PROJECT_ID = "uci-bank-marketing-ml-lab"
GCP_REGION = "us-central1"

# --- BigQuery source table (lives in the same project) -------------------
BQ_PROJECT = PROJECT_ID
BQ_DATASET = "bank_data"
BQ_TABLE = "uci_bank_marketing"

# --- Local TFX artifact root: <cwd>/tfx_local/<pipeline name> -------------
PIPELINE_NAME = "bq_classifier_validation"
LOCAL_PIPELINE_ROOT = os.path.join(os.getcwd(), "tfx_local", PIPELINE_NAME)
os.makedirs(LOCAL_PIPELINE_ROOT, exist_ok=True)  # no-op when it already exists

# Echo the effective configuration so the recorded output documents the run.
for _label, _value in (
    ("Project:", PROJECT_ID),
    ("BQ table:", ".".join((BQ_PROJECT, BQ_DATASET, BQ_TABLE))),
    ("Local pipeline root:", LOCAL_PIPELINE_ROOT),
):
    print(_label, _value)

Project: uci-bank-marketing-ml-lab
BQ table: uci-bank-marketing-ml-lab.bank_data.uci_bank_marketing
Local pipeline root: /home/jupyter/tfx_local/bq_classifier_validation


#### InteractiveContext

In [3]:
# Create the interactive TFX session that runs components one cell at a time.
# PIPELINE_NAME is passed explicitly: when pipeline_name is omitted,
# InteractiveContext generates a name for ML Metadata tracking on its own
# (see the TFX InteractiveContext docs), which left PIPELINE_NAME used only
# for building the directory path.
context = InteractiveContext(
    pipeline_name=PIPELINE_NAME,
    pipeline_root=LOCAL_PIPELINE_ROOT,
)
print("InteractiveContext created with pipeline_root:", context.pipeline_root)

InteractiveContext created with pipeline_root: /home/jupyter/tfx_local/bq_classifier_validation




### Data Ingestion 

#### Create & run BigQueryExampleGen

In [4]:
# Export the project id under each environment variable name used below.
# NOTE(review): presumably these let Google Cloud clients / Beam resolve the
# default project -- confirm which of the three names are actually required.
for _env_key in ("GOOGLE_CLOUD_PROJECT", "PROJECT_ID", "GCLOUD_PROJECT"):
    os.environ[_env_key] = PROJECT_ID


In [5]:
# GCS path handed to Beam as its --temp_location.
TMP_LOCATION = "gs://uci-bank-ml-bucket/tfx/tmp"

# Beam flags as an ordered mapping, rendered to "--flag=value" strings.
# No --runner flag is set here.
_beam_options = {
    "project": PROJECT_ID,
    "region": GCP_REGION,
    "temp_location": TMP_LOCATION,
}
beam_pipeline_args = [f"--{flag}={value}" for flag, value in _beam_options.items()]

In [6]:
# Fully-qualified source table: <project>.<dataset>.<table>.
_source_table = ".".join([BQ_PROJECT, BQ_DATASET, BQ_TABLE])

# Ingest every column and row of the table. The text of this query is recorded
# verbatim as the component's input pattern, so keep its layout stable.
query = f"""
SELECT *
FROM `{_source_table}`
"""

# No split configuration is passed; the recorded run shows "train" and "eval"
# splits with hash buckets 2 and 1.
example_gen = BigQueryExampleGen(query=query)

# Keep the run call as the last expression so the notebook renders the
# execution result.
context.run(example_gen, beam_pipeline_args=beam_pipeline_args)






0,1
.execution_id,7
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } BigQueryExampleGen at 0x7f3308cb75e0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f325ecde920.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0.exec_properties['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT *\nFROM `uci-bank-marketing-ml-lab.bank_data.uci_bank_marketing`\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']None"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f325ecde920.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f325ecde920.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT *\nFROM `uci-bank-marketing-ml-lab.bank_data.uci_bank_marketing`\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']None"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f325ecde920.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT *\nFROM `uci-bank-marketing-ml-lab.bank_data.uci_bank_marketing`\n""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f325ecde920.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7) at 0x7f3308cb5ba0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/home/jupyter/tfx_local/bq_classifier_validation/BigQueryExampleGen/examples/7
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


In [7]:
# Quick look at the raw source table, independent of the TFX artifacts.
# `bigquery` is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
client = bigquery.Client(project=PROJECT_ID)

# Same table as the ingestion query, capped at 20 rows to keep the preview cheap.
preview_query = f"""
SELECT *
FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
LIMIT 20
"""

# Run the query and load the rows into a pandas DataFrame; the bare name on
# the last line renders it as the cell output.
df_preview = client.query(preview_query).to_dataframe()
df_preview

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False
9,25,services,single,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,False


### Generate statistics (TFDV)

In [8]:
# Feed the examples produced by the ingestion component into StatisticsGen.
_ingested_examples = example_gen.outputs["examples"]
statistics_gen = StatisticsGen(examples=_ingested_examples)

context.run(statistics_gen, beam_pipeline_args=beam_pipeline_args)

# Render the resulting statistics artifact inline.
_statistics_channel = statistics_gen.outputs["statistics"]
context.show(_statistics_channel)

### Infer schema

In [9]:
# Infer a schema from the statistics computed above.
# NOTE(review): no splits are excluded here, so the schema is presumably
# inferred from both "train" and "eval" statistics -- confirm that is intended
# before relying on the anomaly check further down.
_statistics_channel = statistics_gen.outputs["statistics"]
schema_gen = SchemaGen(
    infer_feature_shape=True,
    statistics=_statistics_channel,
)

context.run(schema_gen, beam_pipeline_args=beam_pipeline_args)

# Render the inferred schema (feature table and string domains) inline.
_schema_channel = schema_gen.outputs["schema"]
context.show(_schema_channel)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'campaign',INT,required,,-
'cons_conf_idx',FLOAT,required,,-
'cons_price_idx',FLOAT,required,,-
'contact',STRING,required,,'contact'
'day_of_week',STRING,required,,'day_of_week'
'default',STRING,required,,'default'
'duration',INT,required,,-
'education',STRING,required,,'education'
'emp_var_rate',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'contact',"'cellular', 'telephone'"
'day_of_week',"'fri', 'mon', 'thu', 'tue', 'wed'"
'default',"'no', 'unknown', 'yes'"
'education',"'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown'"
'housing',"'no', 'unknown', 'yes'"
'job',"'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown'"
'loan',"'no', 'unknown', 'yes'"
'marital',"'divorced', 'married', 'single', 'unknown'"
'month',"'apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'"
'poutcome',"'failure', 'nonexistent', 'success'"


### Validate data

In [10]:
# Compare the computed statistics against the inferred schema.
_validator_inputs = {
    "statistics": statistics_gen.outputs["statistics"],
    "schema": schema_gen.outputs["schema"],
}
example_validator = ExampleValidator(**_validator_inputs)

context.run(example_validator, beam_pipeline_args=beam_pipeline_args)

# Render the anomalies artifact inline.
_anomalies_channel = example_validator.outputs["anomalies"]
context.show(_anomalies_channel)