# 定義環境

In [None]:
!pip install apache-beam[gcp,dataframe] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.5/173.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.7/215.7 kB[0m [31m16.7 MB/s[0m eta [36m0:0

In [None]:
import pickle
from sklearn import linear_model
from typing import Tuple

import numpy as np
import apache_beam as beam

from apache_beam.ml.inference.sklearn_inference import ModelFileType
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy
from apache_beam.ml.inference.base import KeyedModelHandler
from apache_beam.ml.inference.base import PredictionResult
from apache_beam.ml.inference.base import RunInference
from apache_beam.options.pipeline_options import PipelineOptions

In [6]:
import os
# Notebook-wide constants: the Google Cloud project ID and the Cloud Storage
# bucket name that later cells interpolate into resource paths.
project, bucket = (
    "tibame-gad253-14-dataflow",
    "tibame-gad253-14-bucket-tw",
)

# Publish the project ID through the GOOGLE_CLOUD_PROJECT environment variable.
os.environ.update(GOOGLE_CLOUD_PROJECT=project)

In [5]:
from google.colab import auth
# Authenticate the current Colab user.
# NOTE(review): presumably this supplies the credentials used by the gcloud,
# BigQuery and gs:// calls in the later cells — confirm.
auth.authenticate_user()

In [None]:
# 測試 列出值區
!gcloud storage buckets list --project tibame-gad253-14-dataflow

# 建立Scikit-learn模型

In [8]:
# Build the training data for the sklearn model (the 5-times table):
# inputs 0..99 as a float32 column vector, targets equal to five times each input.
x = np.arange(100, dtype=np.float32)[:, np.newaxis]
y = 5 * x

def train_and_save_model(x, y, model_file_name):
  """Fit a linear regression on (x, y) and pickle it to ``model_file_name``.

  Args:
    x: Training inputs, passed unchanged to ``LinearRegression.fit``.
    y: Training targets, passed unchanged to ``LinearRegression.fit``.
    model_file_name: Path of the output file. It is opened in binary write
      mode, so an existing file at that path is overwritten.
  """
  model = linear_model.LinearRegression()
  model.fit(x, y)

  with open(model_file_name, 'wb') as model_file:
    pickle.dump(model, model_file)

# Train and save the 5-times-table model (y holds the x * 5 targets here).
five_times_model_filename = 'sklearn_5x_model.pkl'
train_and_save_model(x, y, five_times_model_filename)

# Train and save the 10-times-table model. The targets are rebuilt as x * 10
# *before* fitting, so the 10x file is only ever written with a model that was
# trained on the 10x targets, and the file name is taken from the variable
# instead of being repeated as a literal.
ten_times_model_filename = 'sklearn_10x_model.pkl'
y = (x * 10).reshape(-1, 1)
train_and_save_model(x, y, ten_times_model_filename)

# 準備BigQuery資料來源

In [9]:
%pip install --upgrade google-cloud-bigquery --quiet

In [10]:
!gcloud config set project $project

Updated property [core/project].


In [11]:
## 將資料填入 BigQuery 資料表

from google.cloud import bigquery

# BigQuery client bound to the notebook's project.
client = bigquery.Client(project=project)

# Dataset ID in "<project>.maths" form; make sure it is unique within the project.
dataset_id = f'{project}.maths'
dataset = bigquery.Dataset(dataset_id)

# Dataset location; change it to match the project's setup.
dataset.location = 'asia-east1'
# exists_ok=True is passed so that an already-existing dataset is tolerated
# when the cell is re-run.
dataset = client.create_dataset(dataset, exists_ok=True)

# Name of the table inside the BigQuery dataset.
table_name = 'maths_problems_1'

# Two-statement SQL script: (re)create the table with a string key and a
# float value column, then insert four sample rows.
sql_template = """
CREATE OR REPLACE TABLE
  {project}.maths.{table} (
    key STRING OPTIONS(description="A unique key for the maths problem"),
    value FLOAT64 OPTIONS(description="Our maths problem")
  );

INSERT INTO {project}.maths.{table}
VALUES
  ("first_example", 105.00),
  ("second_example", 108.00),
  ("third_example", 1000.00),
  ("fourth_example", 1013.00);
"""
query = sql_template.format(project=project, table=table_name)

# Submit the script and fetch its result; as the last expression of the cell,
# the returned value is also what the notebook displays.
create_job = client.query(query)
create_job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7d324271bf50>

# 建立測試管道

In [12]:
# Model handler pointing RunInference at the pickled 5x model file.
sklearn_model_handler = SklearnModelHandlerNumpy(model_uri=five_times_model_filename)

# Pipeline options carrying only the temp location in the GCS bucket.
# from_dictionary is a classmethod, so it is called on the class itself rather
# than on a throwaway PipelineOptions() instance.
pipeline_options = PipelineOptions.from_dictionary({'temp_location': f'gs://{bucket}/tmp'})

# BigQuery table spec in "<project>:<dataset>.<table>" form.
table_name = 'maths_problems_1'
table_spec = f'{project}:maths.{table_name}'

with beam.Pipeline(options=pipeline_options) as p:
  (
      p
      | "從 BigQuery 讀取資料" >> beam.io.ReadFromBigQuery(table=table_spec)
      # Wrap each row's `value` in a one-element list so every example is a
      # single-feature vector. The parameter is named `row` (not `x`) so it
      # does not shadow the notebook-level training array `x`.
      | "提取輸入值" >> beam.Map(lambda row: [row['value']])
      | "執行 Sklearn 推論" >> RunInference(model_handler=sklearn_model_handler)
      # Print every PredictionResult produced by the model.
      | beam.Map(print)
  )





PredictionResult(example=[1013.0], inference=array([5065.]), model_id='sklearn_5x_model.pkl')
PredictionResult(example=[1000.0], inference=array([5000.]), model_id='sklearn_5x_model.pkl')
PredictionResult(example=[105.0], inference=array([525.]), model_id='sklearn_5x_model.pkl')
PredictionResult(example=[108.0], inference=array([540.]), model_id='sklearn_5x_model.pkl')


# 輸入 key-value 於 KeyedModelHandler

In [None]:
# Wrap the numpy model handler in a KeyedModelHandler so the pipeline can feed
# it (key, example) tuples instead of bare examples.
sklearn_model_handler = SklearnModelHandlerNumpy(model_uri=five_times_model_filename)
keyed_sklearn_model_handler = KeyedModelHandler(sklearn_model_handler)

# Pipeline options carrying only the temp location in the GCS bucket.
# from_dictionary is a classmethod, so it is called on the class itself rather
# than on a throwaway PipelineOptions() instance.
pipeline_options = PipelineOptions.from_dictionary({'temp_location': f'gs://{bucket}/tmp'})

with beam.Pipeline(options=pipeline_options) as p:
  (
      p
      | "從 BigQuery 讀取資料" >> beam.io.ReadFromBigQuery(table=table_spec)
      # Emit (key, [value]) tuples: the row's `key` column travels alongside
      # its one-element feature vector. The parameter is named `row` (not `x`)
      # so it does not shadow the notebook-level training array `x`.
      | "提取輸入值" >> beam.Map(lambda row: (row['key'], [row['value']]))
      | "執行 Sklearn 推論" >> RunInference(model_handler=keyed_sklearn_model_handler)
      # Print every keyed result produced by the model.
      | beam.Map(print)
  )

# 多模型測試

In [14]:
from typing import Tuple

def format_output(run_inference_output) -> str:
  """Render one keyed RunInference result as a human-readable line.

  Args:
    run_inference_output: A two-item ``(key, prediction_result)`` pair. The
      second item must expose ``example`` and ``inference`` attributes; only
      the first element of each is used.

  Returns:
    A string containing the key, the first example value and the first
    predicted value.
  """
  key, result = run_inference_output
  example, prediction = result.example[0], result.inference[0]
  return f"鍵值 = {key}, 範例 = {example} -> 預測結果 {prediction}"

# One keyed handler per pickled model file; KeyedModelHandler wraps the numpy
# handler so each branch can feed it (key, example) tuples.
five_times_model_handler = KeyedModelHandler(
    SklearnModelHandlerNumpy(model_uri=five_times_model_filename))
ten_times_model_handler = KeyedModelHandler(
    SklearnModelHandlerNumpy(model_uri=ten_times_model_filename))

# Pipeline options carrying only the temp location in the GCS bucket.
# from_dictionary is a classmethod, so it is called on the class itself rather
# than on a throwaway PipelineOptions() instance.
pipeline_options = PipelineOptions.from_dictionary({'temp_location': f'gs://{bucket}/tmp'})

with beam.Pipeline(options=pipeline_options) as p:
  # Read the table once; both model branches consume the same rows.
  inputs = (p
    | "從 BigQuery 讀取資料" >> beam.io.ReadFromBigQuery(table=table_spec))

  # Branch 1: suffix the key with "* 5" and run the 5x model. The lambda
  # parameter is named `row` (not `x`) so it does not shadow the
  # notebook-level training array `x`.
  five_times = (inputs
    | "提取 5 倍運算" >> beam.Map(lambda row: ('{} {}'.format(row['key'], '* 5'), [row['value']]))
    | "執行 5 倍推論" >> RunInference(model_handler = five_times_model_handler))

  # Branch 2: suffix the key with "* 10" and run the 10x model.
  ten_times = (inputs
    | "提取 10 倍運算" >> beam.Map(lambda row: ('{} {}'.format(row['key'], '* 10'), [row['value']]))
    | "執行 10 倍推論" >> RunInference(model_handler = ten_times_model_handler))

  # Merge both branches into one collection, format each result and print it.
  _ = ((five_times, ten_times)
    | "合併結果" >> beam.Flatten()
    | "格式化輸出" >> beam.Map(format_output)
    | "輸出結果" >> beam.Map(print))



鍵值 = fourth_example * 10, 範例 = 1013.0 -> 預測結果 10130.0
鍵值 = third_example * 10, 範例 = 1000.0 -> 預測結果 10000.0
鍵值 = first_example * 10, 範例 = 105.0 -> 預測結果 1050.0
鍵值 = second_example * 10, 範例 = 108.0 -> 預測結果 1080.0
鍵值 = fourth_example * 5, 範例 = 1013.0 -> 預測結果 5065.0
鍵值 = third_example * 5, 範例 = 1000.0 -> 預測結果 5000.0
鍵值 = first_example * 5, 範例 = 105.0 -> 預測結果 525.0
鍵值 = second_example * 5, 範例 = 108.0 -> 預測結果 540.0
