# 環境設定

In [1]:
import os
from google.cloud import bigquery

# SET GCP 金鑰
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=r"/content/tibame-gad253-14-bigquery-c5993cbf5beb.json"

# 初始化BigQuery客戶端
client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

Client creating using default project: tibame-gad253-14-bigquery


In [2]:
# 設定資料集
dataset_id = 'tibame_gad253_14_dataset_python'
us_dataset_id = 'tibame_gad253_14_dataset_us'
eu_dataset_id = 'tibame_gad253_14_dataset_eu'

# 聚類模型

In [5]:
kmeans_model='london_station_clusters'
# 定義SQL查詢建立模型
query=f"""
CREATE OR REPLACE MODEL `{eu_dataset_id}.{kmeans_model}`
OPTIONS
  (model_type='kmeans',
  num_clusters = 4
  ) AS
WITH
  hs AS(
    SELECT
      h.start_station_name AS station_name,
      IF(EXTRACT(DAYOFWEEK FROM h.start_date) = 1 OR EXTRACT(DAYOFWEEK FROM h.start_date) = 7, "weekend", "weekday") AS isweekday,
      h.duration,
      ST_DISTANCE(ST_GEOGPOINT(s.longitude, s.latitude), ST_GEOGPOINT(-0.1, 51.5))/1000 AS distance_from_city_center
    FROM
      `bigquery-public-data.london_bicycles.cycle_hire` AS h
    JOIN
      `bigquery-public-data.london_bicycles.cycle_stations` AS s
    ON
      h.start_station_id = s.id
    WHERE
      h.start_date BETWEEN CAST('2015-01-01 00:00:00' AS TIMESTAMP) AND CAST('2016-01-01 00:00:00' AS TIMESTAMP)
  ),
  stationstats AS (
    SELECT
      station_name,
      isweekday,
      AVG(duration) AS duration,
      COUNT(duration) AS num_trips,
      MAX(distance_from_city_center) AS distance_from_city_center
    FROM
      hs
    GROUP BY
      station_name, isweekday
  )

  SELECT
  *
  FROM
  stationstats
  ORDER BY
  distance_from_city_center ASC
"""
# 印出查詢
print(query)

# 執行SQL查詢
query_job=client.query(query)
query_job.result()


CREATE OR REPLACE MODEL `tibame_gad253_14_dataset_eu.london_station_clusters`
OPTIONS
  (model_type='kmeans',
  num_clusters = 4
  ) AS
WITH
  hs AS(
    SELECT
      h.start_station_name AS station_name,
      IF(EXTRACT(DAYOFWEEK FROM h.start_date) = 1 OR EXTRACT(DAYOFWEEK FROM h.start_date) = 7, "weekend", "weekday") AS isweekday,
      h.duration,
      ST_DISTANCE(ST_GEOGPOINT(s.longitude, s.latitude), ST_GEOGPOINT(-0.1, 51.5))/1000 AS distance_from_city_center
    FROM
      `bigquery-public-data.london_bicycles.cycle_hire` AS h
    JOIN
      `bigquery-public-data.london_bicycles.cycle_stations` AS s
    ON
      h.start_station_id = s.id
    WHERE
      h.start_date BETWEEN CAST('2015-01-01 00:00:00' AS TIMESTAMP) AND CAST('2016-01-01 00:00:00' AS TIMESTAMP)
  ),
  stationstats AS (
    SELECT
      station_name,
      isweekday,
      AVG(duration) AS duration,
      COUNT(duration) AS num_trips,
      MAX(distance_from_city_center) AS distance_from_city_center
    FROM
     

<google.cloud.bigquery.table._EmptyRowIterator at 0x7ba148661010>

# 測試聚類模型

In [9]:
# 定義SQL測試聚類模型
query=f"""
WITH
  hs AS (
    SELECT
      h.start_station_name AS station_name,
      IF(EXTRACT(DAYOFWEEK FROM h.start_date) = 1 OR EXTRACT(DAYOFWEEK FROM h.start_date) = 7, "weekend", "weekday") AS isweekday,
      h.duration,
      ST_DISTANCE(ST_GEOGPOINT(s.longitude, s.latitude), ST_GEOGPOINT(-0.1, 51.5))/1000 AS distance_from_city_center
    FROM
      `bigquery-public-data.london_bicycles.cycle_hire` AS h
    JOIN
      `bigquery-public-data.london_bicycles.cycle_stations` AS s
    ON
      h.start_station_id = s.id
    WHERE
      h.start_date BETWEEN CAST('2022-01-01 00:00:00' AS TIMESTAMP) AND CAST('2023-01-01 00:00:00' AS TIMESTAMP)
  ),
  stationstats AS (
    SELECT
      station_name,
      isweekday,
      AVG(duration) AS duration,
      COUNT(duration) AS num_trips,
      MAX(distance_from_city_center) AS distance_from_city_center
    FROM
      hs
    GROUP BY
      station_name, isweekday
  )
SELECT
  * EXCEPT(nearest_centroids_distance)
FROM
  ML.PREDICT( MODEL `{eu_dataset_id}.{kmeans_model}`,
    (
    SELECT
      *
    FROM
      stationstats
    WHERE
      REGEXP_CONTAINS(station_name, 'Kennington')))
      ORDER BY CENTROID_ID

"""
# 印出查詢
print(query)
# 執行查詢
query_job=client.query(query)
results=query_job.result()
# 將results的row解成tuple
rows_data= [tuple(row.values()) for row in results]
# 抓出results欄位名稱
columns=[field.name for field in results.schema]
# import panda
import pandas as pd
# 顯示資料
df = pd.DataFrame(rows_data,columns=columns)
# 取消自動截斷
pd.set_option('display.max_colwidth', None)
df


WITH
  hs AS (
    SELECT
      h.start_station_name AS station_name,
      IF(EXTRACT(DAYOFWEEK FROM h.start_date) = 1 OR EXTRACT(DAYOFWEEK FROM h.start_date) = 7, "weekend", "weekday") AS isweekday,
      h.duration,
      ST_DISTANCE(ST_GEOGPOINT(s.longitude, s.latitude), ST_GEOGPOINT(-0.1, 51.5))/1000 AS distance_from_city_center
    FROM
      `bigquery-public-data.london_bicycles.cycle_hire` AS h
    JOIN
      `bigquery-public-data.london_bicycles.cycle_stations` AS s
    ON
      h.start_station_id = s.id
    WHERE
      h.start_date BETWEEN CAST('2022-01-01 00:00:00' AS TIMESTAMP) AND CAST('2023-01-01 00:00:00' AS TIMESTAMP)
  ),
  stationstats AS (
    SELECT
      station_name,
      isweekday,
      AVG(duration) AS duration,
      COUNT(duration) AS num_trips,
      MAX(distance_from_city_center) AS distance_from_city_center
    FROM
      hs
    GROUP BY
      station_name, isweekday
  )
SELECT
  * EXCEPT(nearest_centroids_distance)
FROM
  ML.PREDICT( MODEL `tibame_gad25

Unnamed: 0,CENTROID_ID,station_name,isweekday,duration,num_trips,distance_from_city_center
0,1,"Kennington Road Post Office, Oval",weekend,1371.58041,2879,1.846033
1,1,"Kennington Station, Kennington",weekend,1241.718093,2852,1.298668
2,1,"Kennington Oval, Oval",weekend,1218.290676,2188,2.083134
3,1,"Doddington Grove, Kennington",weekend,1325.194249,3269,1.468141
4,1,"Kennington Road , Vauxhall",weekend,1176.327543,2821,0.891565
5,1,"Cleaver Street, Kennington",weekend,888.353949,1051,1.496792
6,1,"Cotton Garden Estate, Kennington",weekend,1447.442371,1822,1.117034
7,1,"Kennington Cross, Kennington",weekend,1104.610169,4130,1.462588
8,4,"Cotton Garden Estate, Kennington",weekday,1220.839347,5266,1.117034
9,4,"Doddington Grove, Kennington",weekday,1082.736,8750,1.468141
