In [23]:
import pandas as pd
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [24]:
# Define the Source data 
BQ_PROJECT = 'myproject'
BQ_DATASET = 'personal_sauravanand'
BQ_TABLE = 'iris'

In [25]:
bq = bigquery.Client()
BQ_SOURCE = f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}'

In [26]:
# Python Client 

query = f"""
    SELECT * 
    FROM `{BQ_SOURCE}`
"""
data = bq.query(query = query).to_dataframe()

In [27]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,99,5.1,2.5,3.0,1.1,Iris-versicolor
1,61,5.0,2.0,3.5,1.0,Iris-versicolor
2,80,5.7,2.6,3.5,1.0,Iris-versicolor
3,63,6.0,2.2,4.0,1.0,Iris-versicolor
4,93,5.8,2.6,4.0,1.2,Iris-versicolor


In [28]:
# Split the data into features and target 

X = data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = data['Species']

In [29]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Write the training data to BigQuery
train_data_path = 'myproject.personal_sauravanand.train_data'
X_train['Species'] = y_train
X_train.to_gbq(train_data_path, project_id=BQ_PROJECT, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 1957.21it/s]


# Logistic Regression

In [35]:
# Train a classification model using BigQuery ML
model_query = """
CREATE OR REPLACE MODEL personal_sauravanand.iris_classification_model
OPTIONS(model_type='logistic_reg',input_label_cols=['Species']) AS
SELECT *
FROM `{0}`
""".format(train_data_path)

bq.query(model_query).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f67657ef190>

In [37]:
# Evaluate the model
evaluation_query = """
SELECT
  *
FROM
  ML.EVALUATE(MODEL personal_sauravanand.iris_classification_model,
    (
    SELECT *
    FROM `{0}`
    ))
""".format(train_data_path)

evaluation_result = bq.query(evaluation_query).to_dataframe()
print(evaluation_result)

   precision    recall  accuracy  f1_score  log_loss  roc_auc
0   0.992593  0.990741  0.991667   0.99156  0.059278      1.0


# Random Forest

In [40]:
# Train a classification model using BigQuery ML
model_query_rf = """
CREATE OR REPLACE MODEL personal_sauravanand.iris_classification_model_rf
OPTIONS(model_type='random_forest_classifier',input_label_cols=['Species']) AS
SELECT *
FROM `{0}`
""".format(train_data_path)

bq.query(model_query_rf).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f676634dc60>

In [41]:
# Evaluate the model
evaluation_query = """
SELECT
  *
FROM
  ML.EVALUATE(MODEL personal_sauravanand.iris_classification_model_rf,
    (
    SELECT *
    FROM `{0}`
    ))
""".format(train_data_path)

evaluation_result = bq.query(evaluation_query).to_dataframe()
print(evaluation_result)

   precision    recall  accuracy  f1_score  log_loss  roc_auc
0    0.97423  0.975589     0.975  0.974807  0.267127      1.0


# Other Classification Algorithms

Similar way we can try with other classification  algorithms like :- 

- BOOSTED_TREE_CLASSIFIER
- DNN_CLASSIFIER (Deep Neural Network Classifier)
