### **Import packages**

In [None]:
from google.cloud import bigquery
import time
from vertexai.language_models import TextEmbeddingModel

### **Step1 : Create Train and Test Data Tables in BigQuery**

### **Step 2: Create Embeddings**


- **Create Embedding Model in BigQuery**

In [2]:
def create_embeddings_model(client,
                        project_id,
                        dataset_id, 
                        model_name,
                        connection_id):
    """
    Create a text embeddings model in BigQuery
    See https://cloud.google.com/blog/products/data-analytics/introducing-bigquery-text-embeddings for additional info
    Note: current model is set to 'textembedding-gecko@003', to update this model change it in the ENDPOINT BELOW
    
    Args:
        client: A BigQuery client object.
        project_id: The ID of the project to save the model,
        dataset_id: The ID of the dataset to save the model.
        model_name: The name to save the text embedding model under.
    """
    ## UPDATE MODEL VERSION BY CHANGING ENDPOINT BELOW
    query = f"""
    CREATE MODEL `{project_id}.{dataset_id}.{model_name}`
    REMOTE WITH CONNECTION `{project_id}.{connection_id}`
    OPTIONS(ENDPOINT='textembedding-gecko@003');
    """

    query_job = client.query(query)
    while query_job.done()!=True: 
        print('waiting for bigquery') 
        time.sleep(10) 
    if query_job.errors==None:
        print(f'model successfully created')
    else:
        print('model creation unsuccessful with errors:',query_job.errors)

In [3]:
def create_embedding(text) -> list:
    """
    Turn text a single string of text into embeddings
    Input: a single string of text
    Returns: a list of embeddings corresponding to the text 
    """
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
    embeddings = model.get_embeddings([text])
    for embedding in embeddings:
        vector = embedding.values
    return vector

- **Generate Embeddings for both train and test data**

In [4]:
def generate_embeddings(client,
                        project_id,
                        dataset_id, 
                        input_table_name, 
                        output_table_name, 
                        uids,
                        target_attribute,
                        model_name='us-gcp-ame-con-52dbb-sbx-1.language_models.palm2_embeddings'):
    """
    Converts text to embeddings at scale using BigQuery ML.GENERATE_TEXT_EMBEDDING,
    Input: a data frame with a column containing multiple rows each with text
    Returns a data frame with embeddings for each row

    Args:
        client: A BigQuery client object.
        project_id: The ID of the project containincontaining the datasets,tables
        dataset_id: The ID of the dataset containing the input, output tables, and model.
        input_table_name: The name of the table containing the text content.
        output_table_name: The name of the table to create with the embeddings.
        model_name: The full name of the text embedding model within the dataset (optional, defaults to 'language_models.palm2_embeddings').
    """

    query = f"""
            CREATE OR REPLACE TABLE `{project_id}.{dataset_id}.{output_table_name}` AS 
            SELECT {','.join(uids)}, content as {target_attribute}, text_embedding
    FROM ML.GENERATE_TEXT_EMBEDDING(
        MODEL `{model_name}`,
        (SELECT {','.join(uids)}, {target_attribute} as content FROM `{project_id}.{dataset_id}.{input_table_name}`),
        STRUCT(TRUE AS flatten_json_output)
    )
    """

    query_job = client.query(query)
    while query_job.done()!=True: 
        print('waiting for bigquery') 
        time.sleep(10) 
    if query_job.errors==None:
        print(f"Embeddings generated successfully in table `{dataset_id}.{output_table_name}`")
    else:
        print('embeddings generation unsuccessful with errors:',query_job.errors)

In [5]:
client = bigquery.Client()

In [6]:
# create_embeddings_model(client=client,
#                         project_id='us-gcp-ame-con-52dbb-sbx-1',
#                         dataset_id='language_models', 
#                         model_name='palm2_embeddings',
#                         connection_id='us.dataquality')

In [7]:
# # generate embeddings for train data
# generate_embeddings(client=client,
#                     project_id='us-gcp-ame-con-52dbb-sbx-1',
#                     dataset_id='test_datasets',
#                     input_table_name='outlier_train',
#                     output_table_name='outlier_train_emb',
#                     uids=['asin','category'], 
#                     target_attribute='title')

In [8]:
# # generate embeddings for test data
# generate_embeddings(client=client,
#                     project_id='us-gcp-ame-con-52dbb-sbx-1',
#                     dataset_id='test_datasets',
#                     input_table_name='sample_data_gdrive',
#                     output_table_name='sample_data_emb',
#                     uids=['asin','category'], 
#                     target_attribute='title')


## **Step 3 : Text Outlier Detection:**

- **Class: Implement or import an OutlierDetector class for your chosen method (e.g., Isolation Forest).**
    
- **Training: Train the outlier detection model using the embeddings table for the training data.**
    
- **Identification: Apply the trained model to the embeddings table for the test data to identify potential outliers.**
    
- **Analysis: Analyze the identified outliers based on the original text data for insights and further actions.**

In [9]:
from sqlalchemy import *
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *
import datetime
from modules.outlier_detection import TextOutlierDetector
from modules.dq_engine import DqEngine

In [10]:
engine = create_engine('bigquery://us-gcp-ame-con-52dbb-sbx-1')
dqe = DqEngine(engine)
detector = TextOutlierDetector(engine)

  engine = create_engine('bigquery://us-gcp-ame-con-52dbb-sbx-1')


In [11]:
train_embeddings = detector.load_train(schema_name='test_datasets',
                                       table_name='outlier_train_emb',
                                       uids=['asin','category','title'], 
                                       target_attribute='title',
                                       embeddings='text_embedding')
train_embeddings.head(10)

Unnamed: 0,asin,category,title,TARGET_ATTRIBUTE_catalog_value,text_embedding
0,B01936EMBO,Prime_Pantry,"KIT KAT Chocolate Candy Bar, 4.5 Ounce","KIT KAT Chocolate Candy Bar, 4.5 Ounce","[0.03588782250881195, -0.03371730074286461, -0..."
1,B00IAE94BY,Prime_Pantry,"Post Shredded Wheat Original Breakfast Cereal,...","Post Shredded Wheat Original Breakfast Cereal,...","[0.010123880580067635, -0.03477214649319649, -..."
2,B00ZX1QHYO,Prime_Pantry,"Streit's Matzos, Egg and Onion, 11 oz","Streit's Matzos, Egg and Onion, 11 oz","[-0.001863841898739338, -0.0476984977722168, -..."
3,B0105MIRIE,Prime_Pantry,"Kashi, Breakfast Cereal, Organic Indigo Mornin...","Kashi, Breakfast Cereal, Organic Indigo Mornin...","[0.03369826450943947, -0.0232285987585783, -0...."
4,B00UQPPD24,Prime_Pantry,"Kool-Aid Flavored Drink Mix, Sugar Sweetened O...","Kool-Aid Flavored Drink Mix, Sugar Sweetened O...","[-0.009082681499421597, -0.05855241045355797, ..."
5,B000VA2ZFM,Prime_Pantry,"Ortega Green Chiles, Mild Whole Chiles, 4 Ounce","Ortega Green Chiles, Mild Whole Chiles, 4 Ounce","[0.010509694926440716, -0.024475159123539925, ..."
6,B016LO9BQW,Prime_Pantry,"NUTRO Senior Chicken, Whole Brown Rice and Oat...","NUTRO Senior Chicken, Whole Brown Rice and Oat...","[0.021382147446274757, -0.025307850912213326, ..."
7,B014CZ4ADG,Prime_Pantry,"Hime Seaweed Sushi Nori, 1 Ounce","Hime Seaweed Sushi Nori, 1 Ounce","[0.0018205390078946948, -0.04342801123857498, ..."
8,B010HKIP4K,Prime_Pantry,"KISSES Holiday Milk Chocolates, 36 Ounce","KISSES Holiday Milk Chocolates, 36 Ounce","[0.04144248738884926, -0.01052949856966734, -0..."
9,B01FUD39Y0,Prime_Pantry,Beanitos White Bean Sweet Chili &amp; Sour Cre...,Beanitos White Bean Sweet Chili &amp; Sour Cre...,"[0.052284348756074905, -0.007212603930383921, ..."


In [12]:
test_embeddings = detector.load_test(schema_name='test_datasets',
                                       table_name='sample_data_emb',
                                       uids=['asin','category','title'], 
                                       target_attribute='title',
                                       embeddings='text_embedding',
                                       exclude_scored='us-gcp-ame-con-52dbb-sbx-1.output.*')

In [13]:
odm = detector.train_single_model(train_embeddings)

In [14]:
odm.score_samples([create_embedding("Diet Pepsi, Bottles 16.9 Fl Oz, 6 Count")])

array([-0.43313723])

In [15]:
odm.score_samples([create_embedding("no title available")])

array([-0.53284379])

In [21]:
odm.score_samples([create_embedding("Meet Pixel 7a, the Tensor G2 chip makes it fast.\
                                         The Pixel Camera takes amazing photos and video. \
                                         VPN by One protects your online activity.\
                                         Pixel 7a features are rated highest in security.")])[0]

0.5349950401529593

In [17]:
results = detector.check_outliers(train_embeddings, test_embeddings, outlier_threshold=0.48)
results.head()

Unnamed: 0,asin,category,title,TARGET_ATTRIBUTE_catalog_value,TARGET_ATTRIBUTE_confidence_score,TARGET_ATTRIBUTE_model_value,TARGET_ATTRIBUTE_source_attributes
7,B00T7ATYWO,Prime_Pantry,Brass/Wood Harbor Master Telescope - Floor Sta...,Brass/Wood Harbor Master Telescope - Floor Sta...,0.496422,,
15,B010SI8YJW,Prime_Pantry,error 123842 unable to connect to database,error 123842 unable to connect to database,0.544201,,


#### **Write Outliers to BigQuery:**

In [18]:
dqe.write_dqresults(results, 
                uids=['asin','category','title'], 
                target_attribute='title', 
                function_name='outlier_detection', 
                error_id=30001, 
                error_code='Accuracy: flagged as outliers', 
                output_schema='output')