# SDC SocialMedia Assist Preprocessing & Embedding

## Import Libs 

In [9]:
from langchain_google_community import BigQueryVectorStore, VertexFSVectorStore,GCSFileLoader
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.cloud import storage
from dotenv import dotenv_values

# Settings are read from config/config.env; the parameter cell below looks
# up the project, storage, BigQuery and embedding-model values by key.
config = dotenv_values("config/config.env")

## Loading Params

In [2]:
# Every value below is looked up with config[...], so a key that is absent
# from the loaded config raises KeyError in this cell.

# Google Cloud project and location (passed to the embedding model, the GCS
# loader and the BigQuery vector store further down)
PROJECT_ID = config["PROJECT_ID"]
LOCATION = config["LOCATION"]
# Cloud Storage: bucket and blob of the source document
BUCKET_NAME = config["BUCKET_NAME"]
BLOB_NAME = config["BLOB_NAME"]

# BigQuery: dataset and table used by the vector store
DATASET_ID = config["DATASET_ID"]
TABLE_ID = config["TABLE_ID"]

# Model name handed to VertexAIEmbeddings
EMBEDDING_MODEL = config["EMBEDDING_MODEL"]

## Embedding & Preprocessing

In [3]:
def build_embedding_model(embedding_model, project_id):
    """Create the Vertex AI embeddings client used to vectorize text.

    Args:
        embedding_model: Name of the embedding model, forwarded as
            ``model_name``.
        project_id: Google Cloud project ID, forwarded as ``project``.

    Returns:
        The constructed ``VertexAIEmbeddings`` instance.
    """
    return VertexAIEmbeddings(model_name=embedding_model, project=project_id)

In [4]:
# Thin wrapper that loads a document from a GCS bucket
def document_loader(project_id, bucket_name, blob_name):
    """Load a single blob from Cloud Storage through ``GCSFileLoader``.

    Args:
        project_id: Google Cloud project ID, passed as ``project_name``.
        bucket_name: Bucket name, passed as ``bucket``.
        blob_name: Object path inside the bucket, passed as ``blob``.

    Returns:
        Whatever ``GCSFileLoader.load()`` returns for that blob.
    """
    gcs_loader = GCSFileLoader(
        project_name=project_id,
        bucket=bucket_name,
        blob=blob_name,
    )
    return gcs_loader.load()


In [5]:
def text_splitter(documents, chunk_size=1000, chunk_overlap=50):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split, handed to ``split_documents``.
        chunk_size: Chunk size forwarded to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value this notebook has always used.
        chunk_overlap: Overlap between chunks, forwarded to the splitter.
            Defaults to 50.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Named `splitter` so the local does not shadow this function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Listed from paragraph breaks down to single characters.
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    return splitter.split_documents(documents)

In [6]:
# Only necessary if you want to define the table schema yourself

# def create_embeddings_table():
#     dataset_id = 'sdc_marketing' # has to be created in BigQuery beforehand
#     table_id = 'sdc_instagram_guideline'

#     schema = [
#         bigquery.SchemaField('document_id', 'STRING', mode='REQUIRED'),
#         bigquery.SchemaField('text', 'STRING', mode='REQUIRED'),
#         bigquery.SchemaField('embedding', 'FLOAT64', mode='REPEATED'),
#     ]

#     table_ref = bigquery_client.dataset(dataset_id).table(table_id)
#     table = bigquery.Table(table_ref, schema=schema)
#     table = bigquery_client.create_table(table, exists_ok=True)
#     print(f"Created table {table.full_table_id}")

# create_embeddings_table()

In [7]:
def initialize_bqvector(project_id, location, dataset_id, table_id, embedding_model):
    """Create the BigQuery-backed vector store and return it.

    Args:
        project_id: Google Cloud project ID.
        location: Location passed to the store as ``location``.
        dataset_id: BigQuery dataset, passed as ``dataset_name``.
        table_id: BigQuery table, passed as ``table_name``.
        embedding_model: Embedding model instance, passed as ``embedding``.

    Returns:
        The constructed ``BigQueryVectorStore``.
    """
    bq_store = BigQueryVectorStore(
        project_id=project_id,
        location=location,
        dataset_name=dataset_id,
        table_name=table_id,
        embedding=embedding_model,
    )
    # Without this return the caller's `bq_store = initialize_bqvector(...)`
    # would bind None, and any later method call on it would fail.
    return bq_store

## Run functions

In [10]:
# End-to-end run: build the embedding model, load the source document from
# GCS, split it into chunks, tag the chunks, then set up the BigQuery store.
print("01 - Initialize embedding model")
embedding_model = build_embedding_model(EMBEDDING_MODEL, PROJECT_ID)

print("02 - Loads documents from GCS")
documents = document_loader(PROJECT_ID, BUCKET_NAME, BLOB_NAME)

print("03 - Prep Docs")
chunks = text_splitter(documents)

# Tag every chunk with its position in the split order. This mutates the
# chunk metadata in place.
print("Add chunk number to metadata")
for idx, split in enumerate(chunks):
    split.metadata["chunk"] = idx

# The count printed here is the number of chunks, not of source documents.
print(f"# of documents = {len(chunks)}")

# Only necessary if you want to define the table schema yourself
# create_embeddings_table()

print("04 - Initialize BQVector Storage")
bq_store = initialize_bqvector(PROJECT_ID,LOCATION,DATASET_ID,TABLE_ID,embedding_model)

# Step 05 is disabled: uncomment the two lines below to write the chunks
# to the vector store.
# print("05 - Store Chunks to BQVector")
# bq_store.add_documents(chunks)

print("Done :)")

Initialize embedding model
Loads documents from GCS
Prep Docs
Add chunk number to metadata
# of documents = 2
Initialize BQVector Storage
BigQuery table sdc-gen-ai.sdc_marketing.sdc_instagram_guideline initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=sdc-gen-ai&ws=!1m5!1m4!4m3!1ssdc-gen-ai!2ssdc_marketing!3ssdc_instagram_guideline
Store Chunks to BQVector


In [None]:
# Test
# bq_store.similarity_search(
#     "Welchen Zweck hat das Posting?"
# )