# SDC Marketing RAG

## Install Libs

In [1]:
# %pip install --upgrade --user --quiet google-cloud-aiplatform "langchain-google-vertexai" "langchain-google-community[featurestore]" pypdf==4.2.0
# %pip install PyPDF2

In [2]:
from dotenv import dotenv_values
config = dotenv_values("config/config.env")

In [3]:
PROJECT_ID = config["PROJECT_ID"]
LOCATION = config["LOCATION"]
# CloudStorage
BUCKET_NAME = config["BUCKET_NAME"]
BLOB_NAME = config["BLOB_NAME"]

# BigQuery
DATASET_ID = config["DATASET_ID"]
TABLE_ID = config["TABLE_ID"]

EMBEDDING_MODEL = config["EMBEDDING_MODEL"]




## Embedding

In [4]:
from langchain.chains import RetrievalQA
from langchain.globals import set_debug
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_community import BigQueryVectorStore, VertexFSVectorStore,GCSFileLoader
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings

from google.cloud import storage
# import PyPDF2
import io


In [5]:
def build_embedding_model(embedding_model, project_id):
    
    embedding_model = VertexAIEmbeddings(
        model_name=embedding_model, project=project_id
    )
    
    return embedding_model

## DataPrep PDF

In [6]:

# Initialize Cloud Storage client

loader = GCSFileLoader(
    project_name=PROJECT_ID, bucket=BUCKET_NAME, blob=BLOB_NAME
)

documents = loader.load()


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [7]:
# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
doc_splits = text_splitter.split_documents(documents)

# Add chunk number to metadata
for idx, split in enumerate(doc_splits):
    split.metadata["chunk"] = idx

print(f"# of documents = {len(doc_splits)}")

# of documents = 2


## Configure BigQueryVectorStore as Vector Store

In [8]:
embedding_model = build_embedding_model(EMBEDDING_MODEL, PROJECT_ID)

In [9]:
def create_embeddings_table():
    dataset_id = 'sdc_marketing' # has to be created in bQ in beforehand
    table_id = 'sdc_instagram_guideline'

    schema = [
        bigquery.SchemaField('document_id', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('text', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('embedding', 'FLOAT64', mode='REPEATED'),
    ]

    table_ref = bigquery_client.dataset(dataset_id).table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table, exists_ok=True)
    print(f"Created table {table.full_table_id}")

# create_embeddings_table()

In [10]:

bq_store = BigQueryVectorStore(
    project_id=PROJECT_ID,
    location=LOCATION,
    dataset_name=DATASET_ID,
    table_name=TABLE_ID,
    embedding=embedding_model,
)

BigQuery table sdc-gen-ai.sdc_marketing.sdc_instagram_guideline initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=sdc-gen-ai&ws=!1m5!1m4!4m3!1ssdc-gen-ai!2ssdc_marketing!3ssdc_instagram_guideline


In [11]:
doc_ids = bq_store.add_documents(doc_splits)

In [12]:

bq_store.similarity_search(
    "Welchen Zweck hat das Posting?"
)

[Document(metadata={'doc_id': 'd4c9ad6aea37472a8c4be8097d196eda', 'source': 'gs://sdc_marketing/Skizze_Leitfaden_Instagram_Postings.pdf', 'chunk': 0, 'score': 0.7298426867676306}, page_content='Skizze: Leitfaden Instagram Postings\n\n1. Welchen Zweck hat das Posting? Überleg dir klare Ziele für das Posting 2. Welches Posting-Format (Reel, Story, Beitrag) wählst du? Habe das zugehörige Video-Format im Hinterkopf (s.u. “Technische Hintergründe”) 3. Werte und Ziele - Check! Überprüfe ob dein Vorhaben im Einklang steht: 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. 21. 22. 23.\n\n4. Wie bindest du die Community mit ein? Umfragen, Countdown-Sticker, zum Kommentieren auffordern (z.B. heute haben wir was zu python gelernt….was ist eure liebste Programmiersprache?),... 5. Welche Hashtags passen? Liste Hashtags: 6. Gestaltungsrichtlinen eingehalten?\n\nSchriftart: - Sprache (formell /Informell) - Tonalität und Hintergrundmusik - Farbschema: - - Postingstil (minimalistisch, bunt,

In [13]:
langchain_retriever = bq_store.as_retriever()

In [17]:

# # Set high verbosity
# set_debug(True)

# llm = VertexAI(model_name="gemini-1.5-flash-002")

# search_query = "What is the main topic discussed in the documents?"  # @param {type:"string"}

# retrieval_qa = RetrievalQA.from_chain_type(
#     llm=llm, chain_type="stuff", retriever=langchain_retriever
# )
# response = retrieval_qa.invoke(search_query)
# print("\n################ Final Answer ################\n")
# print(response["result"])
     

In [None]:
# Import necessary modules
# from langchain.callbacks import set_verbose
# set_verbose(True)

from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
# from langchain.llms import VertexAI
from langchain.memory import ConversationBufferMemory
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings

# Define your custom prompt
custom_prompt = PromptTemplate(
    input_variables=["context", "question", "chat_history"],
    template="""
    You are an AI assistant with the following characteristics:
    - Expertise in data science and machine learning.
    - Ability to provide clear and concise explanations.
    - Use bullet points when listing items.

    Conversation History:
    {chat_history}

    Context:
    {context}

    The user has asked the following question:
    {question}

    Instructions:
    - Provide a step-by-step answer.
    - Reference the context when relevant.
    - Do not include any irrelevant information.

    Answer:
    """
)

# Initialize the LLM
llm = VertexAI(model_name="gemini-1.5-flash-002")

# Set up the conversation memory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True
)

# Set up the ConversationalRetrievalChain with the custom prompt
conversational_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=langchain_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": custom_prompt}
)

# Function to interact with the agent
def chat_with_agent():
    print("Start chatting with the agent (type 'exit' to stop):")
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() == "exit":
            break
        response = conversational_chain({"question": user_input})
        print(f"\nAssistant: {response['answer']}")

# Start the chat
chat_with_agent()


Start chatting with the agent (type 'exit' to stop):



You:  Welchen Zweck hat das Posting?


  response = conversational_chain({"question": user_input})


[32;1m[1;3m[chain/start][0m [1m[chain:ConversationalRetrievalChain] Entering Chain run with input:
[0m{
  "question": "Welchen Zweck hat das Posting?",
  "chat_history": []
}
[32;1m[1;3m[chain/start][0m [1m[chain:ConversationalRetrievalChain > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:ConversationalRetrievalChain > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Welchen Zweck hat das Posting?",
  "chat_history": "",
  "context": "Skizze: Leitfaden Instagram Postings\n\n1. Welchen Zweck hat das Posting? Überleg dir klare Ziele für das Posting 2. Welches Posting-Format (Reel, Story, Beitrag) wählst du? Habe das zugehörige Video-Format im Hinterkopf (s.u. “Technische Hintergründe”) 3. Werte und Ziele - Check! Überprüfe ob dein Vorhaben im Einklang steht: 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. 21. 22. 23.\n\n4. Wie bindest du die Community m