# SDC SocialMedia Assist RAG + Embedding

## Import Libraries and Load Configuration

In [1]:
import io

from dotenv import dotenv_values
from google.cloud import bigquery
from google.cloud import storage
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.globals import set_debug
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_community import BigQueryVectorStore, VertexFSVectorStore, GCSFileLoader
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings

# Read the notebook settings (project, storage, BigQuery and model names) from the env file.
# The path is relative, so it resolves against the kernel's current working directory.
config = dotenv_values("the-llm-library/RAG-Use-Cases/RAG-on-GCP/config/config.env")

In [2]:
# Google Cloud project and location
PROJECT_ID, LOCATION = config["PROJECT_ID"], config["LOCATION"]

# Cloud Storage: bucket and blob of the source document
BUCKET_NAME, BLOB_NAME = config["BUCKET_NAME"], config["BLOB_NAME"]

# BigQuery: dataset and table backing the vector store
DATASET_ID, TABLE_ID = config["DATASET_ID"], config["TABLE_ID"]

# Name of the embedding model
EMBEDDING_MODEL = config["EMBEDDING_MODEL"]

## Embedding

In [3]:
def build_embedding_model(embedding_model, project_id):
    """Construct the Vertex AI embeddings object for the given model and project.

    Args:
        embedding_model: Model name, forwarded as ``model_name``.
        project_id: Google Cloud project, forwarded as ``project``.

    Returns:
        The constructed ``VertexAIEmbeddings`` instance.
    """
    return VertexAIEmbeddings(model_name=embedding_model, project=project_id)

## DataPrep PDF

In [4]:

# Set up a loader for the configured bucket/blob and read its documents
loader = GCSFileLoader(
    project_name=PROJECT_ID,
    bucket=BUCKET_NAME,
    blob=BLOB_NAME,
)

documents = loader.load()


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [5]:
# Break the loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_size=1000,
    chunk_overlap=50,
)
doc_splits = text_splitter.split_documents(documents)

# Store each chunk's running index in its metadata under "chunk"
for chunk_number, chunk in enumerate(doc_splits):
    chunk.metadata["chunk"] = chunk_number

print(f"# of documents = {len(doc_splits)}")

# of documents = 2


## Configure BigQueryVectorStore as Vector Store

In [6]:
# Embedding model instance handed to the vector store
embedding_model = build_embedding_model(
    embedding_model=EMBEDDING_MODEL,
    project_id=PROJECT_ID,
)

In [7]:
def create_embeddings_table(
    dataset_id='sdc_marketing',
    table_id='sdc_instagram_guideline',
    client=None,
):
    """Create the BigQuery table for document embeddings unless it already exists.

    Args:
        dataset_id: Dataset that will hold the table. The dataset is not
            created here; it has to exist in BigQuery beforehand.
        table_id: Name of the table to create.
        client: ``bigquery.Client`` to use. When omitted, a client for
            ``PROJECT_ID`` is created.

    Returns:
        The table object returned by ``create_table``.
    """
    if client is None:
        client = bigquery.Client(project=PROJECT_ID)

    # NOTE(review): confirm these column names match the fields the vector
    # store is configured to read and write.
    schema = [
        bigquery.SchemaField('document_id', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('text', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('embedding', 'FLOAT64', mode='REPEATED'),
    ]

    # Build the reference explicitly rather than through the deprecated
    # Client.dataset() helper.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)
    # exists_ok=True makes this safe to re-run when the table is already there.
    table = client.create_table(bigquery.Table(table_ref, schema=schema), exists_ok=True)
    print(f"Created table {table.full_table_id}")
    return table

# create_embeddings_table()

In [8]:

# Vector store on the configured BigQuery dataset/table, using the embedding model built above
bq_store = BigQueryVectorStore(
    embedding=embedding_model,
    project_id=PROJECT_ID,
    location=LOCATION,
    dataset_name=DATASET_ID,
    table_name=TABLE_ID,
)

BigQuery table sdc-gen-ai.sdc_marketing.sdc_instagram_guideline initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=sdc-gen-ai&ws=!1m5!1m4!4m3!1ssdc-gen-ai!2ssdc_marketing!3ssdc_instagram_guideline


In [9]:
# Add the chunked documents to the BigQuery vector store and keep the returned value.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm and guard if so.
doc_ids = bq_store.add_documents(doc_splits)

In [10]:

# bq_store.similarity_search(
#     "Welchen Zweck hat das Posting?"
# )

In [11]:
# Retriever view of the vector store; passed to the conversational chain below
langchain_retriever = bq_store.as_retriever()

In [12]:
def build_system_prompt():
    """Build the German-language prompt template for the assistant.

    Returns:
        A ``PromptTemplate`` with the placeholders ``context``, ``question``
        and ``chat_history``.
    """
    # Kept verbatim (including indentation): this text is sent to the model.
    prompt_text = """
        Du bist ein hilfsbereiter KI-Assistent für unser Unternehmen, spezialisiert auf Social-Media-Strategien und die Einarbeitung neuer Teammitglieder. 
        Nutze das vorhandene Wissen aus dem Kontext, um präzise und nützliche Antworten zu liefern.

        Gesprächsverlauf:
        {chat_history}

        Kontext:
        {context}

        Frage:
        {question}

        Bitte beachte bei deiner Antwort:
        - Gehe spezifisch auf die Bedürfnisse des Nutzers ein.
        - Nur auf Nachfrage! Biete praktische Tipps für Social-Media-Posts im Unternehmenskontext.
        - Unterstütze neue Kollegen mit klaren Anweisungen und Ressourcen.
        - Verwende eine positive und motivierende Sprache.
        - Halte dich kurze und beantworte im ersten schritt erst mal nur die Nutzer-Frage

        Antwort:
        """
    return PromptTemplate(
        template=prompt_text,
        input_variables=["context", "question", "chat_history"],
    )
custom_prompt = build_system_prompt()

# Language model that generates the answers
llm = VertexAI(model_name="gemini-1.5-flash-002")

# Conversation buffer stored under "chat_history", reading "question" and writing "answer"
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    verbose=True
)

# Conversational retrieval chain; the custom prompt is handed to the combine-docs step
conversational_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=memory,
    retriever=langchain_retriever,
    combine_docs_chain_kwargs={"prompt": custom_prompt}
)

# Function to interact with the agent
def chat_with_agent(chain=None):
    """Run an interactive question/answer loop on stdin/stdout.

    The loop ends when the user types ``exit`` (any case, surrounding
    whitespace ignored) or when input is closed or interrupted. Blank input
    is skipped instead of being sent to the model.

    Args:
        chain: Object whose ``invoke({"question": ...})`` returns a mapping
            with an ``"answer"`` key. Defaults to the module-level
            ``conversational_chain``.
    """
    if chain is None:
        chain = conversational_chain

    print("Start chatting with the agent (type 'exit' to stop):")
    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the chat
            # instead of surfacing a traceback.
            break
        if not user_input:
            # Nothing to ask; prompt again rather than query with an empty question.
            continue
        if user_input.lower() == "exit":
            break
        response = chain.invoke({"question": user_input})
        print(f"\nAssistant: {response['answer']}")

# Start the chat; this cell waits on input() until the user types 'exit'
chat_with_agent()


  memory = ConversationBufferMemory(


Start chatting with the agent (type 'exit' to stop):



You:  exit
