In [None]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("archive/Employee_Handbook.pdf")
pages = loader.load_and_split()
pages = pages[4:]  
text = "\n".join([doc.page_content for doc in pages])

In [None]:
print(text)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)
docs = text_splitter.create_documents([text])
print(docs)
for i, d in enumerate(docs):
    d.metadata = {"doc_id": i}

In [None]:
from dotenv import load_dotenv
import os
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
print(API_KEY)

In [None]:
import google.generativeai as genai

genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")
# response = model.generate_content("Explain how AI works")
# print(response.text)

In [12]:
import os
import google.generativeai as genai
import pandas as pd

os.environ['GEMINI_API_KEY'] = API_KEY
def get_embeddings(text):
   
    model = 'models/embedding-001'
    
    embedding = genai.embed_content(model=model,
                                    content=text,
                                    task_type="retrieval_document")
    return embedding['embedding']

content_list = [doc.page_content for doc in docs]

embeddings = [get_embeddings(content) for content in content_list]


dataframe = pd.DataFrame({
    'page_content': content_list,
    'embeddings': embeddings
})

In [14]:
import clickhouse_connect
client = clickhouse_connect.get_client(
      host='msc-0a72ba97.us-east-1.aws.myscale.com',
      port=443,
      username='suprio85_org_default',
      password='passwd_rdoILGlykRQnm4'
  )
  

In [16]:
print(client)

<clickhouse_connect.driver.httpclient.HttpClient object at 0x00000175A8180740>


In [17]:
# Create a table with the name 'handbook'
client.command("""
    CREATE TABLE default.handbook (
        id Int64,
        page_content String,
        embeddings Array(Float32),
        CONSTRAINT check_data_length CHECK length(embeddings) = 768
    ) ENGINE = MergeTree()
    ORDER BY id
""")

# The CONSTRAINT will ensure that the length of each embedding vector is 768

# Insert the data in batches
batch_size = 10
num_batches = len(dataframe) // batch_size
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    batch_data = dataframe[start_idx:end_idx]
    # Insert the data
    client.insert("default.handbook", batch_data.to_records(index=False).tolist(), column_names=batch_data.columns.tolist())
    print(f"Batch {i+1}/{num_batches} inserted.")
# Create a vector index for a quick retrieval of data
client.command("""
ALTER TABLE default.handbook
    ADD VECTOR INDEX vector_index embeddings
    TYPE MSTG
""")

Batch 1/19 inserted.
Batch 2/19 inserted.
Batch 3/19 inserted.
Batch 4/19 inserted.
Batch 5/19 inserted.
Batch 6/19 inserted.
Batch 7/19 inserted.
Batch 8/19 inserted.
Batch 9/19 inserted.
Batch 10/19 inserted.
Batch 11/19 inserted.
Batch 12/19 inserted.
Batch 13/19 inserted.
Batch 14/19 inserted.
Batch 15/19 inserted.
Batch 16/19 inserted.
Batch 17/19 inserted.
Batch 18/19 inserted.
Batch 19/19 inserted.


['0', 'chi-msc-0a72ba97-msc-0a72ba97-0-0', 'OK', '0', '0']

In [19]:
def get_relevant_docs(user_query):
    query_embeddings = get_embeddings(user_query)
    results = client.query(f"""
        SELECT page_content,
        distance(embeddings, {query_embeddings}) as dist FROM default.handbook ORDER BY dist LIMIT 5
    """)
    relevant_docs = []
    for row in results.named_results():
        relevant_docs.append(row['page_content'])
    return relevant_docs

In [20]:
def make_rag_prompt(query, relevant_passage):
    relevant_passage = ' '.join(relevant_passage)
    prompt = (
        f"You are a helpful and informative chatbot that answers questions using text from the reference passage included below. "
        f"Respond in a complete sentence and make sure that your response is easy to understand for everyone. "
        f"Maintain a friendly and conversational tone. If the passage is irrelevant, feel free to ignore it.\n\n"
        f"QUESTION: '{query}'\n"
        f"PASSAGE: '{relevant_passage}'\n\n"
        f"ANSWER:"
    )
    return prompt



def generate_response(user_prompt):
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(user_prompt)
    return answer.text

def generate_answer(query):
    relevant_text = get_relevant_docs(query)
    text = " ".join(relevant_text)
    prompt = make_rag_prompt(query, relevant_passage=relevant_text)
    answer = generate_response(prompt)
    return answer
answer = generate_answer(query="what is the Work Dress Code?")
print(answer)

The standard work dress code is business casual, which includes conservative and nice clothing like button-down shirts, trousers, blouses, sport coats, and skirts that are knee-length or longer. A tie is not necessary. Every Friday and Saturday employees can dress casually, but their clothes must still be neat and conservative.


In [21]:
answer = generate_answer(query="what is the office hours?")
print(answer)

The office working hours are as follows: 1) 9:00 am to 5:45 pm or 2) 9:30 am to 6:15 pm.
