In [307]:
import numpy as np
import pandas as pd

In [160]:
import os
import re
from dotenv import load_dotenv
import openai
# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# Look up the API key in the environment and build the shared OpenAI client.
OPENAI_KEY = os.environ.get('OPENAI_API_KEY')
client = openai.OpenAI(api_key=OPENAI_KEY)

In [161]:
import lancedb
from lancedb.pydantic import LanceModel, Vector

In [230]:
# Generate embeddings for a collection of texts using the OpenAI embeddings API.
def generate_embeddings(texts, model="text-embedding-3-small", batch_size=100):
    """Embed every text in ``texts`` and return the vectors in input order.

    Texts are sent in batches (the embeddings endpoint accepts a list of
    inputs) instead of one request per text, which cuts the number of
    round-trips without changing the result.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the OpenAI embedding model to use.
        batch_size: Maximum number of texts sent in a single request.

    Returns:
        A list with one embedding (list of floats) per input text; an empty
        list when ``texts`` is empty (no request is made).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)  # allow generators and make slicing possible
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the index of the input it belongs to;
        # sort on it so the output order always matches the input order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    return embeddings

In [252]:
def generate_query_embeddings(query):
    """Embed a single query string and return its embedding vector.

    The query is embedded through ``generate_embeddings`` so that it always
    uses the same model as the stored chunks. Embedding the query with a
    different model than the documents places the two in unrelated vector
    spaces, which makes cosine similarity between them meaningless.

    Args:
        query: The query text to embed.

    Returns:
        The embedding of ``query`` as returned by ``generate_embeddings``.
    """
    return generate_embeddings([query])[0]

In [231]:
# Connect to the LanceDB database.
# "mydb" is a relative location, so it is resolved against the kernel's working directory.
db = lancedb.connect("mydb")

class KnowledgeBase(LanceModel):
    """Schema of one row in the "KnowledgeBase" table: metadata about a knowledge base."""
    kb_id: str  # Unique knowledge base ID
    name: str  # Knowledge base name
    description: str  # Free-text description (declared as a plain `str`, not Optional)
    model: str  # Embedding model used

class Chunk(LanceModel):
    """Schema of one row in the "Chunk" table: a piece of text plus its embedding."""
    chunk_id: str  # Unique chunk ID
    kb_id: str  # ID of the `KnowledgeBase` row this chunk belongs to
    text: str  # Chunked text
    vector: Vector(1536)  # Embedding vector with a fixed width of 1536

In [232]:
# WARNING: destructive. This drops every table in the connected database, so
# anything stored earlier is gone each time this cell is run.
db.drop_all_tables()

In [233]:
# Make sure both tables exist, creating whichever one is missing.
for _table_name, _table_schema in (("KnowledgeBase", KnowledgeBase), ("Chunk", Chunk)):
    if _table_name not in db.table_names():
        table = db.create_table(_table_name, schema=_table_schema)

In [1]:
from unstructured.partition.auto import partition

In [317]:
# Partition the plain-text sample file into a list of elements.
# NOTE(review): absolute, machine-specific Windows path; this cell only runs where that exact path exists.
elements_txt = partition(r"D:\Programming\Python\AI\Basics\AMNIL Tech\Chat With Docs\better_app_test\data.txt")

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


In [316]:
# Preview: stringify every element of the text file and join them with blank lines.
"\n\n".join([str(el) for el in elements_txt])

'Business intelligence (BI) consists of strategies, methodologies, and technologies used by enterprises for data analysis and management of business information.[1] Common functions of BI technologies include reporting, online analytical processing, analytics, dashboard development, data mining, process mining, complex event processing, business performance management, benchmarking, text mining, predictive analytics, and prescriptive analytics.\n\nBI tools can handle large amounts of structured and sometimes unstructured data to help organizations identify, develop, and otherwise create new strategic business opportunities. They aim to allow for the easy interpretation of these big data. Identifying new opportunities and implementing an effective strategy based on insights is assumed to potentially provide businesses with a competitive market advantage and long-term stability, and help them take strategic decisions.[2]\n\nBusiness intelligence can be used by enterprises to support a wi

In [2]:
# Partition a PNG image. The recorded output of this cell is a
# TesseractNotFoundError, i.e. it failed on the machine where it was last run.
# NOTE(review): absolute, machine-specific Windows path.
elements_img = partition(r"D:\Programming\Python\AI\Basics\AMNIL Tech\Chat With Docs\better_app_test\test_img.png")

  from .autonotebook import tqdm as notebook_tqdm


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [309]:
# Partition the PDF sample into a list of elements.
# NOTE(review): absolute, machine-specific Windows path.
elements = partition(r"D:\Programming\Python\AI\Basics\AMNIL Tech\Chat With Docs\better_app_test\energy.pdf")

In [None]:
# Preview of the image elements. Needs `elements_img`, whose defining cell has a
# recorded TesseractNotFoundError; this cell has no execution count.
"\n\n".join([str(el) for el in elements_img])

In [312]:
# Join the PDF elements into one string, separated by blank lines.
# NOTE(review): `document` is not referenced by the later cells visible here;
# the chunking cell reads 'data.txt' instead.
document = "\n\n".join([str(el) for el in elements])

In [234]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create an instance of RecursiveCharacterTextSplitter
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Define the chunk size
    chunk_overlap=100,  # Define the overlap size
    length_function=len  # Defines how length is measured
)

# Read with an explicit encoding: without it, open() falls back to the
# platform's locale encoding, so the same file can decode differently (or fail
# to decode) from one machine to another.
with open('data.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Split the document into chunks
chunks = recursive_splitter.split_text(data)

In [235]:
# Embed every chunk; this goes through generate_embeddings, which calls the OpenAI API.
embedded_chunks = generate_embeddings(chunks)

In [236]:
# Sanity check: shape is (number of chunks, embedding width).
np.array(embedded_chunks).shape

(4, 1536)

In [237]:
import uuid

In [238]:
# Scratch check of the lookup that insert_chunks and get_kb_id perform:
# (re)build the full-text index on `name`, then search for the "DWBI" knowledge base.
# The recorded output is an empty list, i.e. no match at the time this cell ran.
table = db.open_table("KnowledgeBase")
table.create_fts_index("name", use_tantivy=False,replace=True)
table.search("DWBI",vector_column_name='name').select(["kb_id"]).to_list()

[]

In [239]:
def insert_chunks(chunks, embedded_chunks, KB_NAME, model, description=None):
    """Create a knowledge base and store its chunks with their embeddings.

    If a full-text search on the knowledge base name already returns a hit,
    a message is printed and nothing is written.

    Args:
        chunks: Sequence of chunk texts.
        embedded_chunks: Sequence of embeddings, one per chunk, in the same order.
        KB_NAME: Name of the knowledge base to create.
        model: Name of the embedding model, stored in the knowledge base row.
        description: Description stored with the knowledge base. Defaults to
            "A knowledge base for <KB_NAME>".

    Raises:
        ValueError: If ``chunks`` and ``embedded_chunks`` differ in length, or
            if an embedding is wider than the table's vector column.
    """
    vector_dim = 1536  # must match the width of Chunk.vector

    chunks = list(chunks)
    embedded_chunks = list(embedded_chunks)
    if len(chunks) != len(embedded_chunks):
        # zip() would silently drop the surplus items otherwise.
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embedded_chunks)} embeddings."
        )

    kb_table = db.open_table('KnowledgeBase')
    # Rebuild the index so rows added since the last build are searchable.
    kb_table.create_fts_index("name", use_tantivy=False, replace=True)
    existing_kb = kb_table.search(KB_NAME, vector_column_name='name').select(["kb_id"]).to_list()
    if existing_kb:
        print(f"Knowledge base '{KB_NAME}' already exists.")
        return

    if description is None:
        description = f"A knowledge base for {KB_NAME}"

    kb_id = str(uuid.uuid4())

    # Build (and thereby validate) every chunk row before writing anything, so
    # that a bad embedding cannot leave a knowledge base row without chunks.
    rows = []
    for chunk, embedding in zip(chunks, embedded_chunks):
        if len(embedding) > vector_dim:
            raise ValueError(
                f"Embedding has {len(embedding)} values; at most {vector_dim} are supported."
            )
        # Shorter embeddings are right-padded with zeros to the column width.
        padded_vector = np.pad(embedding, (0, vector_dim - len(embedding)), 'constant', constant_values=0)
        rows.append(Chunk(chunk_id=str(uuid.uuid4()), kb_id=kb_id, text=chunk, vector=padded_vector))

    # Create a new knowledge base
    kb_table.add([KnowledgeBase(
        kb_id=kb_id,
        name=KB_NAME,
        description=description,
        model=model
    )])

    # Save all chunks and their vectors into LanceDB in a single write.
    if rows:
        db.open_table('Chunk').add(rows)

In [240]:
# Record the model that actually produced `embedded_chunks`: generate_embeddings
# uses "text-embedding-3-small", so that is what the knowledge base row must say.
insert_chunks(chunks, embedded_chunks, KB_NAME='DWBI', model="text-embedding-3-small")

## Querying the Knowledge Base

In [284]:
def get_kb_id(KB_NAME):
    """Look up the ID of the knowledge base called ``KB_NAME``.

    The lookup is a full-text search on the ``name`` column, which can return
    rows whose name merely shares a term with ``KB_NAME``. A row whose name is
    exactly ``KB_NAME`` is therefore preferred; if there is none, the first
    hit is returned.

    Args:
        KB_NAME: Name of the knowledge base.

    Returns:
        The ``kb_id`` string, or ``None`` when nothing matches or the lookup
        itself fails (best-effort: lookup errors are not propagated).
    """
    try:
        table = db.open_table("KnowledgeBase")
        # Rebuild the index so rows added since the last build are searchable.
        table.create_fts_index("name", use_tantivy=False, replace=True)
        hits = table.search(KB_NAME, vector_column_name='name').select(["kb_id", "name"]).to_list()
    except Exception:
        return None

    if not hits:
        return None

    for hit in hits:
        if hit.get("name") == KB_NAME:
            return hit["kb_id"]
    # No exact match: fall back to the first hit.
    return hits[0]["kb_id"]

In [285]:
def retrieve_KB(KB_NAME):
    """Fetch all stored chunks of the knowledge base called ``KB_NAME``.

    Args:
        KB_NAME: Name of the knowledge base.

    Returns:
        A DataFrame with the columns ``chunk_id``, ``kb_id``, ``text`` and
        ``vector`` plus whatever score column the search adds, or ``None``
        when the knowledge base is not found.
    """
    kb_id = get_kb_id(KB_NAME)
    if kb_id is None:
        return None
    kb_table = db.open_table("Chunk")
    # Rebuild the index so rows added since the last build are searchable.
    kb_table.create_fts_index("kb_id", use_tantivy=False, replace=True)
    # Ask for as many rows as the table holds. Without an explicit limit the
    # query falls back to LanceDB's default result limit, which would silently
    # truncate larger knowledge bases.
    row_cap = max(kb_table.count_rows(), 1)
    chunk_df = (
        kb_table.search(kb_id, vector_column_name='kb_id')
        .select(["chunk_id", "kb_id", "text", "vector"])
        .limit(row_cap)
        .to_pandas()
    )
    # Full-text search matches on terms, so keep only rows whose kb_id is
    # exactly the requested one.
    return chunk_df[chunk_df["kb_id"] == kb_id].reset_index(drop=True)

In [286]:
def calculate_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (any 1-D array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero length the
        result is ``0.0`` rather than NaN (a division by zero), so callers can
        compare and rank scores safely.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [298]:
def generate_prompt(query, chunk_df, top_quantile=0.8):
    """Build the prompt for ``query`` from the most similar chunks.

    Each chunk is scored by cosine similarity between its ``vector`` and the
    embedded query. Chunks scoring strictly above the ``top_quantile``
    quantile of all scores form the context. If that selection is empty
    (a single chunk, or tied scores), the single best-scoring chunk is used
    so that the context is not empty while chunks exist.

    The caller's ``chunk_df`` is not modified.

    Args:
        query: The user's question.
        chunk_df: DataFrame with at least the columns ``text`` and ``vector``.
        top_quantile: Quantile of the similarity scores used as the cut-off.

    Returns:
        The prompt string containing the question and the selected context.
    """
    context = ''
    if len(chunk_df) > 0:
        embedded_query = generate_query_embeddings(query)
        # Address the embedding column by name rather than by position, so the
        # result does not depend on the column order of the incoming frame.
        scored = chunk_df.assign(
            similarity=[calculate_similarity(embedded_query, vec) for vec in chunk_df['vector']]
        )
        cutoff = scored['similarity'].quantile(top_quantile)
        df_top = scored[scored['similarity'] > cutoff]
        if df_top.empty:
            df_top = scored.nlargest(1, 'similarity')
        context = ''.join(f"{txt}\n" for txt in df_top['text'])
    prompt = f"""
    "QUESTION:" {query}\n
    "CONTEXT:" {context}
    """
    return prompt

In [299]:
def generate_response(prompt, model="gpt-4o-mini", max_tokens=150, temperature=0.5):
    """Send ``prompt`` to the chat completions API and return the raw response.

    Args:
        prompt: User message, expected to contain the "QUESTION:" and
            "CONTEXT:" sections that the system message refers to.
        model: Chat model to use.
        max_tokens: Upper bound on the length of the generated answer; answers
            that need more tokens are cut off.
        temperature: Sampling temperature (randomness of the answer).

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    preamble = 'You are a friendly bot. Don\'t ask for extra context. Read the "QUESTION:" and reply. If the question demands some information, answer the question provided as "QUESTION:" using the context provided as "CONTEXT:" If the answer is not present, say you don\'t know.'
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'system', 'content': preamble}, {'role': 'user', 'content': prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response

In [300]:
def respond(KB_NAME, query):
    """Answer ``query`` using the chunks stored for knowledge base ``KB_NAME``.

    Args:
        KB_NAME: Name of the knowledge base to consult.
        query: The user's question.

    Returns:
        The generated answer text, or 'No Knowledge base found.' when the
        knowledge base does not exist or holds no chunks.
    """
    chunk_df = retrieve_KB(KB_NAME)
    if chunk_df is None or chunk_df.empty:
        return 'No Knowledge base found.'
    # Drop the search score if it is present. errors='ignore' avoids a KeyError
    # when the frame has no such column, and the non-inplace form leaves the
    # frame returned by retrieve_KB untouched.
    chunk_df = chunk_df.drop(columns=['_score'], errors='ignore')
    prompt = generate_prompt(query, chunk_df)
    response = generate_response(prompt)
    generated_text = response.choices[0].message.content
    return generated_text

In [301]:
respond('DWBI',"Where do BI applications get their data?")

'BI applications get their data from large amounts of structured and sometimes unstructured data.'