In [9]:
# Import dependencies
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from lib.core import DataLoader, Config, Logger
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging.config
import logging.handlers

# Read the project configuration once and keep the API key at module level,
# where prompt() picks it up when it builds the chat model.
CONFIG_PATH = '../parameters/config.json'
config = Config(CONFIG_PATH)
api_key = config.get_api_key()

In [None]:

# Build the project logger from the ini file; the helpers below log through it.
logger = Logger('../parameters/logs.ini').get_logger()

# Load the source PDF into a collection of documents.
loader = DataLoader(file_path='../data/TroutStocking.pdf')
data = loader.load()

# Pull the raw text out of every loaded document.  Failures are reported
# through the project logger (the same one the rest of the file uses, rather
# than the bare `logging` module), and `text_contents` falls back to an empty
# list so that it is always defined for the code that follows.
try:
    text_contents = [doc.page_content for doc in data]
except Exception as e:
    logger.error(f"Error processing document contents: {e}")
    text_contents = []


def chunk_data(data, chunk_size=256, chunk_overlap=20):
    """Split the given documents into overlapping text chunks.

    Args:
        data: Iterable of documents exposing ``page_content``; plain strings
            are accepted as well and used as-is.
        chunk_size: Maximum size of a chunk, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured the
            same way.

    Returns:
        The chunk documents produced by the text splitter, or ``None`` when
        splitting fails (the error is logged, not raised).
    """
    try:
        # Work on the argument that was actually passed in instead of the
        # module-level ``text_contents`` list, so the function honours its
        # own signature and can be reused for other inputs.
        texts = [
            item if isinstance(item, str) else item.page_content
            for item in data
        ]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        chunks = text_splitter.create_documents(texts)
        # Log through the project logger, like the other helpers in this file.
        logger.info(f"Data chunking successful. Number of chunks: {len(chunks)}")
        return chunks
    except Exception as e:
        logger.error(f"Error in chunking data: {e}")
        return None

# Split the loaded documents into small overlapping chunks for embedding.
chunks = chunk_data(
    data,
    chunk_size=256,
    chunk_overlap=20,
)
def create_embeddings(chunks):
    """Embed ``chunks`` with OpenAI and index them in a Chroma vector store.

    The API key is re-read from the JSON config file on every call.

    Returns:
        The Chroma vector store, or ``None`` when any step fails (the error
        is logged, not raised).
    """
    try:
        openai_key = Config('../parameters/config.json').get_api_key()
        logger.info("Creating embeddings...")
        store = Chroma.from_documents(
            chunks,
            OpenAIEmbeddings(api_key=openai_key),
        )
        logger.info("Embeddings created successfully.")
    except Exception as err:
        logger.error(f"Error creating embeddings: {err}")
        return None
    return store

def prompt(vector_store, query, k, *, model_name='gpt-4-1106-preview', temperature=1):
    """Answer ``query`` with a retrieval-QA chain over ``vector_store``.

    Args:
        vector_store: Store exposing ``as_retriever`` (e.g. the Chroma store
            returned by ``create_embeddings``).
        query: The question to ask.
        k: Number of chunks the similarity retriever fetches per query.
        model_name: Chat model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The chain's answer, or ``None`` when processing fails (the error is
        logged, not raised).
    """
    try:
        logger.info(f"Processing query: {query}")
        # The API key comes from the module-level ``api_key``.
        llm = ChatOpenAI(api_key=api_key, model_name=model_name, temperature=temperature)
        retriever = vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': k},
        )
        # 'stuff' chain type: build the chain from the LLM and the retriever.
        chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)
        answer = chain.run(query)
        logger.info("Query processed successfully.")
        return answer
    except Exception as e:
        logger.error(f"Error in processing query: {e}")
        return None
        
def embedding_cost(texts, *, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Estimate the USD cost of embedding ``texts``.

    Args:
        texts: Iterable of documents exposing ``page_content``.
        model: Embedding model whose tokenizer is used for counting tokens.
        price_per_1k_tokens: Price in USD per 1000 tokens.
            NOTE(review): the default keeps the previously hard-coded rate;
            confirm it against current provider pricing.

    Returns:
        The estimated cost as a float, or ``None`` when the calculation fails
        (the error is logged, not raised).
    """
    try:
        logger.info("Calculating embedding cost...")
        # Imported here so tiktoken is only needed when this helper is called.
        import tiktoken
        enc = tiktoken.encoding_for_model(model)
        # Generator expression: no intermediate list of per-document counts.
        total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
        cost = total_tokens / 1000 * price_per_1k_tokens
        logger.info(f'Total Tokens: {total_tokens}')
        logger.info(f'Embedding Cost in USD: {cost:.6f}')
        return cost
    except Exception as e:
        logger.error(f"Error in calculating embedding cost: {e}")
        return None



In [7]:
# Number of chunks the retriever fetches for each query.
k = 3
# Embed the chunks and index them in the vector store.
vector_store = create_embeddings(chunks)

In [None]:
# Sample question run against the indexed document.  Typos in the query text
# are fixed because the string is sent verbatim to the retriever and the LLM.
# NOTE(review): "Lumpkin" is presumed to be the intended county name; confirm
# against the source PDF.
query = 'What bodies of water are in Lumpkin County?'
answer = prompt(vector_store, query, k)