# AI assisted Chatbot

## Saving Bill Data

```mermaid 
sequenceDiagram
    participant C as CongressAPI
    participant PT as Pipeline Transforms
    box "Storage"
        participant S3 as S3 Storage
        participant PG as PostgreSQL
        participant M as Milvus
    end

    C->>PT: Bill Comes In
    PT->>S3: BillPDF
    PT->>M: Chunk Bill Text, Generate Embeddings
    PT->>S3: BillPDF.embeddings
    PT->>PG: Create Bill and BillChunk Entities with Embedding IDs
```



## Querying Bill Text

```mermaid
sequenceDiagram
    participant U as User
    participant PT as Pipeline Transforms
    participant M as Milvus
    participant DB as Postgres

    U->>PT: Submit Query
    PT->>M: Generate Embedding for Query
    M->>M: Perform Similarity Search
    M->>PT: Return Closest Embedding IDs
    DB->>PT: Retrieve BillChunks by embedding ID
    PT->>U: Return Relevant Chunks and Optional Full Bill
```


In [2]:
%%capture
%pip install PyPDF2 requests numpy pymilvus prisma pydantic boto3 minio langchain

In [3]:
import os
import sys
import logging

cwd = os.getcwd()
repo_root = os.path.abspath(os.path.join(cwd, "../../../"))
sys.path.append(repo_root)
print(repo_root)

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


/Users/djdaniels/code/active_workspace/jupyter


In [5]:
import io
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections
import PyPDF2
import numpy as np
import requests
import time
import pickle
from utils.request_cache import RequestCache
from utils.s3_connector import S3Connector

request_cache = RequestCache("./cache/embeddings_cache.pkl")

api_url = "https://api.openai.com/v1/embeddings"
openai_api_key = os.environ.get("CHATGPT_API_KEY")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_api_key}",
}


test_file_path = "./public-law-101-336.pdf"
bill_name = os.path.splitext(os.path.basename(test_file_path))[0]
pdf_file_key = f"{bill_name}/{os.path.basename(test_file_path)}"
# Example Usage
# s3_connector = S3Connector("bills")
# s3_connector.put_file(test_file_path, pdf_file_key)

2023-11-27 11:10:19,324 - INFO - Cache loaded from ./cache/embeddings_cache.pkl


In [7]:
def extract_text_from_pdf(pdf_path):
    logging.info(f"Extracting text from {pdf_path}")
    start_time = time.time()
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = "".join(
            reader.pages[page_num].extract_text()
            for page_num in range(len(reader.pages))
        )
    end_time = time.time()
    logging.info(f"Text extraction completed {end_time - start_time:.2f}")
    return text


def get_embeddings_for_chunks(chunks, request_cache):
    all_embeddings = []
    for i, chunk in enumerate(chunks):
        chunk_hash = RequestCache.generate_key(chunk)

        embeddings = request_cache.get(chunk_hash)
        if embeddings is None:
            start_time = time.time()
            data = {"model": "text-similarity-davinci-001", "input": chunk}
            response = requests.post(url=api_url, headers=headers, json=data)
            end_time = time.time()

            if response.status_code == 200:
                try:
                    embeddings = response.json()["data"][0]["embedding"]
                    request_cache.set(chunk_hash, embeddings)
                except KeyError as e:
                    logging.error(f"KeyError: {e} in response: {response.json()}")
                    continue
            else:
                logging.error(
                    f"Error in API request: {response.status_code} - {response.text}"
                )
                continue

            logging.info(
                f"Generated embeddings for chunk {i+1} in {end_time - start_time:.2f} seconds"
            )

        all_embeddings.append(embeddings)
    return all_embeddings


def chunk_text_by_tokens(text, max_tokens=2047, overlap=0):
    logging.info("Chunking text by tokens")
    words = text.split()
    chunks = []
    current_chunk = []
    token_count = 0

    for word in words:
        estimated_tokens = (
            len(word) // 4 + 1
        )  # Rough estimation of tokens for each word
        token_count += estimated_tokens

        if token_count < max_tokens:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            token_count = estimated_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    logging.info(f"Text chunked into {len(chunks)} parts based on tokens")
    return chunks

def process_bill_pdf(bill_pdf_path, request_cache):
    # sourcery skip: inline-immediately-returned-variable
    bill_text = extract_text_from_pdf(bill_pdf_path)
    chunks = chunk_text_by_tokens(bill_text)
    embeddings = get_embeddings_for_chunks(chunks, request_cache)
    return embeddings
    

| Run the procedure to generate embeddings of the text document

In [10]:
pdf_text = extract_text_from_pdf(test_file_path)
text_chunks = chunk_text_by_tokens(pdf_text, overlap=100)
embeddings = get_embeddings_for_chunks(text_chunks, request_cache)
logging.info(f"vector dimensionality is {len(embeddings[0])}")

# Save cache
request_cache.save()

concatenated_embeddings = np.concatenate(embeddings)

pickle.dump(
    concatenated_embeddings, open("../data/public-law-101-336.embeddings.pkl", "wb")
)

2023-11-27 11:11:58,367 - INFO - Extracting text from ./public-law-101-336.pdf
2023-11-27 11:11:58,888 - INFO - Text extraction completed 0.52
2023-11-27 11:11:58,888 - INFO - Chunking text by tokens
2023-11-27 11:11:58,891 - INFO - Text chunked into 23 parts based on tokens


TypeError: object of type 'float' has no len()

In [None]:
connections.connect("default", host="localhost", port="19530")

# Define the fields in your collection
# Assuming each embedding is a 256-dimensional vector
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=256)

# Create a primary key field if you want to uniquely identify each embedding
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)

# Define the schema of the collection
schema = CollectionSchema(
    fields=[id_field, embedding_field], description="Bill Embeddings Collection"
)

# Name of the collection
collection_name = "BillEmbeddings"

# Create the collection in Milvus

collection = Collection(name=collection_name, schema=schema)

# Assuming you have a list of embeddings and their corresponding IDs
embeddings = [concatenated_embeddings]  # Your embeddings
ids = [1]  # Integer IDs corresponding to each embedding

# Insert data into the collection
mr = collection.insert([ids, embeddings])

# Optionally, you can flush the collection to make data searchable
collection.load()