In [1]:
# !pip install google-cloud-aiplatform==1.46.0 \
# 'bigframes<1.0.0' \
# langchain==0.1.14 \
# langchain_google_vertexai==0.1.2 \
# chromadb==0.4.24 \
# unstructured==0.12.6 \
# pillow-heif==0.15.0 \
# unstructured-inference==0.7.25 \
# pypdf==4.1.0 \
# pdf2image==1.17.0 \
# unstructured_pytesseract==0.3.12 \
# pikepdf==8.14.0 \
# --upgrade \
# --user

In [6]:
import os
import langchain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
import vertexai

PROJECT_ID = "earnest-triumph-448219-g9"
LOCATION = "us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

INDEX_PATH = "./dataset/"
PERSIST_PATH = "./persistentdb/"

MODEL = "gemini-1.5-pro"
EMBEDDING_MODEL = "textembedding-gecko@003"

ModuleNotFoundError: No module named 'langchain'

In [6]:
def get_split_documents(index_path: str) -> list[str]:
    """
    This function is used to chunk documents and convert them into a list.

    Args:
    index_path: Path of the dataset folder containing the documents.

    Returns:
    List of chunked, or split documents.
    """

    split_docs = []

    for file_name in os.listdir(index_path):
        print(f"file_name : {file_name}")
        if file_name.endswith(".pdf"):
            loader = UnstructuredPDFLoader(index_path + file_name)
        else:
            loader = TextLoader(index_path + file_name)

        text_splitter = CharacterTextSplitter(chunk_size=8192, chunk_overlap=128)
        split_docs.extend(text_splitter.split_documents(loader.load()))

    return split_docs

EMBEDDING_NUM_BATCH = 5

embeddings = VertexAIEmbeddings(
    model_name=EMBEDDING_MODEL
)

In [None]:
split_docs = get_split_documents(INDEX_PATH)
db = Chroma.from_documents(
    documents=split_docs, embedding=embeddings, persist_directory=PERSIST_PATH
)
# db.persist()

In [7]:
llm = VertexAI(
    model=MODEL,
    max_output_tokens=2048,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

template = """
    You are a helpful AI assistant. You're tasked to answer the question given below, but only based on the context provided.
    context:

    {context}


    question:

    {input}


    If you cannot find an answer ask the user to rephrase the question.
    answer:

"""
prompt = PromptTemplate.from_template(template)

In [8]:
combine_docs_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [9]:
response = retrieval_chain.invoke({"input": "Tell me about Figuring the EIC."})

In [10]:
print(response["answer"])

To figure out the amount of your Earned Income Credit, you have two options:

1. **Let the IRS calculate it for you:** This is the simpler option. Refer to "IRS Will Figure the EIC for You" in Publication 596 for guidance.

2. **Calculate it yourself:** If you prefer to calculate the EIC on your own,  complete the front of Schedule R (Form 1040) and then proceed to Part III of the same schedule.  "How To Figure the EIC Yourself" in Publication 596 will provide detailed instructions.



<h2>Upload ChromaDb to GCS 

In [13]:
from google.cloud import storage
import os


BUCKET_NAME = "nl-llm"
GCS_PERSIST_PATH = "chroma/"
LOCAL_PERSIST_PATH = "./persistentdb/"

# Initialize GCS client
storage_client = storage.Client()

def upload_directory_to_gcs(local_directory, bucket_name, gcs_directory):
    """Upload all files in a local directory to a GCS directory."""
    bucket = storage_client.bucket(bucket_name)
    
    for root, _, files in os.walk(local_directory):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(local_file_path, local_directory)
            blob = bucket.blob(os.path.join(gcs_directory, relative_path))
            blob.upload_from_filename(local_file_path)
            print(f"Uploaded {local_file_path} to gs://{bucket_name}/{gcs_directory}{relative_path}")

# Upload Chroma persisted data to GCS bucket
upload_directory_to_gcs(LOCAL_PERSIST_PATH, BUCKET_NAME, GCS_PERSIST_PATH)

Uploaded ./persistentdb/chroma.sqlite3 to gs://nl-llm/chroma/chroma.sqlite3
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/header.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/header.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/length.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/length.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin


In [None]:

def get_split_documents(index_path: str) -> List[str]:
    """
    This function is used to chunk documents and convert them into a list.

    Args:
    index_path: Path of the dataset folder containing the documents.

    Returns:
    List of chunked, or split documents.
    """
    split_docs = []

    # Check if directory exists
    if not os.path.exists(index_path):
        raise ValueError(f"Directory not found: {index_path}")

    for file_name in os.listdir(index_path):
        file_path = os.path.join(index_path, file_name)
        print(f"Processing file: {file_name}")
        
        try:
            # Select loader based on file type
            if file_name.lower().endswith(".pdf"):
                loader = UnstructuredPDFLoader(file_path)
            else:
                loader = TextLoader(file_path)

            # Split documents into chunks
            text_splitter = CharacterTextSplitter(
                chunk_size=8192,
                chunk_overlap=128,
                separator="\n"
            )
            
            documents = loader.load()
            split_chunks = text_splitter.split_documents(documents)
            split_docs.extend(split_chunks)
            
            print(f"Successfully processed {file_name}: {len(split_chunks)} chunks created")
            
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
            continue

    print(f"Total chunks created: {len(split_docs)}")
    return split_docs

In [19]:
import os
import langchain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import UnstructuredPDFLoader,PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from llama_cpp import Llama
import pandas as pd

# Project & Database Setup
PROJECT_ID = "your_project_id"
LOCATION = "us-central1"
PERSIST_PATH = "./persistentdb/"
os.makedirs(PERSIST_PATH, exist_ok=True)

# Load PDF Document
pdf_loader = PyPDFLoader("tax-guidance.pdf")
documents = pdf_loader.load()

# Split Text into Chunks
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

# Initialize Embedding Model
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Store Documents in ChromaDB
db = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding_function,
    persist_directory=PERSIST_PATH
)
print(f"✅ Stored {len(split_docs)} chunks in ChromaDB!")

# Create Retriever
retriever = db.as_retriever()

# Define Prompt Template
template = """
    You are a helpful AI assistant. You're tasked to answer the question given below, but only based on the context provided.

    Context:
    {context}

    Question:
    {input}

    If you cannot find an answer, ask the user to rephrase the question.

    Answer:
"""
prompt = PromptTemplate.from_template(template)

# Load Mistral 7B GGUF Model
llm = Llama(model_path="mistral-7b-v0.1.Q4_K_M.gguf", n_ctx=2048)

# Create Retrieval Chain
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Query Example
query = "Tell me about Figuring the EIC?"
answer = retrieval_chain.invoke({"input": query})

print(f"Answer: {answer}")


llama_model_load_from_file_impl: using device Metal (Apple M3 Pro) - 11116 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from mistral-7b-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32          

✅ Stored 37 chunks in ChromaDB!


ggml_metal_init: GPU name:   Apple M3 Pro
ggml_metal_init: GPU family: MTLGPUFamilyApple9  (1009)
ggml_metal_init: GPU family: MTLGPUFamilyCommon3 (3003)
ggml_metal_init: GPU family: MTLGPUFamilyMetal3  (5001)
ggml_metal_init: simdgroup reduction   = true
ggml_metal_init: simdgroup matrix mul. = true
ggml_metal_init: has residency sets    = false
ggml_metal_init: has bfloat            = true
ggml_metal_init: use bfloat            = false
ggml_metal_init: hasUnifiedMemory      = true
ggml_metal_init: recommendedMaxWorkingSetSize  = 12884.92 MB
ggml_metal_init: loaded kernel_add                                    0x3d31317c0 | th_max = 1024 | th_width =   32
ggml_metal_init: loaded kernel_add_row                                0x3d32f4700 | th_max = 1024 | th_width =   32
ggml_metal_init: loaded kernel_sub                                    0x3d32f5970 | th_max = 1024 | th_width =   32
ggml_metal_init: loaded kernel_sub_row                                0x3d31319f0 | th_max = 1024 | th_

TypeError: object of type 'StringPromptValue' has no len()