In [None]:
# Install the LlamaIndex <-> Chroma vector-store integration package.
%pip install llama-index-vector-stores-chroma


In [None]:
import chromadb

# Ephemeral Chroma client used for this notebook session.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when "quickstart" already exists in the client from an earlier execution.
chroma_collection = chroma_client.get_or_create_collection("quickstart")

In [None]:
# Sanity check: read back the collection's current contents and print them.
results = chroma_collection.get()
print(results)

In [None]:
# StorageContext is imported here because this cell is the first to use it;
# without the import a fresh "Restart & Run All" fails with NameError.
from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore

# Expose the Chroma collection to LlamaIndex as a vector store, then wrap it in
# a storage context so indexes can be built on top of it.
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
chroma_storage_context = StorageContext.from_defaults(
    vector_store=chroma_vector_store
)

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install --upgrade pip

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index-vector-stores-weaviate

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install -U weaviate-client

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index llama-hub

Populate the index from the CSV document.

In [None]:
# Set up Arize Phoenix for logging/observability.
import phoenix as px
import llama_index.core

# Launch the Phoenix app first, then register "arize_phoenix" as the global
# LlamaIndex handler.
px.launch_app()
llama_index.core.set_global_handler("arize_phoenix")

In [None]:
import os
import json 
import openai

# Read the local secrets file; it is expected to be a JSON object with a
# "secret" entry holding the OpenAI API key.
with open("openai.secret.json", "r") as file:
    secret = json.load(file)

# Publish the key through the environment and on the openai module itself.
os.environ["OPENAI_API_KEY"] = secret["secret"]
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index-llms-huggingface

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index-llms-huggingface-api

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index-embeddings-huggingface

In [None]:
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings


# Connect to the local Weaviate instance, passing the OpenAI key (as read from
# the environment) in the request headers.
client = weaviate.connect_to_local(
    headers={
        "X-OpenAI-Api-key": os.environ.get("OPENAI_API_KEY"),
    },
)

# Global LlamaIndex settings: HuggingFace embedding model and chunking parameters.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.chunk_size, Settings.chunk_overlap = 512, 50



In [None]:
# Delete the "LlamaIndex_auto" collection from Weaviate
# (presumably to start from a clean slate before re-indexing — confirm).
client.collections.delete("LlamaIndex_auto")

In [None]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

# Weaviate-backed vector store under the 'Gov_opportunities' index name, wrapped
# in the storage context used for the raw-document index.
vector_store = WeaviateVectorStore(
    index_name="Gov_opportunities",
    weaviate_client=client,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)


In [None]:
# Install LlamaIndex together with its JSON reader package.
%pip install llama-index llama-index-readers-json

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install span-marker

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install llama-index-extractors-entity

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install nest_asyncio

In [None]:
import nest_asyncio
# NOTE(review): presumably needed so the async helpers below can run inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [None]:
# Explicit %pip magic: does not depend on automagic and matches the other install cells.
%pip install python-dateutil

In [None]:
from llama_index.core import SimpleDirectoryReader
import json
from dateutil import parser
from llama_index.core.async_utils import run_jobs
from llama_index.core import SummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
)
from llama_index.core.schema import IndexNode


import os

# Directory holding the opportunity project files. Set the
# GOV_OPPORTUNITIES_DIR environment variable to point somewhere else; the
# default keeps the path this notebook was originally written against.
PROJECTS_DIR = os.environ.get(
    "GOV_OPPORTUNITIES_DIR",
    "/Users/haiyangliu/Workspace/gov_opportunity_new/content/opportunities/projects",
)

# Load every file in the directory and index the documents into the
# Weaviate-backed storage context.
documents = SimpleDirectoryReader(PROJECTS_DIR).load_data()
doc_index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)


In [None]:
# Index the same loaded documents again, this time into the Chroma-backed
# storage context.
chroma_index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=chroma_storage_context,
)


In [None]:
async def aprocess_doc(doc, include_summary: bool = True):
    """Turn one loaded document into an ``IndexNode`` with structured metadata.

    The document text is parsed as JSON. Selected fields are copied into the
    node metadata, and the response-due and posted dates (when non-empty) are
    split into separate year / month / day entries.

    Args:
        doc: Loaded document. Its ``text`` must be a JSON object and its
            ``metadata`` must contain ``file_name``.
        include_summary: When true, an OpenAI-backed query engine is asked for
            a one-sentence summary, which becomes the node text; otherwise the
            node text is the empty string.

    Returns:
        An ``IndexNode`` whose ``obj`` is a ``doc_index`` retriever filtered to
        this document's ``file_name`` and whose ``index_id`` is ``doc.id_``.

    Raises:
        KeyError: If an expected key is missing from the JSON or the metadata.
    """
    payload = json.loads(doc.text)
    metadata = doc.metadata

    # (metadata key, JSON key) pairs, copied verbatim and in this order.
    copied_fields = (
        ("project_id", "ProjectID"),
        ("department", "Department/Ind.Agency"),
        ("naics_code", "NaicsCode"),
        ("psc_code", "ProductServiceCode"),
        ("sub_department", "Sub-Tier"),
        ("set_aside", "SetASide"),
        ("set_aside_code", "SetASideCode"),
        ("type", "Type"),
        ("place_of_performance_country", "PlaceOfPerformanceCountry"),
        ("place_of_performance_city", "PlaceOfPerformanceCity"),
        ("place_of_performance_state", "PlaceOfPerformanceState"),
    )
    new_metadata = {target: payload[source] for target, source in copied_fields}

    # JSON date key -> the (year, month, day) metadata keys it expands into.
    date_fields = (
        ("ResponseDueDate", ("due_date_year", "due_date_month", "due_date_day")),
        ("PostedDate", ("posted_date_year", "posted_date_month", "posted_date_day")),
    )
    for source, (year_key, month_key, day_key) in date_fields:
        raw_date = payload[source]
        if not raw_date:
            # Empty / missing value: leave the three entries out entirely.
            continue
        parsed = parser.parse(raw_date)
        new_metadata[year_key] = parsed.year
        new_metadata[month_key] = parsed.month
        new_metadata[day_key] = parsed.day

    summary_txt = ""
    if include_summary:
        engine = SummaryIndex.from_documents([doc]).as_query_engine(
            llm=OpenAI(model="gpt-3.5-turbo")
        )
        answer = await engine.aquery(
            "Give a one-sentence concise summary of this project."
        )
        summary_txt = str(answer)

    # Restrict retrieval to the chunks that came from this document's file.
    same_file = MetadataFilter(
        key="file_name", operator=FilterOperator.EQ, value=metadata["file_name"]
    )
    filters = MetadataFilters(filters=[same_file])

    # The summary text is what gets embedded; the retriever is what it resolves to.
    return IndexNode(
        text=summary_txt,
        metadata=new_metadata,
        obj=doc_index.as_retriever(filters=filters),
        index_id=doc.id_,
    )

async def aprocess_docs(docs, include_summary: bool = True, workers: int = 3):
    """Build one index node per document by running ``aprocess_doc`` jobs.

    Args:
        docs: Iterable of documents accepted by ``aprocess_doc``.
        include_summary: Forwarded to ``aprocess_doc``. Defaults to ``True``,
            which matches the previous behaviour.
        workers: Forwarded to ``run_jobs`` as its ``workers`` argument.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        Whatever ``run_jobs`` returns for the list of ``aprocess_doc``
        coroutines (one per document).
    """
    tasks = [aprocess_doc(doc, include_summary=include_summary) for doc in docs]
    return await run_jobs(tasks, show_progress=True, workers=workers)


In [None]:
# Inspect the metadata attached to each loaded document.
for doc in documents:
    print(doc.metadata)

In [None]:
# Build one index node per loaded document (summaries are included by default).
index_nodes = await aprocess_docs(documents)

In [None]:
# Inspect the structured metadata produced for each index node.
for node in index_nodes:
    print(node.metadata)

In [None]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

# Weaviate index name for the auto-retrieval nodes.
class_name = "LlamaIndex_auto"

# Note: the following Chroma cell rebinds vector_store_auto and
# storage_context_auto, so run only the backend you intend to use.
vector_store_auto = WeaviateVectorStore(
    index_name=class_name,
    weaviate_client=client,
)
storage_context_auto = StorageContext.from_defaults(vector_store=vector_store_auto)

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

class_name = "LlamaIndex_auto_chroma"

# Give the index nodes their own Chroma collection, named after class_name
# (previously assigned but unused). The "quickstart" collection already holds
# the raw document chunks written by chroma_index, so reusing it would mix the
# two kinds of entries in one store.
chroma_collection_auto = chroma_client.get_or_create_collection(class_name)

vector_store_auto = ChromaVectorStore(
    chroma_collection=chroma_collection_auto
)
storage_context_auto = StorageContext.from_defaults(
    vector_store=vector_store_auto
)


In [None]:
from llama_index.core.callbacks import (
    CallbackManager,
    LlamaDebugHandler,
    CBEventType,
)

# Debug handler configured with print_trace_on_end=True, wired into the
# callback manager that the index and retriever below receive.
llama_debug = LlamaDebugHandler(
    print_trace_on_end=True,
)
callback_manager = CallbackManager(handlers=[llama_debug])

In [None]:
# Index built from the per-document index nodes (passed as `objects`) on top of
# the auto-retrieval storage context.
index = VectorStoreIndex(
    objects=index_nodes,
    storage_context=storage_context_auto,
    callback_manager=callback_manager,
)

In [None]:
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo

# Schema description handed to the auto-retriever: one MetadataInfo per
# metadata key written by aprocess_doc, listed as (name, type, description).
vector_store_info = VectorStoreInfo(
    content_info="Government contract opportunities",
    metadata_info=[
        MetadataInfo(
            name=field_name,
            description=field_description,
            type=field_type,
        )
        for field_name, field_type, field_description in (
            (
                "project_id",
                "string",
                "the unique identifier of this opportunity",
            ),
            (
                "department",
                "string",
                "The government department that published this opportunity",
            ),
            (
                "naics_code",
                "string",
                "NAICS Code. This code is maximum of 6 digits, also referred as Industry Classification Code",
            ),
            (
                "psc_code",
                "string",
                "Classification Code of this opportunity, also referred as Product Service Code or PSC Code for short",
            ),
            (
                "sub_department",
                "string",
                "Division or Sub-Department that published this opportunity",
            ),
            (
                "set_aside",
                "string",
                "Description of the Set Aside",
            ),
            (
                "set_aside_code",
                "string",
                "Code of the Set Aside field",
            ),
            (
                "type",
                "string",
                "Procurement Type of this opportunities",
            ),
            (
                "place_of_performance_country",
                "string",
                "Country of the Place of Performance for this project",
            ),
            (
                "place_of_performance_city",
                "string",
                "City of the Place of Performance for this project",
            ),
            (
                "place_of_performance_state",
                "string",
                "State of the Place of Performance for this project",
            ),
            (
                "due_date_year",
                "integer",
                "The year of the Response Deadline date",
            ),
            (
                "due_date_month",
                "integer",
                "the month of the Response Deadline date",
            ),
            (
                "due_date_day",
                "integer",
                "the day of the Response Deadline date",
            ),
            (
                "posted_date_year",
                "integer",
                "Year of the Opportunity Posted Date",
            ),
            (
                "posted_date_month",
                "integer",
                "Month of the Opportunity Posted Date",
            ),
            (
                "posted_date_day",
                "integer",
                "day of the Opportunity Posted Date",
            ),
        )
    ],
)

In [None]:
from llama_index.core.retrievers import VectorIndexAutoRetriever

# Auto-retriever over `index`, driven by the metadata schema in vector_store_info.
retriever = VectorIndexAutoRetriever(
    index,
    vector_store_info=vector_store_info,
    callback_manager=callback_manager,
    verbose=True,
    similarity_top_k=2,
    # if only metadata filters are specified, this is the limit
    empty_query_top_k=10,
)

In [None]:
from llama_index.core import QueryBundle

# Probe query 1: a topical, free-text request.
nodes = retriever.retrieve(QueryBundle("find me opportunities about plumbing"))

In [None]:
# Probe query 2: names a department. This rebinds `nodes`, so the metadata
# listing cell further down shows the results of this query.
nodes = retriever.retrieve(QueryBundle("Find me the projects from Department of Defense"))

In [None]:
# Baseline for comparison: default retriever over the raw-document index
# (doc_index), queried with a plain string.
old_retriever = doc_index.as_retriever()
old_nodes = old_retriever.retrieve("find me plumbing related projects")
for old_node in old_nodes:
    print(old_node.metadata)

In [None]:
# Show the metadata of the nodes returned by the most recent auto-retriever query.
for node in nodes:
    print(node.metadata)

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.openai import OpenAI

# LLM passed to the query engine, which is built on top of the auto-retriever.
llm = OpenAI(model="gpt-3.5-turbo")
query_engine = RetrieverQueryEngine.from_args(retriever=retriever, llm=llm)

In [None]:
# End-to-end query through the retriever-backed query engine.
response = query_engine.query("find me plumbing related projects")


In [None]:
# print() already converts its argument with str(), so the text is the same.
print(response)