In [2]:
import os
from getpass import getpass

# Never paste the API key into the notebook: it ends up in the saved file and
# in the cell output. Reuse the key if the environment already provides it,
# otherwise ask for it interactively (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

env: OPENAI_API_KEY=#redacted key


In [18]:
import os
import numpy as np
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# Step 1: Load documents from the excerpts directory
loader = DirectoryLoader('data/excerpts', glob="**/*.txt", loader_cls=TextLoader)
documents = loader.load()

# Step 2: Create embeddings and store them in a Chroma vector store.
# Each document gets a stable id (its source file path) instead of a random
# one, so re-running this cell in the same kernel overwrites the existing
# entries rather than adding duplicate copies that then crowd the top-k
# results with the same source.
embeddings = OpenAIEmbeddings()
document_ids = [doc.metadata['source'] for doc in documents]
db = Chroma.from_documents(documents, embeddings, ids=document_ids)

def process_query_only_sources(query, k=2):
    """Find the excerpts most similar to ``query`` and report their dataset paths.

    Runs a similarity search against the module-level ``db`` vector store,
    maps each hit's ``metadata['source']`` file path to
    ``"data/datasets/<file name without extension>"``, prints the resulting
    paths, and returns them.

    Args:
        query: Text passed to ``db.similarity_search``.
        k: Number of results requested from the store (default 2).

    Returns:
        list[str]: Dataset paths in the order the search returned them. The
        list may contain duplicates when several hits share a file name.
    """
    def map_to_dataset(source):
        # Keep only the file name, drop its extension, and re-root it under
        # the datasets directory.
        basename = os.path.basename(source)
        name = os.path.splitext(basename)[0]
        return "data/datasets/" + name

    # Search the database for similar documents
    relevant_docs = db.similarity_search(query, k=k)

    # Build a real list: a lazy map() would be consumed by the printing loop
    # below, leaving the caller with an already-exhausted (empty) iterator.
    only_sources = [map_to_dataset(doc.metadata['source']) for doc in relevant_docs]

    print(f"Query: {query}")
    print(f"Top {k} relevant documents:")
    for i, source in enumerate(only_sources, 1):
        print(f"{i}. Source: {source}")
        print()

    return only_sources

# Example usage: run one sample question through the retrieval helper, which
# prints the query and the matching dataset paths.
query = "what's the location of indoor bike station?"
# NOTE(review): despite the name, `relevant_docs` receives the helper's return
# value, which is derived from mapped dataset paths, not Document objects.
relevant_docs = process_query_only_sources(query)

Query: what's the location of indoor bike station?
Top 2 relevant documents:
1. Source: data/datasets/bicycle-parking-racks

2. Source: data/datasets/bicycle-parking-racks

