In [None]:
import os

ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [4]:
# Import and initialize the OpenAI embedding model via LangChain
from langchain_openai import OpenAIEmbeddings

# Import and initialize the OpenAI embedding model via LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:

# Connect to the Astra DB vector store with specified collection and embedding model
vector_store = AstraDBVectorStore(
    collection_name="mdvs",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN
)

In [10]:
# Connect to the Astra DB vector store with specified collection and embedding model
from langchain_astradb import AstraDBVectorStore
# Import and initialize the OpenAI embedding model via LangChain
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


def load_and_vectorize_text_file(file_path, chunk_size=500, chunk_overlap=50):
    try:
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()
            
        # Create a document with metadata
        file_name = file_path
        document = Document(
            page_content=text_content,
            metadata={"source": file_path, "filename": file_name}
        )
        
        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        
        # Split text into chunks
        chunks = text_splitter.split_documents([document])
        print(f"Split document into {len(chunks)} chunks")
        
        # Initialize OpenAI embeddings
# Import and initialize the OpenAI embedding model via LangChain
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=OPENAI_API_KEY
        )
        
        # Create vector store in AstraDB
# Connect to the Astra DB vector store with specified collection and embedding model
        vector_store = AstraDBVectorStore.from_documents(
            documents=chunks,
            embedding=embeddings,
            collection_name='mdvs',
            api_endpoint=ASTRA_DB_API_ENDPOINT,
            token=ASTRA_DB_APPLICATION_TOKEN,
        )
        
        print(f"Successfully stored {len(chunks)} vectorized chunks in AstraDB collection '{'mdvs'}'")
        return vector_store
        
    except Exception as e:
        print(f"Error processing file: {e}")
        return None


In [None]:

def search_similar_content(query, top_k=5):
    """
    Search for similar content using the vector store
    
    Parameters:
    query (str): The search query
    top_k (int): Number of results to return
    
    Returns:
    list: List of similar documents with scores
    """
    try:
        # Initialize embeddings
# Import and initialize the OpenAI embedding model via LangChain
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=OPENAI_API_KEY
        )
        
        # Connect to existing vector store
        vector_store = AstraDB(
            embedding=embeddings,
            collection_name=COLLECTION_NAME,
            api_endpoint=ASTRA_DB_API_ENDPOINT,
            token=ASTRA_DB_APPLICATION_TOKEN,
        )
        
        # Search for similar content
        results = vector_store.similarity_search_with_score(query, k=top_k)
        
        return results
        
    except Exception as e:
        print(f"Error searching: {e}")
        return []


In [None]:

def load_multiple_files(directory_path, chunk_size=1000, chunk_overlap=100):
    """
    Load and vectorize multiple text files from a directory
    
    Parameters:
    directory_path (str): Path to directory containing text files
    chunk_size (int): Maximum size of each chunk in characters
    chunk_overlap (int): Overlap between chunks in characters
    """
    try:
        # Get all text files in the directory
        text_files = [f for f in os.listdir(directory_path) 
                     if os.path.isfile(os.path.join(directory_path, f)) 
                     and f.endswith(('.txt', '.md'))]
        
        # Initialize embeddings and vector store
# Import and initialize the OpenAI embedding model via LangChain
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=OPENAI_API_KEY
        )
        
        # Create or connect to vector store
        vector_store = AstraDB(
            embedding=embeddings,
            collection_name=COLLECTION_NAME,
            api_endpoint=ASTRA_DB_API_ENDPOINT,
            token=ASTRA_DB_APPLICATION_TOKEN,
        )
        
        # Process each file
        for file_name in text_files:
            file_path = os.path.join(directory_path, file_name)
            
            # Read file
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()
            
            # Create document
            document = Document(
                page_content=text_content,
                metadata={"source": file_path, "filename": file_name}
            )
            
            # Split text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )
            chunks = text_splitter.split_documents([document])
            
            # Add to vector store
            vector_store.add_documents(chunks)
            print(f"Added {len(chunks)} chunks from '{file_name}'")
        
        print(f"Successfully processed {len(text_files)} files")
        return vector_store
        
    except Exception as e:
        print(f"Error processing directory: {e}")
        return None


In [None]:

# Example usage:
if __name__ == "__main__":
    # Option 1: Process a single file
    file_path = "C:\\Users\\hahaha\\Downloads\\mdvc.txt"
    vector_store = load_and_vectorize_text_file(file_path)
        


In [13]:
results = vector_store.similarity_search_with_score(
    "How to use cpap?", k=5
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.823029] air outlet on the machine.     *   Attach the other end of the tubing to your mask.     *
 If using a humidifier, fill the humidifier chamber with distilled water up to the
 indicated fill line and insert it into the machine. 3.  **Power the CPAP machine:**     *
 Plug the power supply into the machine and then into a power outlet.
 Chunk 568 | Source: https://freestyleserver.com/Payloads/IFU/2017july/ART22813-003_rev [{'source': 'C:\\Users\\hahaha\\Downloads\\mdvc.txt', 'filename': 'C:\\Users\\hahaha\\Downloads\\mdvc.txt'}]
* [SIM=0.816094] Getting used to your CPAP therapy can be challenging and there may be different equipment
 options that will work better for your personal comfort needs, so do
 Chunk 580 | Source: https://freestyleserver.com/Payloads/IFU/2017july/ART22813-003_rev
A_Web.pdf  n’t hesitate to reach out!  Not sure who your medical equipment supplier is? We
 can help. Fill out our Contact Us form or call us at 1-800-424-0737.   How do I clean my [{'sou