In [5]:
import os

#!pip install pymilvus
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma, Milvus
from langchain_openai import OpenAIEmbeddings

# Load environment variables from .env
load_dotenv()

# Define the directory containing the text files and the persistent directory
current_dir = "C:\\Users\\Administrator\\PycharmProjects\\ReadReco\\research\\notebooks"
books_dir = os.path.join(current_dir, "books")

In [6]:
# Ensure the books directory exists
if not os.path.exists(books_dir):
    raise FileNotFoundError(f"The directory {books_dir} does not exist. Please check the path.")
else:
    # List all text files in the directory
    book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []

    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()

        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Milvus.from_documents(docs, embeddings, collection_name="novels", connection_args={"host": "127.0.0.1", "port": "19530"})
    print("\n--- Finished creating and persisting vector store ---")

Created a chunk of size 3416, which is longer than the specified 1000
Created a chunk of size 1093, which is longer than the specified 1000
Created a chunk of size 1748, which is longer than the specified 1000
Created a chunk of size 1922, which is longer than the specified 1000
Created a chunk of size 1523, which is longer than the specified 1000
Created a chunk of size 1075, which is longer than the specified 1000
Created a chunk of size 2093, which is longer than the specified 1000
Created a chunk of size 1683, which is longer than the specified 1000
Created a chunk of size 1556, which is longer than the specified 1000
Created a chunk of size 1019, which is longer than the specified 1000
Created a chunk of size 1437, which is longer than the specified 1000
Created a chunk of size 1766, which is longer than the specified 1000
Created a chunk of size 1046, which is longer than the specified 1000
Created a chunk of size 1055, which is longer than the specified 1000
Created a chunk of s


--- Document Chunks Information ---
Number of document chunks: 1991

--- Creating embeddings ---

--- Finished creating embeddings ---

--- Creating and persisting vector store ---

--- Finished creating and persisting vector store ---
