# Module

In [None]:
import glob
import pandas as pd
from llama_index.core import Document
from tqdm import tqdm
import os
import sys
import dotenv
dotenv.load_dotenv()
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from llama_index.core import SimpleDirectoryReader
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from pymilvus import Collection, connections
import time

import openai
from llama_index.llms.openai import OpenAI

### Setting

In [2]:
# Start timing this cell, then load environment variables from the .env file.
start = time.time()
dotenv.load_dotenv()

# LLM client and the embedding model handed to the index later on.
llm = OpenAI(model="gpt-4o-mini")
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")

# Milvus-backed vector store for the "wiki_test" collection.
# NOTE(review): overwrite=True presumably drops and recreates an existing
# collection with this name on every run -- confirm this is intended.
# NOTE(review): dim=1024 is assumed to match the embedding size of the
# model configured above -- verify if the embedding model changes.
vector_store = MilvusVectorStore(
    uri="http://localhost:19530",
    collection_name="wiki_test",
    dim=1024,
    similarity_metric="COSINE",
    overwrite=True,
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Report how long the setup above took, in seconds.
end = time.time()
print("실행시간 : ", end - start)

실행시간 :  11.857601642608643


# Load

In [3]:
# Parquet file to index; the "wiki_<lang>" file-name suffix is parsed by a
# later cell to obtain the language code.
parquet_file = "/home/livin/rag_pipeline/wikipedia_rag/data/wikipedia/wiki_ko.parquet"

# Read the whole file into memory as a DataFrame.
df = pd.read_parquet(path=parquet_file)

In [None]:
# Language code parsed from the file name: the text between the last "wiki_"
# and the following ".", e.g. ".../wiki_ko.parquet" -> "ko".
lang = parquet_file.split("wiki_")[-1].split(".")[0]

# Index at most the first 1000 rows. The count is capped at len(df) so that a
# parquet file with fewer rows does not raise IndexError from `df.iloc`.
num_docs = min(1000, len(df))
documents = []

for i in tqdm(range(num_docs)):
    # Fetch the row once instead of performing three separate iloc lookups.
    row = df.iloc[i]
    document = Document(
        text=row["text"],
        metadata={"filename": row["title"], "url": row["url"], "lang": lang},
    )
    # Keep "url" and "lang" in the stored metadata, but list them as excluded
    # for both the embedding input and the LLM input.
    document.excluded_embed_metadata_keys = ["url", "lang"]
    document.excluded_llm_metadata_keys = ["url", "lang"]
    documents.append(document)

# Split the documents into overlapping chunks (nodes).
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)
print(len(nodes))

# Build the index over the nodes using the embedding model and the storage
# context (vector store) configured in the settings cell.
index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context, show_progress=True)

100%|██████████| 1000/1000 [00:00<00:00, 6416.63it/s]


Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

5968


Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

In [None]:
from multiprocessing import Pool
def batch_indexing()