# Embedding & Vector Store

In [3]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding model further down — confirm.
load_dotenv()

True

In [None]:
# Import libraries

import config
from pprint import pprint
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): newer LangChain releases deprecate this import path in favor
# of `langchain_openai.OpenAIEmbeddings` — confirm against the installed version.
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma

In [None]:
# Notebook-wide settings, all read from the project's `config` module.

# Chunking parameters handed to the text splitter
CHUNK_SIZE, CHUNK_OVERLAP = config.chunk_size, config.chunk_overlap

# Model identifiers: tokenizer model for splitting, embedding model for vectors
MODEL_NAME, EMBEDDING_NAME = config.model_name, config.embedding_name

# Chroma collection name and its on-disk location
COLLECTION_NAME, PERSIST_DIRECTORY = (
    config.collection_name,
    config.persist_directory,
)


In [None]:
# Load the source text and split it into chunks.
# - TextLoader reads the file into a list of Documents.
# - RecursiveCharacterTextSplitter.from_tiktoken_encoder() builds a splitter
#   whose chunk size / overlap are taken from the settings above.

## Data path
path = "data/blue_r.txt"

loader = TextLoader(path, encoding="utf-8")
load_docs = loader.load()

## Split the loaded documents
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name=MODEL_NAME,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# The splitter method that accepts Documents is `split_documents` (plural);
# it returns a list of Document chunks, not plain strings.
docs = splitter.split_documents(load_docs)

print(len(docs))

## Build the shared metadata
# Last "_"-separated token of the file stem, e.g. "data/blue_r.txt" -> "r".
name = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
# NOTE(review): an earlier draft also stored a "full_text" entry, but that
# value was never defined; it is left out here — re-add it deliberately if
# the whole text is really needed on every chunk.
metadata = {
    "title": name,
    "name": name,
}

## Build the Documents
# Each chunk is already a Document, so pass its text (not the object itself)
# as page_content and layer the shared metadata over the chunk's own.
document_list = [
    Document(
        page_content=doc.page_content,
        metadata={**doc.metadata, **metadata},
    )
    for doc in docs
]

print(len(document_list))


## Save to the vector store
embedding_model = OpenAIEmbeddings(
    model=EMBEDDING_NAME
)

## Create the persist directory if it does not exist yet.
# exist_ok=True makes this a single call with no check-then-create gap.
os.makedirs(PERSIST_DIRECTORY, exist_ok=True)

# Connect to the collection and add the documents.
# NOTE(review): re-running this cell against an existing persist directory
# presumably inserts the same chunks again, since no ids are supplied —
# confirm whether that is intended.
vector_store = Chroma.from_documents(
    documents=document_list,
    embedding=embedding_model,
    collection_name=COLLECTION_NAME,
    persist_directory=PERSIST_DIRECTORY
)



# Question -> embedding vector (embed_query converts one sentence at a time).
query = "Restaurant?"
embedding_query = embedding_model.embed_query(query)

# Report the container type and the length of the returned embedding.
_vector_type = type(embedding_query)
_vector_len = len(embedding_query)
print(_vector_type, _vector_len)