In [None]:
!pip install sentence_transformers
!pip install pinecone-client
!pip install tqdm

In [2]:
from sentence_transformers import SentenceTransformer
import os
import json
import ast
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
# 파일 위치 지정
keys_file_path = os.path.join('data', 'api_keys.txt')

# 파일에서 API 키를 로드하는 함수
def load_api_keys(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        keys = json.load(file)
    return keys

# API 키 사용
api_keys = load_api_keys(keys_file_path)
pinecone_keys = api_keys['pinecone_key1']

In [4]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [8]:
with open('data/documents.txt', 'r', encoding="UTF-8") as f:
    lines = f.readlines()

embedding_list = []
for line in tqdm(lines):
    response = model.encode(line)    
    embedding_list.append(response)  

  0%|          | 0/41754 [00:00<?, ?it/s]

  1%|          | 262/41754 [00:02<07:12, 95.90it/s]


KeyboardInterrupt: 

In [19]:
# 백업
with open('data/miniLM-embedded-vector', 'w', encoding='utf-8') as f:
    f.write(str(embedding_list))

In [None]:
# 로드
with open('data/miniLM-embedded-vector', 'r', encoding='utf-8') as f:
    embedded_vector = f.read()
    embedding_list = ast.literal_eval(embedded_vector)

print(embedding_list)

In [20]:
em_vectors = []
for i, em in tqdm(enumerate(embedding_list)):
    em_vectors.append({"id": str(i), "values": em, "metadata": {"assertion": lines[i]}})
print(len(em_vectors))

41754it [00:00, 956492.36it/s]

41754





In [22]:
from pinecone import Pinecone

# Pinecone 클라이언트 설정 함수
def init_pinecone_client():
    pc = Pinecone(api_key=pinecone_keys)
    index = pc.Index("minilm")
    return index

# 업서트 작업을 수행하는 함수
def upsert_batch(batch):
    index = init_pinecone_client()
    index.upsert(vectors=batch)

In [24]:
# 배치 크기 설정
batch_size = 100

# 배치 리스트 생성
batches = [em_vectors[i:i+batch_size] for i in range(0, len(em_vectors), batch_size)]

# ThreadPoolExecutor를 사용한 멀티 스레딩
with ThreadPoolExecutor() as executor:
    # tqdm을 사용하여 진행 상태 표시
    list(tqdm(executor.map(upsert_batch, batches), total=len(batches)))

100%|██████████| 418/418 [03:55<00:00,  1.78it/s]
