In [1]:
import os
from pinecone import Pinecone
from dotenv import load_dotenv
load_dotenv()

# Initialize the Pinecone client; the API key is read from the PINECONE_API_KEY
# environment variable (load_dotenv() above runs first).
pc = Pinecone(api_key = os.environ["PINECONE_API_KEY"])

In [6]:
# Pinecone serverless example
# ServerlessSpec is imported here and used by the index-creation cells further down.
from pinecone import Pinecone, ServerlessSpec

# Creation of the 8-dimensional euclidean "quickstart" index is left commented out
# (presumably because the index already existed when this cell was run).
# pc.create_index(
#    name="quickstart",
#    dimension=8,
#    metric="euclidean",
#    spec=ServerlessSpec(
#        cloud='aws',
#        region='us-east-1'
#    )
# )

# Handle to the "quickstart" index; later cells reuse this `index` variable.
index = pc.Index('quickstart')

# Print the index statistics (dimension, fullness, namespaces, total vector count).
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [None]:
# # pinecone pod base example
# # 해당 예제에서는 사용하지 않음
# # pinecone free tier에서는 해당 spec 사용 불가

# from pinecone import Pinecone, PodSpec

# index_name = 'llm-study'
# dimension = 8
# metric = 'euclidean'
# spec = PodSpec('gcp-starter')

# if index_name in [index_info["name"] for index_info in pc.list_indexes()]:
#     pc.delete_index(index_name)

# pc.create_index(index_name, dimension=dimension, metric=metric, spec=spec)


# index = pc.Index(index_name)

# index_stats = index.describe_index_stats()
# print(index_stats)

In [7]:
# Upsert four 8-dimensional vectors (vec1-vec4) into namespace "ns1".
index.upsert(
  vectors=[
    {"id": "vec1", "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]},
    {"id": "vec2", "values": [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]},
    {"id": "vec3", "values": [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]},
    {"id": "vec4", "values": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]}
  ],
  namespace="ns1"
)

# Upsert four more vectors (vec5-vec8) into namespace "ns2".
# This call is the cell's last expression, so its result is what gets displayed.
index.upsert(
  vectors=[
    {"id": "vec5", "values": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]},
    {"id": "vec6", "values": [0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6]},
    {"id": "vec7", "values": [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7]},
    {"id": "vec8", "values": [0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8]}
  ],
  namespace="ns2"
)

{'upserted_count': 4}

In [8]:
# Show the index statistics after the two upserts (per-namespace vector counts).
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 4}, 'ns2': {'vector_count': 4}},
 'total_vector_count': 8}

In [9]:
# Query namespace "ns1" for the top 2 matches of the given vector;
# include_values=True also returns the stored vector values.
index.query(
  namespace="ns1",
  vector=[0.3,0.3,0.3,0.3,0.4,0.3,0.3,0.3],
  top_k=2,
  include_values=True
)

{'matches': [{'id': 'vec3',
              'score': 0.00999987125,
              'values': [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]},
             {'id': 'vec4',
              'score': 0.0700001717,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}

In [10]:
# Query namespace "ns2" for the top 3 matches of the given vector;
# include_values=True also returns the stored vector values.
index.query(
  namespace="ns2",
  vector=[0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7],
  top_k=3,
  include_values=True
)

{'matches': [{'id': 'vec7',
              'score': 0.0,
              'values': [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7]},
             {'id': 'vec6',
              'score': 0.0799999237,
              'values': [0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6]},
             {'id': 'vec8',
              'score': 0.0799999237,
              'values': [0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8]}],
 'namespace': 'ns2',
 'usage': {'read_units': 6}}

## Hybrid Search용 metadata Upsert

In [11]:
index_name = 'quickstart'

# Start from a clean slate: drop the index if one with this name already exists.
existing_index_names = [index_info["name"] for index_info in pc.list_indexes()]
if index_name in existing_index_names:
    pc.delete_index(index_name)

# Recreate it as an 8-dimensional serverless index using the euclidean metric.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pc.create_index(
    name=index_name,
    dimension=8,
    metric="euclidean",
    spec=serverless_spec,
)

# Rebind `index` to the new index; later cells use this handle.
index = pc.Index(index_name)

# Print the statistics of the freshly created index.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [12]:
# Upsert four records with metadata. No namespace argument is passed.
# Record "D" carries only a genre (no "year" key).
index.upsert(
  vectors=[
    {
      "id": "A",
      "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
      "metadata": {"genre":"comedy", "year": 2020}
    },
    {
      "id": "B",
      "values": [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
      "metadata": {"genre":"documentary", "year":2019}
    },
    {
      "id": "C",
      "values": [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3],
      "metadata": {"genre":"comedy","year":2019}
    },
    {
      "id": "D",
      "values": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4],
      "metadata": {"genre":"drama"}
    }
  ]
)

{'upserted_count': 4}

### query filter
Pinecone의 메타데이터 필터는 MongoDB의 쿼리 연산자와 동일한 문법을 사용한다.
- $eq - Equal to (number, string, boolean)

- $ne - Not equal to (number, string, boolean)

- $gt - Greater than (number)

- $gte - Greater than or equal to (number)

- $lt - Less than (number)

- $lte - Less than or equal to (number)

- $in - In array (string or number)

- $nin - Not in array (string or number)

In [13]:
# Show the index statistics after the metadata upsert.
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

In [14]:
# Query practice: filter on genre == "documentary" together with year 2019
# (the bare 2019 is presumably shorthand for {"$eq": 2019} — confirm in the Pinecone docs).
index.query(
    vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    filter={"genre": {"$eq": "documentary"},
            "year": 2019
    },
    top_k=1,
    include_metadata=True
)

{'matches': [{'id': 'B',
              'metadata': {'genre': 'documentary', 'year': 2019.0},
              'score': 0.0800000131,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [15]:
# Query practice: $in filter listing the genres "comedy", "documentary" and "drama";
# return the top 2 matches with their metadata.
index.query(
    vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    filter={"genre": {"$in": ["comedy", "documentary", "drama"]}
        },
    top_k=2,
    include_metadata=True
)


{'matches': [{'id': 'A',
              'metadata': {'genre': 'comedy', 'year': 2020.0},
              'score': 0.0,
              'values': []},
             {'id': 'B',
              'metadata': {'genre': 'documentary', 'year': 2019.0},
              'score': 0.0800000131,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [16]:
# Query practice: genre == "documentary" combined with year >= 2020.
# None of the four upserted records is both a documentary and from 2020 or later.
index.query(
    vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    filter={"genre": {"$eq":"documentary"},
            "year" : {"$gte": 2020}

        },
    top_k=2,
    include_metadata=True
)


{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [17]:
# $or filter: genre == "documentary" OR year >= 2020.
index.query(
    vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    filter={"$or":[
        {"genre": {"$eq":"documentary"}},
        {"year": {"$gte": 2020}}
    ]

    },
    top_k=2,
    include_metadata=True
)


{'matches': [{'id': 'A',
              'metadata': {'genre': 'comedy', 'year': 2020.0},
              'score': 0.0,
              'values': []},
             {'id': 'B',
              'metadata': {'genre': 'documentary', 'year': 2019.0},
              'score': 0.0800000131,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

## Hybrid Search용 Sparse Vector Upsert

In [18]:
index_name = 'quickstart'

# Start from a clean slate: drop the index if one with this name already exists.
existing_index_names = [index_info["name"] for index_info in pc.list_indexes()]
if index_name in existing_index_names:
    pc.delete_index(index_name)

# Recreate it as an 8-dimensional serverless index; the dotproduct metric is
# chosen because it supports sparse-vector operations.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pc.create_index(
    name=index_name,
    dimension=8,
    metric="dotproduct",
    spec=serverless_spec,
)

# Rebind `index` to the new index; later cells use this handle.
index = pc.Index(index_name)

# Print the statistics of the freshly created index.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [19]:
# Build Pinecone records that carry an id, a dense vector, metadata and a sparse vector.
records=[
    {'id': 'vec1',
     'values': [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2],
     'metadata': {'genre': 'drama'},
     # Every position not listed in 'indices' is treated as 0.
     'sparse_values': {
        'indices': [10, 45, 16],
        'values' : [0.5, 0.5, 0.2]
     }
    },
    {'id': 'vec2',
     'values': [0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.9, 0.2],
     'metadata': {'genre': 'action'},
     'sparse_values': {
        'indices' : [15, 40, 11],
        'values' : [0.4, 0.5, 0.2]
     }
    }
]

In [20]:
# Upsert the Pinecone records built above.
index.upsert(records)

{'upserted_count': 2}

In [21]:
# Try a semantic search: dense vector only, top 2 matches.
index.query(
    top_k=2,
    vector=[0.1, 0.2, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1],
)

{'matches': [{'id': 'vec2', 'score': 0.620000064, 'values': []},
             {'id': 'vec1', 'score': 0.28, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [22]:
# Try a hybrid search: same dense vector, with a sparse vector added.
index.query(
    top_k=2,
    vector=[0.1, 0.2, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1],
    sparse_vector={
        'indices' :[10,45,16],
        'values' : [0.5,0.5,0.2]
    }
)

{'matches': [{'id': 'vec1', 'score': 0.819999933, 'values': []},
             {'id': 'vec2', 'score': 0.620000064, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [24]:
# Customize how hybrid results are scored by re-weighting the query vectors.
# alpha shifts the weight between the dense and the sparse side.
def hybrid_score_norm(dense, sparse, alpha: float):
    """Scale a dense/sparse query pair by a convex-combination weight.

    Args:
        dense: Iterable of numbers forming the dense query vector.
        sparse: Mapping with an 'indices' entry and a 'values' entry
            (iterable of numbers) describing the sparse query vector.
        alpha: Weight in the closed range [0, 1]. Dense values are
            multiplied by ``alpha`` and sparse values by ``1 - alpha``.

    Returns:
        A ``(scaled_dense, scaled_sparse)`` tuple: a new list of dense
        values and a new dict with the original 'indices' object and the
        re-weighted 'values' list.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] (including NaN).
    """
    # Outside [0, 1] one side would get a negative or over-scaled weight,
    # which silently distorts the ranking instead of failing loudly.
    if not 0 <= alpha <= 1:
        raise ValueError("Alpha must be between 0 and 1")
    hs = {
        'indices': sparse['indices'],
        'values':  [v * (1 - alpha) for v in sparse['values']]
    }
    return [v * alpha for v in dense], hs

# Re-weight the sample query with alpha=0.8.
query_dense = [0.1, 0.2, 0.3, 0.1, 0.1, 0.2, 0.3, 0.1]
query_sparse = {'indices': [10, 45, 16], 'values': [0.5, 0.5, 0.2]}
hs_dense, hs_sparse = hybrid_score_norm(query_dense, query_sparse, alpha=0.8)

# Show the re-weighted dense vector, then the re-weighted sparse vector.
print(hs_dense, hs_sparse, sep="\n")

[0.08000000000000002, 0.16000000000000003, 0.24, 0.08000000000000002, 0.08000000000000002, 0.16000000000000003, 0.24, 0.08000000000000002]
{'indices': [10, 45, 16], 'values': [0.09999999999999998, 0.09999999999999998, 0.039999999999999994]}


In [25]:
# Search again using the re-weighted dense and sparse vectors.
index.query(top_k=2,vector=hs_dense,sparse_vector=hs_sparse)

{'matches': [{'id': 'vec2', 'score': 0.496, 'values': []},
             {'id': 'vec1', 'score': 0.332000017, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## 대용량 데이터 Upsert
- 한번의 upsert에 수천,수만개의 벡터를 다 업서트 시키는 대신, 배치화를 통해 업서트 파이프라인 가져가는 방법

In [26]:
import random
import itertools

# Batch job
def chunks(iterable, batch_size=100):
    """Lazily split ``iterable`` into tuples of at most ``batch_size`` items.

    Batches are yielded in order; the final one may be shorter. Nothing is
    yielded once the underlying iterator is exhausted.
    """
    source = iter(iterable)

    def next_batch():
        # Pull up to batch_size items; an empty tuple signals exhaustion.
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling next_batch until it returns ().
    yield from iter(next_batch, ())

vector_dim = 8
vector_count = 500

# Lazily produce (id, values) pairs with random 8-dimensional vectors;
# nothing is generated until the batching loop below consumes it.
example_data_generator = (
    (f'id-{i}', [random.random() for _ in range(vector_dim)])
    for i in range(vector_count)
)

# Upsert in batches of 100 rather than sending every vector in one request.
for ids_vectors_chunk in chunks(example_data_generator, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk)

In [28]:
# Show the index statistics after the batched upsert.
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 502}},
 'total_vector_count': 502}

## Fetch / Update / Delete
- 인덱스 된 레코드들에 대한 일련의 CRUD Operation


In [29]:
# Fetch records using the id values that were supplied at upsert time.
index.fetch(["vec1","vec2"])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'vec1': {'id': 'vec1',
                      'metadata': {'genre': 'drama'},
                      'sparse_values': {'indices': [10, 16, 45],
                                        'values': [0.5, 0.2, 0.5]},
                      'values': [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2]},
             'vec2': {'id': 'vec2',
                      'metadata': {'genre': 'action'},
                      'sparse_values': {'indices': [11, 15, 40],
                                        'values': [0.2, 0.4, 0.5]},
                      'values': [0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.9, 0.2]}}}

In [30]:
# Update the dense vector of record "vec1".

index.update("vec1", values=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8])

{}

In [31]:
# Fetch vec1 again to inspect the record after the dense-vector update.
index.fetch(["vec1"])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'vec1': {'id': 'vec1',
                      'metadata': {'genre': 'drama'},
                      'sparse_values': {'indices': [10, 16, 45],
                                        'values': [0.5, 0.2, 0.5]},
                      'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}}}

In [32]:
# Update the metadata of record "vec1" (sets genre and year).
index.update("vec1", set_metadata={"genre": "Hero", "year":2018})

{}

In [33]:
# Fetch vec1 again to inspect the record after the metadata update.
index.fetch(["vec1"])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'vec1': {'id': 'vec1',
                      'metadata': {'genre': 'Hero', 'year': 2018.0},
                      'sparse_values': {'indices': [10, 16, 45],
                                        'values': [0.5, 0.2, 0.5]},
                      'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}}}

In [34]:
# Update the sparse vector of record "vec1" with new indices and values.
index.update(
	id="vec1",
	sparse_values={"indices": [3,9], "values":[0.5,0.5]}
)

{}

In [35]:
# Fetch vec1 again to inspect the record after the sparse-vector update.
index.fetch(['vec1'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'vec1': {'id': 'vec1',
                      'metadata': {'genre': 'Hero', 'year': 2018.0},
                      'sparse_values': {'indices': [3, 9],
                                        'values': [0.5, 0.5]},
                      'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}}}

In [36]:
# Delete a specific record by its id.
index.delete(['vec1'])

{}

In [37]:
# Fetch vec1 once more, now that it has been deleted.
index.fetch(['vec1'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}