In [1]:
import pandas as pd
from tqdm import tqdm

from embedding.onehotembedder import OneHotEmbedder
from embedding.doc2vecembedder import Doc2VecEmbedder
from vectorstore.simplevec import SimpleVectorDatabase
from models.text import TextDoc
from models.vec import Vector

In [2]:
VECTOR_DIM = 1500

In [3]:
# load wikipedia dataset (find data in data_load notebook)
dataset = pd.read_csv("data_local/wiki_subset_small.csv", sep=",")
dataset = dataset.head(VECTOR_DIM)
# there need to be at least as many documents as the vector dimension

In [4]:
# fit the embedder
embedder = Doc2VecEmbedder(vector_dim=VECTOR_DIM)
# embedder = OneHotEmbedder(vector_dim=VECTOR_DIM)
embedder.fit(list(dataset.text))

2023-12-03 15:34:56,713::doc2vecembedder.py[_fit()]::INFO::Fitting embedder. Preprocessing documents...
100%|██████████| 1500/1500 [00:32<00:00, 45.64it/s]
2023-12-03 15:35:32,625::doc2vecembedder.py[_fit()]::INFO::Fitting Doc2Vec model...


In [5]:
# create vector database
vecdb = SimpleVectorDatabase(vector_dim=VECTOR_DIM)

# create vectors and store in database
for i in tqdm(range(len(dataset))):
    row = dataset.iloc[i]
    doc = TextDoc(row.text)
    vec = Vector(
        embedding=embedder.embed(doc),
        data=TextDoc(doc),
        metadata={"type": "wikipedia", "src": row.url, "title": row.title})
    vecdb.upsert(vec)

  0%|          | 0/1500 [00:00<?, ?it/s]

100%|██████████| 1500/1500 [14:05<00:00,  1.77it/s]


In [6]:
row = dataset.iloc[-1,:]
new_doc = TextDoc(row.text)
print(new_doc)

new_vec = Vector(
    embedding=embedder.embed(new_doc),
    data=TextDoc(new_doc),
    metadata={"type": "wikipedia", "src": row.url, "title": row.title}
)

TextDoc(42479): A neutron star is the collapsed core of a massive super ...


In [7]:
vector_1 = vecdb.get_random_vector()
vector_1.data

TextDoc(37375): The European bison (Bison bonasus) or the European wood ...

In [8]:
similar_vectors = vecdb.sim_search(vector_1, measure="cosine", k=5)
similar_vectors

[{'vector': <models.vec.Vector at 0x17f58b850>, 'score': 1.0000001},
 {'vector': <models.vec.Vector at 0x17f560950>, 'score': 0.45633447},
 {'vector': <models.vec.Vector at 0x17f5027d0>, 'score': 0.43197227},
 {'vector': <models.vec.Vector at 0x17f488c10>, 'score': 0.42692444},
 {'vector': <models.vec.Vector at 0x17f4b3950>, 'score': 0.4147767}]

In [9]:
print(f"vector:       [{vector_1.metadata['title']}] {vector_1.data}")
i = 4
print(f"most similar: [{similar_vectors[i]['vector'].metadata['title']}] {similar_vectors[i]['vector'].data}")

vector:       [European bison] TextDoc(37375): The European bison (Bison bonasus) or the European wood ...
most similar: [Ardipithecus] TextDoc(15011): Ardipithecus is a genus of an extinct hominine that liv ...


In [10]:
vector_1.data.body

'The European bison (Bison bonasus) or the European wood bison, also known as the wisent ( or ), or the zubr (), or colloquially the European buffalo, is a European species of bison. It is one of two extant species of bison, alongside the American bison. The European bison is the heaviest wild land animal in Europe and individuals in the past may have been even larger than their modern-day descendants. During late antiquity and the Middle Ages, bison became extinct in much of Europe and Asia, surviving into the 20th century only in northern-central Europe and the northern Caucasus Mountains. During the early years of the 20th century bison were hunted to extinction in the wild. The species — now numbering several thousand and returned to the wild by captive breeding programmes — is no longer in immediate danger of extinction, but remains absent from most of its historical range. It is not to be confused with the aurochs (Bos primigenius), the extinct ancestor of domestic cattle, with w

In [11]:
similar_vectors[1]['vector'].data.body

'The Carolina parakeet (Conuropsis carolinensis), or Carolina conure, is an extinct species of small green neotropical parrot with a bright yellow head, reddish orange face and pale beak that was native to the eastern, Midwest and plains states of the United States. It was the only indigenous parrot within its range, as well as one of only three parrot species native to the United States (the others being the thick-billed parrot, now extirpated, and the green parakeet, still present in Texas; a fourth parrot species, the red-crowned amazon, is debated). The Carolina parakeet was found from southern New York and Wisconsin to Kentucky, Tennessee and the Gulf of Mexico, from the Atlantic seaboard to as far west as eastern Colorado. It lived in old-growth forests along rivers and in swamps. It was called puzzi la née ("head of yellow") or pot pot chee by the Seminole and kelinky in Chickasaw. Though formerly prevalent within its range, the bird had become rare by the middle of the 19th cen