In [1]:
import pandas as pd
from tqdm import tqdm

from embedding.onehotembedder import OneHotEmbedder
from vectorstore.simplevec import SimpleVectorDatabase
from models.text import TextDoc
from models.vec import Vector

In [2]:
VECTOR_DIM = 1500

In [3]:
# load wikipedia dataset (find data in data_load notebook)
dataset = pd.read_csv("data_local/wiki_subset_small.csv", sep=",")
dataset = dataset.head(VECTOR_DIM)
# there need to be at least as many documents as the vector dimension

In [4]:
# fit the embedder
embedder = OneHotEmbedder(vector_dim=VECTOR_DIM)
embedder.fit(list(dataset.text))

2023-12-03 14:07:22,043::onehotembedder.py[_fit()]::INFO::Fitting embedder. Preprocessing documents...
100%|██████████| 1500/1500 [00:29<00:00, 50.24it/s]
2023-12-03 14:07:52,121::onehotembedder.py[_fit()]::INFO::Corpus created with 185292 words.
2023-12-03 14:07:52,122::onehotembedder.py[_fit()]::INFO::Reducing vector dimensions: fitting PCA with 1500


In [5]:
# create vector database
vecdb = SimpleVectorDatabase(vector_dim=VECTOR_DIM)

# create vectors and store in database
for i in tqdm(range(len(dataset))):
    row = dataset.iloc[i]
    doc = TextDoc(row.text)
    vec = Vector(
        embedding=embedder.embed(doc),
        data=TextDoc(doc),
        metadata={"type": "wikipedia", "src": row.url, "title": row.title})
    vecdb.upsert(vec)

100%|██████████| 1500/1500 [03:13<00:00,  7.76it/s]


In [6]:
row = dataset.iloc[-1,:]
new_doc = TextDoc(row.text)
print(new_doc)

new_vec = Vector(
    embedding=embedder.embed(new_doc),
    data=TextDoc(new_doc),
    metadata={"type": "wikipedia", "src": row.url, "title": row.title}
)

TextDoc(42479): A neutron star is the collapsed core of a massive super ...


In [13]:
vector_1 = vecdb.get_random_vector()
vector_1.data

TextDoc(27260): Hesychasm (; Greek: Ησυχασμός) is a mystical tradition  ...

In [16]:
similar_vectors = vecdb.sim_search(vector_1, measure="cosine", k=5)
similar_vectors

[{'vector': <models.vec.Vector at 0x17deb2a50>, 'score': 1.0},
 {'vector': <models.vec.Vector at 0x17dda34d0>, 'score': 0.40092767542461605},
 {'vector': <models.vec.Vector at 0x17dd12c50>, 'score': 0.2910714883379696},
 {'vector': <models.vec.Vector at 0x17dd62ed0>, 'score': 0.26451210497771876},
 {'vector': <models.vec.Vector at 0x17de0d490>, 'score': 0.24988477528944558}]

In [21]:
print(f"vector:       [{vector_1.metadata['title']}] {vector_1.data}")
i = 4
print(f"most similar: [{similar_vectors[i]['vector'].metadata['title']}] {similar_vectors[i]['vector'].data}")

vector:       [Hesychasm] TextDoc(27260): Hesychasm (; Greek: Ησυχασμός) is a mystical tradition  ...
most similar: [Supersessionism] TextDoc(21238): Supersessionism, also called replacement theology, is a ...


In [18]:
vector_1.data.body

'Hesychasm (; Greek: Ησυχασμός) is a mystical tradition of contemplative prayer in the Eastern Orthodox Church. Based on Jesus\'s injunction in the Gospel of Matthew that "whenever you pray, go into your room and shut the door and pray to your Father who is in secret; and your Father who sees in secret will reward you", hesychasm in tradition has been the process of retiring inward by ceasing to register the senses, in order to achieve an experiential knowledge of God (see Theoria).\n\nEtymology\n\nMeaning\nHesychasm (, Modern ) derives from the word hesychia (, ), meaning "stillness, rest, quiet, silence" and hesychazo ( ) "to keep stillness".\n\nUsage\nMetropolitan Kallistos Ware, a scholar of Eastern Orthodox theology, distinguishes five distinct usages of the term "hesychasm":\n\n "solitary life", a sense, equivalent to "eremitical life", in which the term is used since the 4th century;\n "the practice of inner prayer, aiming at union with God on a level beyond images, concepts and

In [34]:
similar_vectors[1]['vector'].data.body

'Spanish may refer to:\n Items from or related to Spain:\nSpaniards, a nation and ethnic group indigenous to Spain\nSpanish language\nSpanish cuisine\n\nOther places\n Spanish, Ontario, Canada\n Spanish River (disambiguation), the name of several rivers\n Spanish Town, Jamaica\n\nOther uses\n John J. Spanish (1922–2019), American politician\n "Spanish" (song), a single by Craig David, 2003\n\nSee also\n \n \n Español (disambiguation)\n Hispania, the Roman and Greek name for the Iberian Peninsula\n Hispanic, the people, nations, and cultures that have a historical link to Spain\n Hispanic (disambiguation)\n Hispanism\n Spain (disambiguation)\n National and regional identity in Spain\n Culture of Spain\n Spanish Fort (disambiguation)\n\nLanguage and nationality disambiguation pages'