In [1]:
import pandas as pd
from tqdm import tqdm
import pprint

from embedding import OneHotEmbedder, Doc2VecEmbedder
from vectorstore import SimpleVectorDatabase
from models import TextDoc, Vector

In [2]:
# Dimensionality shared by the embedding algorithm and the vector database.
# Larger values can capture more detail, but every similarity computation
# becomes more expensive. For reference, OpenAI's embedding models use a
# dimension of 1536.
VECTOR_DIM: int = 200

In [3]:
# Read the Wikipedia subset (the data is also covered in the `data_load`
# notebook) and keep only the leading rows to limit computation time.
# NOTE(review): the row cap reuses VECTOR_DIM, i.e. the embedding dimension
# doubles as the sample size here -- TODO confirm this coupling is intended.
dataset = pd.read_csv("data_local/wiki_subset_small.csv", sep=",").head(VECTOR_DIM)

# Preview the first rows.
dataset.head()

Unnamed: 0,id,url,title,text
0,3255,https://en.wikipedia.org/wiki/Apostles%27%20Creed,Apostles' Creed,The Apostles' Creed (Latin: Symbolum Apostolor...
1,27487,https://en.wikipedia.org/wiki/Split%20screen%2...,Split screen (video production),"In film and video production, split screen is ..."
2,37686,https://en.wikipedia.org/wiki/Parasports,Parasports,Parasports are sports played by people with a ...
3,18739,https://en.wikipedia.org/wiki/Laika,Laika,Laika (; c. 1954 – 3 November 1957) was a Sovi...
4,39095,https://en.wikipedia.org/wiki/Extension%20%28s...,Extension (semantics),§\nIn any of several fields of study that trea...


In [18]:
# Select the embedding model. OneHotEmbedder and Doc2VecEmbedder are the
# currently available options; to try the latter, use
# Doc2VecEmbedder(vector_dim=VECTOR_DIM) instead.
embedder = OneHotEmbedder(
    vector_dim=None,
    encoding_method="additive",
)

# Fit the embedder on all article texts; this could take a few minutes.
embedder.fit(dataset["text"].tolist())

2023-12-04 13:56:01,414::onehotembedder.py[_fit()]::INFO::Fitting embedder. Preprocessing documents...
100%|██████████| 200/200 [00:03<00:00, 53.31it/s]
2023-12-04 13:56:05,193::onehotembedder.py[_fit()]::INFO::Corpus created with 48639 words.


In [12]:
# Create the vector database instance with the configured dimension.
vecdb = SimpleVectorDatabase(vector_dim=VECTOR_DIM)

# Embed every article and store it in the database.
# itertuples() walks the rows directly instead of doing a positional .iloc
# lookup for every index; `total` gives tqdm the length it cannot infer from
# the iterator, so the progress bar still shows a percentage.
for row in tqdm(dataset.itertuples(index=False), total=len(dataset)):
    doc = TextDoc(row.text)
    vec = Vector(
        embedding=embedder.embed(doc),
        # `doc` already is a TextDoc, so it is passed as-is rather than being
        # wrapped in a second TextDoc(...) call.
        data=doc,
        metadata={"type": "wikipedia", "src": row.url, "title": row.title},
    )
    vecdb.upsert(vec)

100%|██████████| 200/200 [00:07<00:00, 27.45it/s]


In [13]:
# Take the last row of the dataset and use it as the query document.
# This row belongs to the same `dataset` that is loaded into the database,
# so the document itself is expected among the search hits.
row = dataset.iloc[-1]
new_doc = TextDoc(row.text)
print(new_doc)

# Build the Vector model used by our database for the query document.
new_vec = Vector(
    embedding=embedder.embed(new_doc),
    # `new_doc` already is a TextDoc; no second TextDoc(...) wrapper needed.
    data=new_doc,
    metadata={"type": "wikipedia", "src": row.url, "title": row.title},
)
print(new_vec)

TextDoc(1318): Calligra Words is a word processor, which is part of Ca ...
<Vector b420e093-29e1-486e-b011-ad036f812ae4 (200) : TextDoc(1318): Calligra Words is a word processor, which is part of Ca ...>


In [14]:
# Query the database for the vectors most similar to `new_vec`.
# `measure` selects the similarity measure (currently implemented: cosine,
# euclidean and dot) and `k` the number of similar vectors to return.
similar_vectors = vecdb.sim_search(
    new_vec,
    measure="dot",
    k=5,
)
similar_vectors

[{'vector': <Vector b865b78d-72d4-4656-b70a-d49de2535d0c (200) : TextDoc(1318): Calligra Words is a word processor, which is part of Ca ...>,
  'score': 1108.7594999996882},
 {'vector': <Vector 45fca3bf-ed35-4e74-8f49-ae2470fa2e12 (200) : TextDoc(281): Jan Borukowski of Bielin (1524–1584) was the Bishop
  of  ...>,
  'score': 929.0194999997086},
 {'vector': <Vector 1923b6df-24a1-4acd-abaf-e6507a533a9d (200) : TextDoc(308): The bell curve is typical of the normal distribution.
  
   ...>,
  'score': 926.0444999997198},
 {'vector': <Vector c68520cd-e726-479c-8292-51d3092b8fbc (200) : TextDoc(141): The 1290s was a decade of the Julian Calendar which beg ...>,
  'score': 923.2194999997098},
 {'vector': <Vector 5de16f85-0fb7-4118-b962-2ac8c4b56d00 (200) : TextDoc(324): Islamic Jihad may refer to:
  
   Islamic Jihad Movement in ...>,
  'score': 923.1994999996916}]

In [15]:
# Print the query document first, then one line per search hit, each with
# its title so the matches can be compared at a glance.
print(f"Vector:       [title: {new_vec.metadata['title']}] {new_vec.data}")
for hit in similar_vectors:
    hit_vec = hit['vector']
    print(f"Most similar: [title: {hit_vec.metadata['title']}] {hit_vec.data}")

Vector:       [title: Calligra Words] TextDoc(1318): Calligra Words is a word processor, which is part of Ca ...
Most similar: [title: Calligra Words] TextDoc(1318): Calligra Words is a word processor, which is part of Ca ...
Most similar: [title: Jan Borukowski] TextDoc(281): Jan Borukowski of Bielin (1524–1584) was the Bishop
of  ...
Most similar: [title: Bell curve (disambiguation)] TextDoc(308): The bell curve is typical of the normal distribution.

 ...
Most similar: [title: 1290s] TextDoc(141): The 1290s was a decade of the Julian Calendar which beg ...
Most similar: [title: Islamic Jihad] TextDoc(324): Islamic Jihad may refer to:

 Islamic Jihad Movement in ...


In [16]:
# Search article: pretty-print the first 1000 characters of the query body.
# `pp` is a wide, compact printer so long text wraps into few lines.
pp = pprint.PrettyPrinter(compact=True, width=150)
query_excerpt = new_vec.data.body[:1000]
pp.pprint(query_excerpt)

('Calligra Words is a word processor, which is part of Calligra Suite and developed by KDE as free software.\n'
 '\n'
 'History \n'
 '\n'
 'When the Calligra Suite was formed, unlike the other Calligra applications Words was not a continuation of the corresponding KOffice application – '
 'KWord. The Words was largely written from scratch – in May 2011 a completely new layout engine was announced. The first release was made available '
 'on , using the version number 2.4 to match the rest of Calligra Suite.\n'
 '\n'
 'Reception \n'
 "Initial reception of Calligra Words shortly after the 2.4 release was mixed. While Linux Pro Magazine Online's Bruce Byfield wrote “Calligra "
 'needed an impressive first release. Perhaps surprisingly, and to the development team’s credit, it has managed one in 2.4.”, he also noted that '
 '“Words in particular is still lacking features”. He concluded that Calligra is “worth keeping an eye on”.\n'
 '\n'
 'On the other hand, Calligra Words became the defau

In [17]:
# Search result article: pretty-print the first 1000 characters of the hit
# at index 1 of the result list (the second-ranked entry).
second_hit = similar_vectors[1]['vector']
pp.pprint(second_hit.data.body[:1000])

('Jan Borukowski of Bielin (1524–1584) was the Bishop\n'
 'of Przemyśl, and was the royal secretary of Poland from 1553. In\n'
 '1569, he signed the act of annexation of Podlaskie, Volhynia and Kyiv to the kingdom during Sejm in Lublin.\n'
 '\n'
 'References\n'
 '\n'
 '1524 births\n'
 '1584 deaths\n'
 'Bishops of Przemyśl')
