In [17]:
import pandas as pd
from tqdm import tqdm
import pprint

from embedding import OneHotEmbedder, Doc2VecEmbedder
from vectorstore import SimpleVectorDatabase
from models import TextDoc, Vector

In [2]:
# Dimensionality shared by the embedding algorithm and the vector database.
# Larger values give more accurate results, at the price of more expensive
# computations. For reference, OpenAI's embedding models use 1536 dimensions.
VECTOR_DIM: int = 500

In [28]:
# Load the Wikipedia dataset (the data is also covered in the `data_load` notebook) and,
# to keep computation time down, keep only the first VECTOR_DIM rows.
# NOTE(review): the row limit reuses the embedding-dimension constant — confirm this
# coupling is intentional before changing VECTOR_DIM.
dataset = pd.read_csv("data_local/wiki_subset_small.csv").iloc[:VECTOR_DIM]

# Preview the first rows.
dataset.head()

Unnamed: 0,id,url,title,text
0,3255,https://en.wikipedia.org/wiki/Apostles%27%20Creed,Apostles' Creed,The Apostles' Creed (Latin: Symbolum Apostolor...
1,27487,https://en.wikipedia.org/wiki/Split%20screen%2...,Split screen (video production),"In film and video production, split screen is ..."
2,37686,https://en.wikipedia.org/wiki/Parasports,Parasports,Parasports are sports played by people with a ...
3,18739,https://en.wikipedia.org/wiki/Laika,Laika,Laika (; c. 1954 – 3 November 1957) was a Sovi...
4,39095,https://en.wikipedia.org/wiki/Extension%20%28s...,Extension (semantics),§\nIn any of several fields of study that trea...


In [4]:
# Embedding model selection: OneHot and Doc2Vec are currently available.
# Alternative: embedder = Doc2VecEmbedder(vector_dim=VECTOR_DIM)
embedder = OneHotEmbedder(vector_dim=VECTOR_DIM)

# Fit the embedder on the article texts; this can take a few minutes.
embedder.fit(dataset["text"].tolist())

2023-12-04 12:43:43,019::onehotembedder.py[_fit()]::INFO::Fitting embedder. Preprocessing documents...
100%|██████████| 500/500 [00:10<00:00, 49.62it/s]
2023-12-04 12:43:53,219::onehotembedder.py[_fit()]::INFO::Corpus created with 88538 words.
2023-12-04 12:43:53,219::onehotembedder.py[_fit()]::INFO::Reducing vector dimensions: fitting PCA (n=500)


In [5]:
# Create the vector database instance, using the same dimension as the embedder.
vecdb = SimpleVectorDatabase(vector_dim=VECTOR_DIM)

# Embed every article and store it in the database together with its source metadata.
# itertuples() avoids materialising a Series per row (as positional `iloc` lookups do);
# `total` is passed explicitly because the row iterator has no len() for tqdm to read.
for row in tqdm(dataset.itertuples(index=False), total=len(dataset)):
    doc = TextDoc(row.text)
    vec = Vector(
        embedding=embedder.embed(doc),
        data=doc,  # `doc` is already a TextDoc; wrapping it a second time is redundant
        metadata={"type": "wikipedia", "src": row.url, "title": row.title},
    )
    vecdb.upsert(vec)

100%|██████████| 500/500 [00:41<00:00, 11.95it/s]


In [6]:
# Take the last row of the dataset and use it to query the database for similar vectors.
row = dataset.iloc[-1]
new_doc = TextDoc(row.text)
print(new_doc)

# Build the vector model used by our database for the query document.
new_vec = Vector(
    embedding=embedder.embed(new_doc),
    data=new_doc,  # already a TextDoc; re-wrapping it in TextDoc(...) is redundant
    metadata={"type": "wikipedia", "src": row.url, "title": row.title},
)
print(new_vec)

TextDoc(2494): A chemical patent, pharmaceutical patent or drug patent ...
<Vector 1d41aec3-816c-4c68-b3e1-4a85e60eb4c9 (500) : TextDoc(2494): A chemical patent, pharmaceutical patent or drug patent ...>


In [36]:
# Look up the nearest neighbours of the query vector. `measure` selects the similarity
# measure (currently implemented: cosine, euclidean and dot) and `k` the number of
# similar vectors to return.
similar_vectors = vecdb.sim_search(
    new_vec,
    measure="dot",
    k=5,
)
similar_vectors

[{'vector': <Vector 45eb0314-66cb-4a79-b499-1ea509f01f69 (500) : TextDoc(2494): A chemical patent, pharmaceutical patent or drug patent ...>,
  'score': 1413.6689079995763},
 {'vector': <Vector c58b0eb7-ffe7-45a5-9bb9-3b9cca82a7a7 (500) : TextDoc(18734): A chemical formula is a way of  presenting information  ...>,
  'score': 916.3589079998607},
 {'vector': <Vector 9733ec31-1ed5-4264-bee8-9e059b07178c (500) : TextDoc(263): Risa may refer to:
  
   Risa (given name), a feminine give ...>,
  'score': 837.5069079995632},
 {'vector': <Vector b320f317-5354-4902-9ac2-268138919e41 (500) : TextDoc(85): The 760s decade ran from January 1, 760, to December 31 ...>,
  'score': 837.3649079995587},
 {'vector': <Vector 2a3bcf28-4e60-4448-994a-9eb8dab7deac (500) : TextDoc(281): Jan Borukowski of Bielin (1524–1584) was the Bishop
  of  ...>,
  'score': 836.3569079995575}]

In [37]:
# Print the query article first, then one line per search hit (title and document).
print(f"Vector:       [title: {new_vec.metadata['title']}] {new_vec.data}")
for hit in similar_vectors:
    match = hit["vector"]
    print(f"Most similar: [title: {match.metadata['title']}] {match.data}")

Vector:       [title: Chemical patent] TextDoc(2494): A chemical patent, pharmaceutical patent or drug patent ...
Most similar: [title: Chemical patent] TextDoc(2494): A chemical patent, pharmaceutical patent or drug patent ...
Most similar: [title: Chemical formula] TextDoc(18734): A chemical formula is a way of  presenting information  ...
Most similar: [title: Risa] TextDoc(263): Risa may refer to:

 Risa (given name), a feminine give ...
Most similar: [title: 760s] TextDoc(85): The 760s decade ran from January 1, 760, to December 31 ...
Most similar: [title: Jan Borukowski] TextDoc(281): Jan Borukowski of Bielin (1524–1584) was the Bishop
of  ...


In [23]:
# Query article: pretty-print the first 1000 characters of its body.
# `pp` is kept as a notebook-level name because the following cell uses it as well.
pp = pprint.PrettyPrinter(compact=True, width=150)
pp.pprint(new_vec.data.body[0:1000])

('A chemical patent, pharmaceutical patent or drug patent is a patent for an invention in the chemical or pharmaceuticals industry. Strictly '
 'speaking, in most jurisdictions, there are essentially no differences between the legal requirements to obtain a patent for an invention in the '
 'chemical or pharmaceutical fields, in comparison to obtaining a patent in the other fields, such as in the mechanical field. A chemical patent or '
 'a pharmaceutical patent is therefore not a sui generis right, i.e. a special legal type of patent.\n'
 '\n'
 'In the pharmaceutical industry, the patent protection of drugs and medicines is accorded a particular importance, because drugs and medicines can '
 'easily be copied or imitated (by analyzing a pharmaceutical substance) and because of the significant research and development spending and the '
 'high risks associated with the development of a new drug.\n'
 '\n'
 'Chemical patents are different from other sources of technical information becau

In [25]:
# Search result article: pretty-print the first 1000 characters of the body of the
# second entry in `similar_vectors` (index 1).
second_hit = similar_vectors[1]["vector"]
pp.pprint(second_hit.data.body[:1000])

('A chemical formula is a way of  presenting information about the chemical proportions of atoms that constitute a particular chemical compound or '
 'molecule, using chemical element symbols, numbers, and sometimes also other symbols, such as parentheses, dashes, brackets, commas and plus (+) '
 'and minus (−) signs. These are limited to a single typographic line of symbols, which may include subscripts and superscripts. A chemical formula '
 'is not a chemical name, and it contains no words. Although a chemical formula may imply certain simple chemical structures, it is not the same as '
 'a full chemical structural formula. Chemical formulae can fully specify the structure of only the simplest of molecules and chemical substances, '
 'and are generally more limited in power than chemical names and structural formulae.\n'
 '\n'
 'The simplest types of chemical formulae are called empirical formulae, which use letters and numbers indicating the numerical proportions of atoms '
 'of ea