In [1]:
#requirements.txt
#https://github.com/alexklibisz/elastiknn/blob/main/examples/tutorial-notebooks/multimodal-search-amazon-products.ipynb
#https://towardsdatascience.com/computing-node-embedding-with-a-graph-database-neo4j-its-graph-data-science-library-d45db83e54b6

In [None]:


%load_ext autoreload
%autoreload 2
%matplotlib inline
from vectordocutil import *
from itertools import islice
from tqdm import tqdm
from pprint import pprint, pformat
from IPython.display import Image, display, Markdown, Code, HTML
import matplotlib.pyplot as plt
import numpy as np
import json

from pymilvus import CollectionSchema, FieldSchema, DataType

import random
from sentence_transformers import SentenceTransformer
from faker import Faker

In [None]:
# Sentence-embedding model; its `encode` method is used further down to
# vectorise text.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Seed Faker's shared random generator so the synthetic text is identical on
# every Restart & Run All.
Faker.seed(42)
fake = Faker(['en_US'])

# Last expression of the cell: displays one sample of generated text.
fake.text()

In [None]:
" ".join(string)
# Encode the space-joined text with the sentence-transformer model.
# NOTE(review): `string` is not assigned in any cell visible here - presumably
# the Faker-generated text or a name exported by `vectordocutil`; confirm.
embeddingsTXT = model.encode(" ".join(string), show_progress_bar=True)
embeddingsTXT = np.array(embeddingsTXT).astype("float32")

# Double-precision copy of the same vector(s). It is derived from
# `embeddingsTXT` so the cell runs on a fresh kernel instead of reading an
# `embeddings` variable that has not been assigned yet.
embeddings = embeddingsTXT.astype("float64")

In [None]:
# Two parallel columns: the integer ids 0..767 and, for each id, a
# single-element list wrapping the matching entry of `embeddings`.
mdata = [
    list(range(768)),
    [[embeddings[idx]] for idx in range(768)],
]

## Connect to Elasticsearch

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Client pointed at the Elasticsearch instance on localhost:9200.
es = Elasticsearch(hosts=["http://localhost:9200"])

# Ask for cluster health, waiting for "yellow" status; the request itself is
# given a 1-second timeout. The response is displayed as the cell output.
es.cluster.health(request_timeout=1, wait_for_status="yellow")

## Creating synthetic dataset

In [None]:
# Target dimensionality, also used by the index mapping further down.
vector_dims = 256

# `iter_vectors_reduced` hands back a callable that is invoked with the
# vectors file name and iterated for (asin, vector) pairs.
# NOTE(review): `fname_vectors` is not assigned in the visible cells - confirm
# where it comes from.
reduced = iter_vectors_reduced(fname_vectors, dims=vector_dims, samples=10000)

# Peek at the first three pairs: id, vector length, leading components.
for (asin, vec) in islice(reduced(fname_vectors), 3):
  print(asin, len(vec), vec[:3])

# Stack up to 20000 vectors into one array and plot the distribution of all
# component values on a log-scaled histogram.
sample = np.array(
  [vector for (_asin, vector) in islice(reduced(fname_vectors), 20000)]
)
plt.title("Shape: %s, mean: %.3f" % (sample.shape, sample.mean()))
plt.hist(sample.ravel(), bins=40, log=True)
plt.show()

In [None]:
# Name of the target index. Elasticsearch only accepts lowercase index names,
# so the name must not contain capital letters.
index = 'fakedocs'

# Non-vector document fields; these match the keys emitted by `docs()`.
source_no_vecs = ['title', 'abstract']

def docs():
  """Yield one bulk "index" action per item of `iter_products(fname_products)`.

  Each action targets the module-level `index`, uses the item's "asin" value
  as the document id, and carries its "title" and "abstract" values (None
  when the item has no such key). Iteration is wrapped in tqdm for progress
  reporting.
  """
  for product in tqdm(iter_products(fname_products)):
    action = {"_op_type": "index", "_index": index, "_id": product["asin"]}
    action["title"] = product.get("title", None)
    action["abstract"] = product.get("abstract", None)
    yield action

## Create the Elasticsearch Index

In [None]:
# Send every action produced by docs() through the bulk helper, 2000 actions
# per chunk, with max_retries set to 2.
# NOTE(review): this cell sits before the cell that creates the index with
# its settings and mapping; on a fresh kernel the insert runs first - confirm
# the intended order.
bulk(
  es,
  docs(),
  chunk_size=2000,
  max_retries=2,
)

In [None]:


# Index-level settings: elastiknn flag switched on, one shard, no replicas.
settings = {
  "settings": {
    "elastiknn": True,
    "number_of_shards": 1,
    "number_of_replicas": 0
  }
}

# Explicit mapping with dynamic mapping turned off, so only the fields listed
# here are mapped.
# NOTE(review): docs() emits an "abstract" field that is not declared here
# (the mapping has "description" instead) - confirm which name is intended.
mapping = {
  "dynamic": False,
  "properties": {
    "asin": { "type": "keyword" },
    # Vector field handled by the elastiknn plugin (LSH model, angular
    # similarity).
    "imVecElastiknn": {
      "type": "elastiknn_dense_float_vector",
      "elastiknn": {
        "dims": vector_dims,
        "model": "lsh",
        "similarity": "angular",
        "L": 60,
        "k": 3
      }
    },
    # Same dimensionality, stored as a plain dense_vector field.
    "imVecXpack": {
      "type": "dense_vector",
      "dims": vector_dims
    },
    "title": { "type": "text" },
    "description": { "type": "text" },
    "price": { "type": "float" },
    "imUrl": { "type": "text" }
  }
}

# Create the index and apply the mapping only when the index is missing, so
# re-running the cell does not fail on an existing index. Arguments are passed
# by keyword: this works with 7.x clients and with newer clients whose API
# methods no longer accept positional arguments.
if not es.indices.exists(index=index):
  es.indices.create(index=index, body=settings)
  es.indices.put_mapping(index=index, body=mapping)

# Last expression of the cell: displays the mapping currently in effect.
es.indices.get_mapping(index=index)