In [None]:
from pathlib import Path
import json
import requests
import redis
from redis.commands.search.field import VectorField
from redis.exceptions import ResponseError
import numpy as np
from redis.commands.search.query import Query

In [None]:
class Embeddings:
    """Fetch OpenAI text embeddings and cache them in a Redis vector index.

    Each text is stored as a Redis hash under the key ``text:<text>`` with two
    fields: ``embedding`` (the raw bytes of a float64 NumPy vector) and
    ``text`` (the original string). A search index named ``index_name`` with
    an HNSW vector field over ``embedding`` is created on construction if it
    does not already exist.

    The instance owns an HTTP session and a Redis client; call :meth:`close`
    (or use the instance as a context manager) to release them.
    """

    def __init__(self, index_name):
        """Load the config, open the HTTP/Redis clients and ensure the index.

        Args:
            index_name: Name of the Redis search index to create or reuse.

        Raises:
            OSError: If ``../dev-config.json`` (relative to the current
                working directory) cannot be opened.
            redis.exceptions.ResponseError: If index creation fails for any
                reason other than the index already existing.
        """
        with open(Path().absolute() / '..' / 'dev-config.json', 'r') as f:
            config = json.load(f)

        self._config = config
        self._index_name = index_name

        self._session = requests.session()
        self._redis = redis.Redis(host='localhost', port=6379, db=0)

        self._init_redis()

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the owned connections; exceptions are never suppressed."""
        self.close()
        return False

    def _init_redis(self):
        """Create the vector search index, tolerating a pre-existing one."""
        field = VectorField(
            name='embedding',
            algorithm='HNSW',
            attributes=dict(
                # Must agree with the dtype produced by _get_embedding.
                type='FLOAT64',
                dim=1536,
                distance_metric='COSINE',
            ),
        )

        try:
            self._redis.ft(self._index_name).create_index(field)
        except ResponseError as e:
            # Re-creating an existing index is expected on every re-run of
            # the notebook; any other server error is a real failure.
            if str(e) != 'Index already exists':
                raise

    def _get_embedding(self, text: str) -> np.ndarray:
        """Request the embedding of ``text`` from the OpenAI embeddings API.

        Args:
            text: The string to embed.

        Returns:
            The embedding as a one-dimensional float64 array.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            AssertionError: If the response does not have the expected shape
                (a ``list`` object holding exactly one ``embedding``).
        """
        headers = {}
        headers['authorization'] = 'Bearer ' + self._config['openai_api_key']

        r = self._session.post(
            'https://api.openai.com/v1/embeddings',
            headers=headers,
            json=dict(
                input=text,
                model='text-embedding-ada-002',
            ),
        )
        r.raise_for_status()

        response = r.json()
        assert response['object'] == 'list', f'{response["object"]} must be list'
        data = response['data']
        assert len(data) == 1
        assert data[0]['object'] == 'embedding', f'{data[0]["object"]} must be embedding'
        embedding = data[0]['embedding']

        # Pin the dtype explicitly: the bytes written to Redis must match the
        # FLOAT64 type declared for the index, regardless of how NumPy would
        # infer the dtype from the JSON payload.
        return np.array(embedding, dtype=np.float64)

    def _save_embedding(self, text: str, embedding: np.ndarray) -> None:
        """Store ``text`` and its serialized embedding in the Redis hash.

        Args:
            text: The original string; also used to build the hash key.
            embedding: The vector to persist; written via ``tobytes()``.
        """
        self._redis.hset(f"text:{text}", mapping=dict(
            embedding=embedding.tobytes(),
            text=text,
        ))

    def add(self, text: str) -> bytes:
        """Return the serialized embedding of ``text``, computing it if needed.

        A previously stored embedding is returned straight from Redis; only
        on a cache miss is the embeddings API called and the result saved.

        Args:
            text: The string to embed.

        Returns:
            The embedding as raw bytes, as stored in the ``embedding`` field.
        """
        found = self._redis.hget(f'text:{text}', 'embedding')
        if found:
            return found

        embedding = self._get_embedding(text)
        self._save_embedding(text, embedding)

        return embedding.tobytes()

    def close(self):
        """Release the HTTP session and the Redis client."""
        try:
            self._session.close()
        finally:
            # Close Redis even if closing the HTTP session failed.
            self._redis.close()

In [None]:
# Embed a handful of multilingual sample phrases and remember each phrase
# together with its serialized embedding for the similarity search below.
e = Embeddings('examplegpt')

texts = [
    'Мультипликация',
    'Man who can fly',
    'Карлсон',
    'Amelie',
    'Girl with a dragon tattoo',
    'Iron man',
    'Тетя петунья из Гарри Поттера',
    'Harry Potter',
    'Töölö',
    'Dreamworks',
    'Disney',
    'Pixar',
    'Та поэма Пушкина, где три девицы прядут под окном',
    'Мультифльм советский, где еще князь в комара превращается',
    'Movie about girl that lives in Paris',
    'How to double a number?',
    'How to make a value twice as big?',
]

# (phrase, embedding bytes) in the same order as `texts`.
pairs = []
for text in texts:
    emb = e.add(text)
    pairs += [(text, emb)]

In [None]:
# For every stored phrase, look up its 5 nearest neighbours in the vector
# index and print them with their distance score.
r = redis.Redis(host='localhost', port=6379, db=0)
q = (
    Query("*=>[KNN 5 @embedding $e]")
    .return_field('text')
    .return_field('__embedding_score')
    .dialect(2)
)

for text, emb in pairs:
    print(text)
    result = r.ft('examplegpt').search(q, query_params={"e": emb})
    for d in result.docs:
        # Skip hits whose score is at most 0.0001 (effectively zero distance).
        if float(d['__embedding_score']) > 0.0001:
            # Pad the "  * <text>" label to 64 columns so scores line up.
            print(f'  * {d["text"]}'.ljust(64) + d['__embedding_score'])