In [9]:
from src.utils import generate_csv_data
from src.shared import singleton_engine
from src.models import WordsEn
from src.models.data.get_word_en import GetWordEn
from sqlalchemy import text
from pathlib import Path
from dotenv import load_dotenv
from pprint import pprint

# load_dotenv expects the path of the .env *file*. Pointing it at the working
# directory itself loads nothing (the call returns False), so target the file.
load_dotenv(dotenv_path=Path.cwd() / ".env", override=True)

False

In [2]:
# Downloading is extremely slow inside the notebook (cause unclear - possibly my setup,
# possibly the kernel), so don't call the CSV generation util with preloaded_data=False here.
generated_csv_path = generate_csv_data()

In [3]:
# One-off bootstrap: enables the pgvector extension, creates the words_en table,
# bulk-loads the generated .csv file and builds the HNSW index.
# Deliberately not re-runnable: CREATE TABLE fails when words_en already exists,
# which also prevents the CSV from being loaded a second time.
with singleton_engine.connect() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

    # One statement per execute() so the cell does not depend on the driver
    # accepting multi-statement strings.
    conn.execute(text("""
        CREATE TABLE words_en (
            id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
            word varchar(20) NOT NULL,
            embedding vector(300)
        )
    """))

    # COPY ... FROM '<file>' takes the path as a SQL string literal, so double any
    # single quote in it to keep the statement well-formed.
    csv_path_literal = str(generated_csv_path).replace("'", "''")
    conn.execute(text(
        f"COPY words_en (word, embedding) FROM '{csv_path_literal}' DELIMITER ',' CSV"
    ))

    # Build the HNSW index only after the bulk load: indexing the already-populated
    # table is faster than maintaining the index for every copied row.
    conn.execute(text('CREATE INDEX ON words_en USING hnsw (embedding vector_cosine_ops)'))

    # Everything above runs in one transaction; nothing is persisted unless we get here.
    conn.commit()

In [None]:
# Test query to verify everything works smoothly
def test_cosine_similarity(word: str, limit: int) -> list[GetWordEn]:
    """Return up to `limit` words from words_en most similar to `word`.

    Similarity is ``1 - (embedding <=> reference)``, i.e. one minus the pgvector
    cosine distance, so higher means closer. The value is stored in the
    ``distance`` field of GetWordEn to keep that model's interface unchanged.

    Rows are ordered by the raw ``<=>`` distance (ascending) rather than by the
    derived similarity: that is the form pgvector can serve from the HNSW index,
    and it yields the same most-similar-first order. When the index is used the
    result is an approximate nearest-neighbour search.

    Args:
        word: Word whose stored embedding is used as the reference vector.
        limit: Maximum number of rows to fetch.

    Returns:
        Matches ordered from most to least similar. Empty when `word` has no
        stored embedding, because every similarity is then NULL.
    """
    query = text("""
        SELECT word,
               1 - (embedding <=> (SELECT embedding FROM words_en WHERE word = :word)) AS distance
        FROM words_en
        ORDER BY embedding <=> (SELECT embedding FROM words_en WHERE word = :word)
        LIMIT :limit
    """)
    with singleton_engine.connect() as conn:
        rows = conn.execute(query, {"word": word, "limit": limit}).all()

    # A NULL similarity means there was no reference embedding to compare against;
    # skip such rows instead of returning entries with distance=None.
    return [
        GetWordEn(word=found_word, distance=similarity)
        for found_word, similarity in rows
        if similarity is not None
    ]

# Smoke check: pretty-print the 10 best matches for 'woman'.
pprint(test_cosine_similarity(word='woman', limit=10))

[GetWordEn(word='woman', distance=1.0),
 GetWordEn(word='man', distance=0.7664013747067495),
 GetWordEn(word='girl', distance=0.7494641774911672),
 GetWordEn(word='teenager', distance=0.631708604958935),
 GetWordEn(word='lady', distance=0.6288786571651082),
 GetWordEn(word='mother', distance=0.6076306670085915),
 GetWordEn(word='boy', distance=0.5975908768400086),
 GetWordEn(word='she', distance=0.5641393359770321),
 GetWordEn(word='person', distance=0.5470173400209332),
 GetWordEn(word='housewife', distance=0.5463822760355982)]
