app outline
1. random word from Latin words
2. all translations
3. all translations for that word
4. input some text
5. get 5 most relevant (confidence greater than X)
6. confirm by number or reject (r)
7. skip or try again
8. ask for clue (example if present)
9. save score

prototype outline
1. get translations
2. vectorize them
3. input text
4. find most relevant synonyms

In [1]:
import sys
from pathlib import Path
from sqlalchemy.orm import sessionmaker
from environment.setup import engine

# Add the parent directory to sys.path to import from database
sys.path.append(str(Path.cwd().parent))

from database.db_classes import Translations

# Session factory bound to the `engine` imported from environment.setup;
# calling Session() opens a new database session.
# Alternative (kept for reference): build the engine locally instead of importing it.
# engine = create_engine('sqlite:///path/to/your/database.db')  # Replace with your actual database URL
Session = sessionmaker(bind=engine)

def get_all_translations():
    """
    Load every ``Translations`` row from the database.

    A fresh session is opened for the query and is always closed before
    returning, whether or not the query succeeded.

    Returns:
        list: All ``Translations`` records, or an empty list if the query
        raised (the error is printed rather than propagated).
    """
    db_session = Session()
    try:
        return db_session.query(Translations).all()
    except Exception as error:
        # Best-effort: report the failure and let the caller continue with no data.
        print(f"Error retrieving translations: {error}")
        return []
    finally:
        db_session.close()

translations = get_all_translations()
print(f"Retrieved {len(translations)} translations from the database")

# Preview up to five entries so the loaded data can be checked by eye
if translations:
    print("\nSample translations:")
    print(*(row.translation for row in translations[:5]), sep="\n")


Retrieved 982 translations from the database

Sample translations:
karać
winnica
patrzeć, widzieć
bardzo
chwalić


In [2]:
# from langchain.schema import Document
from langchain.docstore.document import Document

# Wrap each translation string in a Document (one Document per record)
translation_docs = [
    Document(page_content=row.translation)
    for row in translations
]

print(f"\nCreated {len(translation_docs)} Documents")
# Preview the first three Documents, if any were created
if translation_docs:
    print("\nSample Documents:")
    print(*(f"Content: {item.page_content}" for item in translation_docs[:3]), sep="\n")


Created 982 Documents

Sample Documents:
Content: karać
Content: winnica
Content: patrzeć, widzieć


In [3]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client, reused later for embedding queries
embeddings = OpenAIEmbeddings()

# Embed every document's text once, in the same order as translation_docs
doc_texts = [doc.page_content for doc in translation_docs]
doc_embeddings = embeddings.embed_documents(doc_texts)

  embeddings = OpenAIEmbeddings()


In [4]:
import numpy as np


def get_relevant_documents(
    query: str,
    n: int,
    documents: list,
    open_AI_embeddings: "OpenAIEmbeddings",
    doc_embeddings: list,
) -> list:
    """
    Find the N documents most similar to a query, ranked by cosine similarity.

    The query is embedded with ``open_AI_embeddings.embed_query`` and compared
    against the precomputed ``doc_embeddings``.

    Args:
        query (str): The input text to find similar documents for.
        n (int): Maximum number of documents to return. Values <= 0 yield an
            empty list.
        documents (list): Documents to choose from; ``documents[i]`` must
            correspond to ``doc_embeddings[i]``.
        open_AI_embeddings: Embedding client; only its ``embed_query(text)``
            method is used. The annotation is quoted so that defining this
            function does not require the class to be imported already.
        doc_embeddings (list): One embedding vector per document, all of the
            same length as the query embedding.

    Returns:
        list: Up to ``n`` entries of ``documents``, most similar first. Ties
        in similarity are broken by higher index first.
    """
    # Nothing requested or nothing to rank: skip the embedding call entirely.
    if n <= 0 or len(doc_embeddings) == 0:
        return []

    query_vector = np.asarray(open_AI_embeddings.embed_query(query), dtype=float)
    doc_matrix = np.asarray(doc_embeddings, dtype=float)

    # Cosine similarity for all documents at once; the query norm is computed once.
    dot_products = doc_matrix @ query_vector
    norm_products = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vector)

    # A zero-length vector has no direction; score it 0.0 instead of dividing
    # by zero and letting NaN corrupt the ranking.
    similarities = np.zeros_like(dot_products)
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)

    # Rank by (similarity, index) descending.
    ranked_indices = sorted(
        range(len(similarities)),
        key=lambda idx: (similarities[idx], idx),
        reverse=True,
    )
    return [documents[idx] for idx in ranked_indices[:n]]



In [5]:
def similar_translations(query: str, n: int = 3) -> list[str]:
    """
    Return the text of the ``n`` stored translations most similar to ``query``.

    Uses the notebook-level ``translation_docs``, ``embeddings`` and
    ``doc_embeddings`` and delegates the ranking to ``get_relevant_documents``.
    """
    matches = get_relevant_documents(
        query=query,
        n=n,
        documents=translation_docs,
        open_AI_embeddings=embeddings,
        doc_embeddings=doc_embeddings,
    )
    texts = [match.page_content for match in matches]
    return texts

In [6]:
# Demo: two-word query using the default n=3
similar_translations('hałas zgiełk')

['krzyk, zgiełk, hałas (ludzki)', 'zamieszanie, wrzawa, zgiełk', 'hańba']

In [8]:
# Demo: single-word query asking for the 5 closest translations
similar_translations('widowisko', 5)


['widowisko publiczne, igrzyska', 'wojsko', 'wieża', 'świątynia', 'wieniec']