In [5]:
import os
from langchain_openai import ChatOpenAI # type: ignore

# Pull the OpenAI key from the environment rather than hard-coding it.
api_key = os.environ.get("OPENAI_API_KEY")

# Build the chat model client with an explicit model name and temperature.
chat = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
    openai_api_key=api_key,
)
chat


ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x129286510>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x12928e120>, root_client=<openai.OpenAI object at 0x1283a5d30>, root_async_client=<openai.AsyncOpenAI object at 0x129286660>, model_name='gpt-4o-mini', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [6]:
from langchain.docstore.document import Document
import json

# Load the dictionary entries from the JSON file.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('english_dictionary.json', 'r', encoding='utf-8') as f:
    entries = json.load(f)

# Build one Document per dictionary entry.
docs = []
for entry in entries:
    # Copy the five expected fields explicitly; a missing key raises KeyError here.
    content = {
        "word": entry["word"],
        "part_of_speech": entry["part_of_speech"],
        "example": entry["example"],
        "source": entry["source"],
        "translations": entry["translations"]
    }

    # page_content carries the whole entry as pretty-printed JSON text;
    # metadata duplicates the word and its part of speech.
    doc = Document(
        page_content=json.dumps(content, indent=2),
        metadata={
            "word": entry["word"],
            "part_of_speech": entry["part_of_speech"]
        }
    )
    docs.append(doc)

# Sanity check: report the count and show the first three documents.
print(f"Number of documents: {len(docs)}")
for i, doc in enumerate(docs[:3]):  # Show first 3 entries
    print(f"\nDocument {i+1}:")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)

Number of documents: 243

Document 1:
Content: {
  "word": "impel",
  "part_of_speech": "verb",
  "example": "I never read medicine advertisement without being impelled to the conclusion that I am suffering from the particular disease",
  "source": "Three Men in a Boat (to say nothing of the dog)",
  "translations": [
    "to make someone feel that they must do something",
    "to force someone to do something"
  ]
}
Metadata: {'word': 'impel', 'part_of_speech': 'verb'}
--------------------------------------------------

Document 2:
Content: {
  "word": "a touch of",
  "part_of_speech": "phrase",
  "example": "Slight ailment of which I had a touch",
  "source": "Three Men in a Boat (to say nothing of the dog)",
  "translations": [
    "a small amount of (something) : a hint or trace of (something)"
  ]
}
Metadata: {'word': 'a touch of', 'part_of_speech': 'phrase'}
--------------------------------------------------

Document 3:
Content: {
  "word": "fancy",
  "part_of_speech": "verb",
 

In [10]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import json
import numpy as np

def find_similar_translations(query_string, docs, top_k=3, embeddings=None):
    """
    Find dictionary entries whose translations are semantically closest to a query.

    Every translation string of every document is embedded separately and
    compared to the embedded query by cosine similarity. At most one result
    is returned per word: the word's best-scoring translation.

    Args:
        query_string (str): The input string to find similar translations for
        docs (list): Objects with a ``page_content`` attribute holding a JSON
            string with "word", "part_of_speech" and "translations" keys
        top_k (int): Maximum number of results to return; values <= 0 give []
        embeddings: Optional embedding client exposing
            ``embed_documents(list_of_str)`` and ``embed_query(str)``.
            When omitted, a new ``OpenAIEmbeddings()`` is created.

    Returns:
        list: Up to ``top_k`` dicts, best match first, each with the keys
            "word", "part_of_speech", "translation", "similarity" (float)
            and "original_document".
    """
    # Nothing can be returned, so skip parsing and the embedding calls.
    if top_k <= 0:
        return []

    # Flatten the corpus: one record per individual translation string.
    translation_docs = []
    for doc in docs:
        content = json.loads(doc.page_content)
        for translation in content["translations"]:
            translation_docs.append({
                "content": translation,
                "original_doc": doc,
                "word": content["word"],
                "part_of_speech": content["part_of_speech"],
            })

    # An empty corpus has no matches; avoid calling the embedding client at all.
    if not translation_docs:
        return []

    if embeddings is None:
        embeddings = OpenAIEmbeddings()

    # Embed all translations in one batch, then the query.
    texts = [record["content"] for record in translation_docs]
    translation_matrix = np.asarray(embeddings.embed_documents(texts), dtype=float)
    query_vector = np.asarray(embeddings.embed_query(query_string), dtype=float)

    # Cosine similarity of the query against every translation at once.
    dot_products = translation_matrix @ query_vector
    norm_products = np.linalg.norm(translation_matrix, axis=1) * np.linalg.norm(query_vector)
    # A zero-length vector has no direction; score it 0.0 instead of NaN so
    # the ranking below stays well defined.
    scores = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    # Highest similarity first; ties go to the later translation index.
    ranking = sorted(
        range(len(translation_docs)),
        key=lambda i: (scores[i], i),
        reverse=True,
    )

    results = []
    seen_words = set()  # each word is reported once, with its best translation
    for idx in ranking:
        record = translation_docs[idx]
        word = record["word"]
        if word in seen_words:
            continue
        seen_words.add(word)
        results.append({
            "word": word,
            "part_of_speech": record["part_of_speech"],
            "translation": record["content"],
            "similarity": float(scores[idx]),
            "original_document": record["original_doc"],
        })
        if len(results) >= top_k:
            break

    return results

# Example usage
def display_similar_words(query, docs, top_k=3):
    """Print the closest dictionary matches for a query as a numbered list."""
    matches = find_similar_translations(query, docs, top_k)

    print(f"Finding words with translations similar to: '{query}'\n")
    for rank, match in enumerate(matches, start=1):
        print(f"{rank}. {match['word']} [{match['part_of_speech']}]")
        print(f"   Translation: {match['translation']}")
        print(f"   Similarity score: {match['similarity']:.4f}")

        # The usage example is stored in the original document's JSON payload;
        # it is printed only when present and non-empty.
        payload = json.loads(match['original_document'].page_content)
        if payload.get("example"):
            print(f"   Example: \"{payload['example']}\"")
        print()

In [8]:
# Describe a meaning in plain words and list the five closest dictionary entries.
query = "to believe something without evidence"
display_similar_words(query=query, docs=docs, top_k=5)

Finding words with translations similar to: 'to believe something without evidence'

1. allege (verb)
   Translation: to say that someone has done something wrong or illegal, but not prove it
   Similarity score: 0.8521
   Example: "Schneider alone was alleged to have defrauded the country to the tune of 400k $ for armaments"

2. fancy (verb)
   Translation: to imagine or think that something is so (Hay fever, I fancy it was)
   Similarity score: 0.8521
   Example: "Hay fever, I fancy it was"

3. conceptualize (verb)
   Translation: to form an idea or principle in your mind
   Similarity score: 0.8412
   Example: "Other changes have consequences so devastating that, once discover, we struggle even to  conceptualize them"

4. profess (verb)
   Translation: to state or admit, often in a public or formal way, that you have a particular belief or feeling
   Similarity score: 0.8396
   Example: "Jews professed aversion for Greek civilization at the very moment they were considering the Caba

In [12]:
# Show the five entries whose translations are closest to this phrase.
display_similar_words(
    query="becoming weaker",
    docs=docs,
    top_k=5,
)

Finding words with translations similar to: 'becoming weaker'

1. senescence [noun]
   Translation: the fact of becoming older, and therefore being in less good condition and less able to function well
   Similarity score: 0.8854
   Example: "We have done theoretical work on the evolution of trade-offs, senescence and morality"

2. subside [verb]
   Translation: to become less strong or loud
   Similarity score: 0.8809
   Example: "This subsided but interest was again revived"

3. frail [adjective]
   Translation: weak and delicate
   Similarity score: 0.8616

4. feeble [adj]
   Translation: weak and without energy, strength, or power
   Similarity score: 0.8589
   Example: ""Get me out of this" was the feeble reply"

5. impair [verb]
   Translation: to spoil something or make it weaker so that it is less effective
   Similarity score: 0.8474
   Example: "Why do we choose to impair and disrupt our own cognition ?"



In [13]:
def display_similar_words(query, docs, top_k=3, part_of_speech=None):
    """Print the closest dictionary matches for a query, optionally filtered.

    Args:
        query (str): The query string to find similar translations for
        docs (list): List of Document objects containing dictionary entries
        top_k (int): Number of top results to print; values <= 0 print none
        part_of_speech (str or list, optional): Keep only results whose part
            of speech equals this string or is contained in this list.
            ``None`` or an empty list means no filtering.
    """
    # Accept a single string as shorthand for a one-element list.
    if isinstance(part_of_speech, str):
        part_of_speech = [part_of_speech]

    # Guard against negative values, which would otherwise slice from the end.
    limit = max(top_k, 0)

    if part_of_speech:
        # Request every distinct word (there is at most one result per doc),
        # so filtering cannot run out of candidates while matching entries
        # still exist further down the ranking.
        candidates = find_similar_translations(query, docs, top_k=max(len(docs), limit))
        filtered_results = [
            r for r in candidates if r['part_of_speech'] in part_of_speech
        ][:limit]
    else:
        # No filter: ask only for what will be shown. The slice also keeps
        # the output within the limit if the ranking helper returns extra rows.
        filtered_results = find_similar_translations(query, docs, top_k=limit)[:limit]

    print(f"Finding words with translations similar to: '{query}'")
    if part_of_speech:
        print(f"Filtered by part of speech: {', '.join(part_of_speech)}")
    print()

    for i, result in enumerate(filtered_results):
        print(f"{i+1}. {result['word']} [{result['part_of_speech']}]")
        print(f"   Translation: {result['translation']}")
        print(f"   Similarity score: {result['similarity']:.4f}")

        # The usage example is stored in the original document's JSON payload;
        # it is printed only when present and non-empty.
        content = json.loads(result['original_document'].page_content)
        if "example" in content and content["example"]:
            print(f"   Example: \"{content['example']}\"")
        print()

In [14]:
# Same lookup as before, without a part-of-speech filter.
display_similar_words(
    query="becoming weaker",
    docs=docs,
    top_k=5,
)

Finding words with translations similar to: 'becoming weaker'

1. senescence [noun]
   Translation: the fact of becoming older, and therefore being in less good condition and less able to function well
   Similarity score: 0.8854
   Example: "We have done theoretical work on the evolution of trade-offs, senescence and morality"

2. subside [verb]
   Translation: to become less strong or loud
   Similarity score: 0.8810
   Example: "This subsided but interest was again revived"

3. frail [adjective]
   Translation: weak and delicate
   Similarity score: 0.8619

4. feeble [adj]
   Translation: weak and without energy, strength, or power
   Similarity score: 0.8589
   Example: ""Get me out of this" was the feeble reply"

5. impair [verb]
   Translation: to spoil something or make it weaker so that it is less effective
   Similarity score: 0.8477
   Example: "Why do we choose to impair and disrupt our own cognition ?"



In [16]:
# Restrict the lookup to verbs and nouns.
display_similar_words(
    query="becoming weaker",
    docs=docs,
    top_k=5,
    part_of_speech=["verb", "noun"],
)

Finding words with translations similar to: 'becoming weaker'
Filtered by part of speech: verb, noun

1. senescence [noun]
   Translation: the fact of becoming older, and therefore being in less good condition and less able to function well
   Similarity score: 0.8854
   Example: "We have done theoretical work on the evolution of trade-offs, senescence and morality"

2. subside [verb]
   Translation: to become less strong or loud
   Similarity score: 0.8810
   Example: "This subsided but interest was again revived"

3. impair [verb]
   Translation: to spoil something or make it weaker so that it is less effective
   Similarity score: 0.8474
   Example: "Why do we choose to impair and disrupt our own cognition ?"

4. vice [noun]
   Translation: a weakness in someone's character
   Similarity score: 0.8314
   Example: "Paris with its hovels and its brothels, with its misery and its vice"

5. diffidence [noun]
   Translation: lack of self-confidence
   Similarity score: 0.8119
   Example: