# Embedding Tools tests

This notebook tests the embedding providers in the `embedding_tool` module to verify that they work correctly.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from embedding_tool import embedding_tool, batch_embedding_tool

# Notebook-wide plot defaults. The seaborn theme is applied first and the
# default figure size (12x8) is set afterwards, in that order.
sns.set(style="whitegrid")
plt.rc("figure", figsize=(12, 8))

## Test data generation

Sample documents used as test data for the embedding providers.

In [2]:
# Sample documents, similar to our real documents data: each record is a dict
# with a 1-based integer "id" and a text "content" field.
doc_texts = [
    "Machine learning is a field of inquiry devoted to understanding and building methods that 'learn', that improve performance based on experience.",
    "Deep learning is part of a broader family of machine learning methods based on artificial neural networks.",
    "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence.",
    "Computer vision is an interdisciplinary field that deals with how computers can gain high-level understanding from digital images or videos.",
    "Reinforcement learning is an area of machine learning concerned with how intelligent agents ought to take actions in an environment.",
]

# Ids are assigned in list order, starting at 1.
test_docs = [
    {"id": doc_id, "content": text}
    for doc_id, text in enumerate(doc_texts, start=1)
]

## SentenceTransformers Embedding Provider (Local)

In [None]:
# Exercise the "sentence_transformers" provider: embed a single document,
# embed the whole batch, and keep the batch as a NumPy array for later
# analysis. Any failure in this cell is reported by the handler at the bottom
# instead of being raised.
try:
    # The module is not referenced directly in this cell; presumably it is
    # imported as an availability check for the optional dependency (TODO
    # confirm). It lives inside the `try` so that a missing package is
    # reported through the same handler as every other failure.
    import sentence_transformers  # noqa: F401

    model_name = "all-MiniLM-L6-v2"

    # Keyword arguments shared by the single-document and batch calls.
    st_kwargs = {
        "model": model_name,
        "feature_key": "content",
        "provider": "sentence_transformers",
    }

    # Embedding for the first document only.
    embedding = embedding_tool(test_docs[0], **st_kwargs)

    print(f"Single document embedding shape: {len(embedding)}")
    print(f"First few values: {embedding[:5]}")

    # Embeddings for every document in one call.
    batch_embeddings = batch_embedding_tool(test_docs, **st_kwargs)

    print(f"\nBatch embeddings count: {len(batch_embeddings)}")
    print(f"Each embedding shape: {len(batch_embeddings[0])}")

    # Kept as an array for further analysis.
    embeddings_array = np.array(batch_embeddings)
    print(f"Embeddings array shape: {embeddings_array.shape}")

    # Sanity checks, run after all variables above are assigned so they stay
    # available even if a check fails. These assume the batch tool returns one
    # embedding per input document and that the same model yields the same
    # dimension in both calls (TODO confirm against embedding_tool).
    assert len(batch_embeddings) == len(test_docs), (
        f"expected {len(test_docs)} batch embeddings, got {len(batch_embeddings)}"
    )
    assert len(batch_embeddings[0]) == len(embedding), (
        f"batch embedding dimension {len(batch_embeddings[0])} differs from "
        f"single-document dimension {len(embedding)}"
    )
except Exception as e:
    print(f"Error in SentenceTransformers testing: {e}")
   

## OpenAI Embeddings

In [7]:
# Read the OpenAI key from the environment; an unset variable yields ''.
openai_api_key = os.getenv("OPENAI_API_KEY", '')

# The OpenAI checks only run when a non-empty key is available.
if openai_api_key:
    # Keyword arguments common to both embedding requests below.
    openai_kwargs = {
        "api_key": openai_api_key,
        "model": "text-embedding-ada-002",
        "feature_key": "content",
        "provider": "openai",
    }

    # Embed the first document and report its size and leading values.
    openai_embedding = embedding_tool(test_docs[0], **openai_kwargs)

    print(f"OpenAI embedding dimension: {len(openai_embedding)}")
    print(f"First few values: {openai_embedding[:5]}")

    # Embed the second document so the two can be compared.
    openai_embedding2 = embedding_tool(test_docs[1], **openai_kwargs)

    # Cosine similarity of the two embeddings; each is wrapped in a list so
    # it is passed as a single-row input, and [0][0] picks the only entry.
    similarity_matrix = cosine_similarity([openai_embedding], [openai_embedding2])
    similarity = similarity_matrix[0][0]

    print(f"\nSimilarity between first two documents: {similarity:.4f}")
    print(f"\nDoc 1: {test_docs[0]['content']}")
    print(f"\nDoc 2: {test_docs[1]['content']}")