In [None]:
pip install llama-index-embeddings-huggingface

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Create the Hugging Face embedding model used by every cell below
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)


In [None]:
# Embed one example sentence to see what a single embedding looks like
single_embedding = embed_model.get_text_embedding(
    "Embeddings represent text as numerical vectors in AI systems."
)


In [None]:
# Print the full embedding returned for the sample sentence.
print(single_embedding)

In [None]:
# Report how many values the sample embedding contains.
print(f"Single embedding length: {len(single_embedding)}")

In [None]:
# Step 2: Define the statements
# The trailing comment on each entry records the topical group the statement is
# meant to belong to and its 1-based number; the plots further down label the
# statements with the same "Stmt N" numbering.
#   Set 1     - cats / animals
#   Set 2     - artificial intelligence
#   Set 3     - investing and stock markets
#   Unrelated - statements that fit none of the sets
statements = [
    "The cat is on the mat.",                     # Set 1 - Stmt 1
    "The sun rises in the east.",                 # Unrelated - Stmt 2
    "The feline rests on the carpet.",            # Set 1 - Stmt 3
    "Artificial Intelligence is fascinating.",    # Set 2 - Stmt 4
    "Machine learning drives AI advancements.",   # Set 2 - Stmt 5
    "Birds fly in the sky.",                      # Unrelated - Stmt 6
    "Deep learning is a subset of AI.",           # Set 2 - Stmt 7
    "Cat and Cow are domestic animals",           # Set 1 - Stmt 8
    "Equity, mutual funds and stocks are various options to invest",  # Set 3 - Stmt 9
    "Gold can be used as hedge towards the investment",  # Set 3 - Stmt 10
    "Nifty and Sensex are major index in India",  # Set 3 - Stmt 11
    # Nasdaq, Dow and S&P are US indices; the original text said "in India",
    # apparently copied from Stmt 11.
    "Nasdaq, Dow and S&P 100 are major index in the US",  # Set 3 - Stmt 12
]

In [None]:
# Step 3: Generate embeddings
# Embed all statements through the batch API instead of issuing one model call
# per statement. The result is a list with one embedding per statement, in the
# same order as `statements`, so the index-based "Stmt N" labels below still line up.
embeddings = embed_model.get_text_embedding_batch(statements)

In [None]:
# Summarise every embedding: its length plus a peek at its first five values
for position, vector in enumerate(embeddings, start=1):
    preview = vector[:5]
    print(f"Embedding {position}: Length = {len(vector)}, First 5 dimensions = {preview}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 4: Calculate cosine similarity
# Only the embeddings are passed, so every statement is compared with every
# other statement (and itself); the result is drawn below as a square
# statements-by-statements heatmap.
similarity_matrix = cosine_similarity(embeddings)

In [None]:
# Step 5: Visualize the similarity matrix
# Both axes index the same statements, so build the tick positions and their
# "Stmt N" labels once and reuse them.
tick_positions = range(len(statements))
tick_labels = [f"Stmt {n}" for n in range(1, len(statements) + 1)]

plt.figure(figsize=(10, 8))
plt.imshow(similarity_matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar(label='Cosine Similarity')
plt.xticks(tick_positions, tick_labels, rotation=45)
plt.yticks(tick_positions, tick_labels)
plt.title("Cosine Similarity Between Statements")
plt.show()

In [None]:

# Step 6: Analyze relationships in 2D using PCA
from sklearn.decomposition import PCA

# Keep two components so each statement can be drawn as an (x, y) point.
pca = PCA(n_components=2)
# Fit the PCA on the statement embeddings and project them in the same call.
reduced_embeddings = pca.fit_transform(embeddings)


In [None]:

# Draw every statement at its 2D PCA coordinates, tagged with its statement number
plt.figure(figsize=(8, 6))
for number, point in enumerate(reduced_embeddings, start=1):
    px, py = point
    tag = f"Stmt {number}"
    plt.scatter(px, py, label=tag)
    # Shift the text slightly to the right so it does not sit on top of the marker
    plt.text(px + 0.02, py, tag, fontsize=9)

plt.title("Statement Relationships (PCA Reduced)")
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.legend()
plt.grid()
plt.show()