In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
import os
import time

In [None]:
# Read the Gemini API key from the environment so the secret is never stored
# in (or committed with) the notebook. Fail fast with a clear message when it
# is missing instead of letting the first API request fail later.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )
genai.configure(api_key=api_key)

In [None]:
# Load the dataset to be indexed. Replace the placeholder below with the path
# to your CSV; later cells read its "Text" and "Label" columns.
dataset_path = r"ATTACH_YOUR_DATASET_PATH"
df = pd.read_csv(dataset_path)

In [26]:
def get_embedding(text, task_type="retrieval_document", max_retries=5, retry_delay=2):
    """Embed `text` with the Gemini embedding model, retrying on failure.

    Args:
        text: Content passed to `genai.embed_content`.
        task_type: Task type forwarded to the embedding call. Defaults to
            "retrieval_document", the value this function always used.
        max_retries: Maximum number of attempts before giving up (>= 1).
        retry_delay: Seconds to wait after the first failure; the wait doubles
            after each further failed attempt.

    Returns:
        The value stored under the "embedding" key of the API response
        (presumably a list of floats -- confirm against the SDK docs).

    Raises:
        ValueError: If `max_retries` is less than 1.
        RuntimeError: If every attempt fails; the last underlying error is
            chained as `__cause__`.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = genai.embed_content(
                model="models/embedding-001",
                content=text,
                task_type=task_type,
            )
            return response["embedding"]
        except Exception as e:
            last_error = e
            if attempt == max_retries:
                # Out of attempts: stop instead of looping forever on a
                # permanent failure (bad key, invalid content, ...).
                break
            print(f"❌ Error: {e}. Retrying...")
            # Exponential backoff: retry_delay, 2*retry_delay, 4*retry_delay, ...
            time.sleep(retry_delay * 2 ** (attempt - 1))

    raise RuntimeError(
        f"Embedding request failed after {max_retries} attempts: {last_error}"
    ) from last_error

In [27]:
# 💾 Cache embeddings to avoid reprocessing
# The first run embeds every row of df["Text"] and pickles the whole frame;
# later runs replace `df` with the pickled frame instead of calling the API.
# NOTE(review): the cache is keyed only by file name, so it is not refreshed
# when the dataset changes -- delete the file to force regeneration.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load a cache
# file that this notebook wrote itself.
embedding_file = "gemini_embeddings.pkl"

if not os.path.exists(embedding_file):
    print("🚀 Generating embeddings with Gemini...")
    df["embedding"] = df["Text"].apply(get_embedding)
    df.to_pickle(embedding_file)
    print(f"💾 Saved to: {embedding_file}")
else:
    print("✅ Loading cached embeddings...")
    df = pd.read_pickle(embedding_file)

🚀 Generating embeddings with Gemini...
💾 Saved to: gemini_embeddings.pkl


In [28]:
# 📐 Convert to numpy matrix
# Stack the per-row embeddings so that row i of the matrix corresponds to
# row i of `df` (positional order), as the search function relies on.
row_embeddings = df["embedding"].to_numpy()
embedding_matrix = np.vstack(row_embeddings)

In [29]:
# 🔍 Search function
def find_similar_entries(query, top_n=5):
    """Return the dataset rows whose embeddings are most similar to `query`.

    The query is embedded with `get_embedding`, compared against every row of
    the module-level `embedding_matrix` using cosine similarity, and the
    best-scoring rows of the module-level `df` are returned.

    Args:
        query: Non-empty search text.
        top_n: Maximum number of rows to return; must be >= 0.

    Returns:
        A DataFrame with the "Text" and "Label" columns of the best matches
        plus a "Similarity" column, ordered from most to least similar.

    Raises:
        ValueError: If `query` is not a non-empty string or `top_n` is negative.
    """
    # Reject blank queries up front rather than sending them to the embedding
    # service.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")
    # A negative top_n would silently slice "all but the last k" rows.
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # NOTE(review): get_embedding requests task_type="retrieval_document" by
    # default; Gemini also offers "retrieval_query" for search queries --
    # confirm which one is intended here.
    query_embedding = np.asarray(get_embedding(query)).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, embedding_matrix)[0]

    # argsort is ascending, so reverse it to rank the best matches first.
    top_indices = similarities.argsort()[::-1][:top_n]
    matches = df.iloc[top_indices][["Text", "Label"]]
    return matches.assign(Similarity=similarities[top_indices])

In [30]:
# 🧪 Query-time logic
# Prompt for a query, run the similarity search, and print the ranked matches
# as a plain-text table without the index column.
if __name__ == "__main__":
    query = input("🔍 Enter your query: ")
    results = find_similar_entries(query)
    print("\n🔗 Top similar results:", results.to_string(index=False), sep="\n")


🔗 Top similar results:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                