In [18]:
# ✅ STEP 1: Install OpenAI (if needed)
!pip install --quiet openai

In [19]:
# ✅ STEP 2: Import libraries
import pandas as pd
import openai
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
import os

In [20]:
# ✅ STEP 3: Provide your OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so the notebook can run
# unattended (Restart & Run All); fall back to a hidden interactive prompt
# when the variable is not set. The key is never written into the notebook.
import getpass
openai.api_key = (
    os.environ.get("OPENAI_API_KEY")
    or getpass.getpass("🔑 Enter your OpenAI API key: ")
).strip()  # pasted keys often carry stray whitespace/newlines

In [None]:
# ✅ STEP 4: Load dataset
# Later steps read the "Text" column (the text that gets embedded) and the
# "Label" column (shown next to each match). Check both are present right
# after loading so a wrong file fails here, before any API calls are made.
DATASET_PATH = r"ATTACH_YOUR_DATASET_PATH"
REQUIRED_COLUMNS = ("Text", "Label")

df = pd.read_csv(DATASET_PATH)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"Dataset at {DATASET_PATH!r} is missing required column(s): {missing_columns}"
    )

In [22]:
# ✅ STEP 5: Embedding function with bounded retry logic
def get_embedding(text, model="text-embedding-3-small", max_retries=5, base_delay=2.0):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Rate-limit errors are retried with exponential backoff
    (``base_delay``, ``2 * base_delay``, ``4 * base_delay``, ...) for at most
    ``max_retries`` retries, so a persistent rate limit can no longer block
    the notebook forever. Any other failure is reported and yields ``None``.

    Args:
        text: The text to embed. ``None``, NaN and the empty string are
            treated as missing and skipped without calling the API.
        model: Name of the embedding model to use.
        max_retries: How many times to retry after a rate-limit error before
            giving up.
        base_delay: Seconds to sleep before the first retry; doubled after
            every further rate-limit error.

    Returns:
        The ``embedding`` field of the first item in the API response, or
        ``None`` if the input was missing, the request failed, or the rate
        limit persisted through all retries.
    """
    # Missing values (e.g. NaN from an empty CSV cell) cannot be embedded;
    # skip them instead of spending a request that is bound to fail.
    is_nan = isinstance(text, float) and text != text
    if text is None or is_nan or (isinstance(text, str) and text == ""):
        print("❌ Embedding error: input text is empty or missing")
        return None

    for attempt in range(max_retries + 1):
        try:
            response = openai.embeddings.create(input=[text], model=model)
            return response.data[0].embedding
        except openai.RateLimitError:
            if attempt == max_retries:
                break  # out of retries; report below
            delay = base_delay * (2 ** attempt)
            print(f"⏳ Rate limit hit. Sleeping {delay:g}s...")
            time.sleep(delay)
        except Exception as e:
            # Best-effort contract: report the problem and let the caller
            # decide what to do with a missing embedding.
            print(f"❌ Embedding error: {e}")
            return None

    print(f"❌ Embedding error: still rate limited after {max_retries} retries")
    return None

In [23]:
# ✅ STEP 6: Embed and cache
# Embedding every row costs API calls, so the result is cached on disk and
# reused on later runs.
embedding_file = "reduced_text_dataset_with_embeddings.pkl"

if os.path.exists(embedding_file):
    print("✅ Loading cached embeddings...")
    # NOTE: unpickling can execute arbitrary code. Only load a cache file
    # that this notebook produced; never one from an untrusted source.
    df = pd.read_pickle(embedding_file)
else:
    print("🚀 Generating embeddings...")
    df["embedding"] = df["Text"].apply(get_embedding)

    failed_count = int(df["embedding"].isna().sum())
    if failed_count:
        # get_embedding returns None on failure. Persisting those rows would
        # freeze the failures into the cache, so skip caching and let a
        # re-run try again.
        print(f"⚠️ {failed_count} of {len(df)} rows could not be embedded; cache not written")
    else:
        df.to_pickle(embedding_file)
        print("💾 Saved embeddings to .pkl file")

# Rows without an embedding cannot be stacked into the similarity matrix,
# so drop them (also covers a cache written with missing embeddings).
has_embedding = df["embedding"].notna()
if not has_embedding.all():
    print(f"⚠️ Dropping {int((~has_embedding).sum())} row(s) without an embedding")
    df = df[has_embedding].copy()

🚀 Generating embeddings...
💾 Saved embeddings to .pkl file


In [24]:
# ✅ STEP 7: Similarity function
# Stack the per-row embeddings into a single 2-D matrix (one row per dataset
# row, in the same positional order as `df`). A missing embedding would make
# the stack fail with an unrelated-looking shape error, or silently produce
# an object-dtype matrix, so check for that explicitly first.
missing_embeddings = int(df["embedding"].isna().sum())
if missing_embeddings:
    raise ValueError(
        f"{missing_embeddings} row(s) in df have no embedding; "
        "remove or regenerate them before building the similarity matrix"
    )
embedding_matrix = np.vstack(df["embedding"].to_numpy())

def find_similar_entries(query, top_n=5):
    """Return the dataset rows whose embeddings are most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared by cosine
    similarity against every row of the module-level ``embedding_matrix``.
    Row ``i`` of that matrix is looked up as positional row ``i`` of ``df``.

    Args:
        query: Text to search for.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``Text`` and ``Label`` columns of the best
        matches plus a ``Similarity`` column, ordered from most to least
        similar.

    Raises:
        ValueError: If no embedding could be obtained for ``query``.
    """
    query_embedding = get_embedding(query)
    if query_embedding is None:
        # get_embedding signals failure with None; stop here with a clear
        # message instead of failing inside the similarity computation.
        raise ValueError("Could not compute an embedding for the query")

    # cosine_similarity expects 2-D inputs: one row holding the query vector.
    query_vector = np.asarray(query_embedding, dtype=float).reshape(1, -1)

    similarities = cosine_similarity(query_vector, embedding_matrix)[0]
    # argsort is ascending, so reverse it to put the best matches first.
    top_indices = np.argsort(similarities)[::-1][:top_n]

    matches = df.iloc[top_indices][["Text", "Label"]]
    return matches.assign(Similarity=similarities[top_indices])

In [25]:
# ✅ STEP 8: Ask for a query and rank the dataset against it
# The best matches are stored in `results`.
query = input("🔍 Enter your query: ")
results = find_similar_entries(query=query)

In [26]:
# ✅ STEP 9: Show results
# Print a heading, then render the `results` table with the notebook's
# rich display.
print("\n🔗 Top similar results:")
display(results)


🔗 Top similar results:


Unnamed: 0,Text,Label,Similarity
13,Schools to take part in mock poll\n \n Record ...,0,0.701738
34,Labour's election love-in\n \n Peace and love ...,0,0.278183
22,Lib Dems unveil election slogan\n \n The Liber...,0,0.259875
12,Greer attacks 'bully' Big Brother\n \n Germain...,3,0.252135
25,Anti-spam screensaver scrapped\n \n A contenti...,2,0.24947
