In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm.auto')  # hide tqdm IProgress warning
warnings.filterwarnings("ignore", category=FutureWarning)  # hide seaborn / pandas future warnings

import pandas as pd
import numpy as np
import pickle
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
import umap.umap_ as umap
import sys, os


# Append the parent directory to sys.path so `from src import config` can be attempted.
sys.path.append(os.path.abspath('..'))
try:
    from src import config
except ModuleNotFoundError:
    # Fallback used only when the `src` package cannot be imported.
    # NOTE(review): this relative path is resolved against the kernel's working
    # directory — confirm it matches where the data pipeline writes its output.
    class config:
        PROCESSED_DATA_DIR = "data/processed"

# Build paths with os.path.join so separators are consistent on every platform.
processed_path = os.path.join(config.PROCESSED_DATA_DIR, "train_processed.json")
embeddings_path = os.path.join(config.PROCESSED_DATA_DIR, "problem_embeddings.pkl")

try:
    df = pd.read_json(processed_path, orient='records')
    print(f"Successfully loaded metadata from {processed_path}")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load embedding files produced by your own pipeline.
    with open(embeddings_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Successfully loaded embeddings from {embeddings_path}")

except FileNotFoundError as e:
    print(f"Error: {e}\nMake sure you have run the main data pipeline first.")
    # Re-raise: continuing would either hit a NameError just below or silently
    # reuse stale `df` / `embeddings` left in the kernel by an earlier run.
    raise


print("\n--- Dataset Info ---")
print(f"DataFrame records: {len(df)}")
if isinstance(embeddings, np.ndarray):
    print(f"Embeddings shape: {embeddings.shape}")
else:
    print(f"Embeddings type: {type(embeddings)} (expected numpy.ndarray)")


Successfully loaded metadata from e:\agentic-reasoning-engine\dataset\processed/train_processed.json
Successfully loaded embeddings from e:\agentic-reasoning-engine\dataset\processed/problem_embeddings.pkl

--- Dataset Info ---
DataFrame records: 384
Embeddings shape: (384, 384)


In [5]:
# `warnings` is already imported in the first cell; only the filters are set here.
warnings.filterwarnings("ignore", category=UserWarning, module='umap')
warnings.filterwarnings("ignore", category=FutureWarning)

print("Running UMAP to reduce dimensionality...")

# Ensure embeddings are valid before reduction
# (isinstance already rejects None, so no separate None check is needed).
if not isinstance(embeddings, np.ndarray):
    raise ValueError("❌ Embeddings not found or invalid. Please verify that the embedding file was loaded correctly.")

# The coordinates are assigned to `df` by position below, so the embeddings must
# be a 2-D array with exactly one row per record. Check this before the
# (comparatively slow) UMAP fit rather than failing afterwards on assignment.
if embeddings.ndim != 2 or embeddings.shape[0] != len(df):
    raise ValueError(
        f"Embeddings shape {embeddings.shape} does not match the "
        f"{len(df)} DataFrame records (expected one row per record)."
    )

# Configure and run UMAP; random_state is fixed so the projection is repeatable.
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42
)

embeddings_2d = reducer.fit_transform(embeddings)
print(f"UMAP reduction complete — new shape: {embeddings_2d.shape}")

# Add UMAP coordinates to dataframe (re-running the cell overwrites the same columns).
df['umap_x'] = embeddings_2d[:, 0]
df['umap_y'] = embeddings_2d[:, 1]

print("2D coordinates added to dataframe as 'umap_x' and 'umap_y'.")


Running UMAP to reduce dimensionality...
UMAP reduction complete — new shape: (384, 2)
2D coordinates added to dataframe as 'umap_x' and 'umap_y'.


In [6]:
# Interactive scatter of the 2-D UMAP coordinates stored in `df`, coloured by topic.
# Hovering shows the topic and the problem statement; the raw coordinates are hidden.
fig = (
    px.scatter(
        df,
        title='Interactive UMAP Projection of Problem Statement Embeddings',
        x='umap_x',
        y='umap_y',
        color='topic',
        hover_name='topic',
        hover_data={'problem_statement': True, 'umap_x': False, 'umap_y': False},
    )
    # NOTE(review): no `text` argument is passed to px.scatter, so this
    # textposition setting presumably has no visible effect — confirm.
    .update_traces(textposition='top center')
    .update_layout(height=800)
)
fig.show()

In [7]:
# Number of nearest neighbours to report for the query problem.
TOP_K = 5

# Pick a random query problem (unseeded, so each run explores a different query).
query_index = np.random.randint(0, len(df))
query_statement = df.iloc[query_index]['problem_statement']
query_embedding = embeddings[query_index].reshape(1, -1)

print(f"--- QUERY PROBLEM (Index: {query_index}) ---")
print(query_statement)

# Compute cosine similarities between the query and every embedding (query included).
similarities = cosine_similarity(query_embedding, embeddings)[0]

# Rank all problems from most to least similar, then drop the query *by index*.
# Slicing off the single top-ranked entry is not safe: with duplicate or tied
# embeddings the query is not guaranteed to sort first, so it could be listed
# as its own neighbour while a genuine match is discarded.
ranked_indices = np.argsort(similarities)[::-1]
top_indices = ranked_indices[ranked_indices != query_index][:TOP_K]

print(f"\n--- TOP {TOP_K} MOST SIMILAR PROBLEMS FOUND ---")
for i, index in enumerate(top_indices):
    print(f"\n{i+1}. (Index: {index}, Similarity: {similarities[index]:.4f})")
    print(df.iloc[index]['problem_statement'])

--- QUERY PROBLEM (Index: 249) ---
You are tasked with painting numbers 1 through 100 on the doors of 100 rooms. The numbers have to be painted such that there is no pattern in how the evens and odds are distributed, to avoid making one row of doors appear predominantly odd or even to those walking by. To optimize the painting process and minimize the time spent on decision making, you decide to establish a simple algorithm by which you will paint odd and even numbers on the doors. What is the algorithm that ensures an approximately even distribution of odds and evens without creating a discernible pattern?

--- TOP 5 MOST SIMILAR PROBLEMS FOUND ---

1. (Index: 62, Similarity: 0.5057)
You are in a room with six doors, each leading to a different room. Each subsequent room has the same setup with six doors leading to other rooms, but without any doors leading back. After passing through four rooms, you enter a room that only has one door leading to the outside. Assuming you always choos