# Week 2: Semantic Enrichment & Vector Search
## Adding Intelligence to Your Code Graph

In this notebook, we'll:
1. Load the Week 1 structural graph
2. Add LLM-generated summaries
3. Create vector embeddings
4. Build FAISS index
5. Test semantic search

In [8]:
# Setup
import sys
from pathlib import Path
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd

sys.path.insert(0, '../src')


from utils.llm_client import LangChainClient
from indexing.semantic_enrichment import SemanticEnricher
from indexing.vector_store import VectorStore

%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Load Week 1 Graph

In [9]:
# Load the structural graph from Week 1
graph_path = '../data/graphs/code_graph.pkl'

with open(graph_path, 'rb') as f:
    graph = pickle.load(f)

print(f"Loaded graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")

Loaded graph: 28 nodes, 209 edges


In [10]:
# Inspect a sample node BEFORE enrichment
sample_node = list(graph.nodes(data=True))[5]
node_id, attrs = sample_node

print(f"Node: {attrs['name']} ({attrs['type']})")
print(f"\nCurrent attributes:")
for key, value in attrs.items():
    if key != 'code':
        print(f"  {key}: {value}")
    else:
        print(f"  code: {len(value)} chars")

Node: typing (import)

Current attributes:
  name: typing
  type: import
  code: 28 chars
  docstring: None
  file_path: d:\kaggle_project\GraphRAG\data\repositories\SMS-Spam-VotingClassifier-\file.py
  start_line: 6
  end_line: 6
  language: python


## 2. Semantic Enrichment with LLM

In [12]:
# Initialize LLM client
llm_client = LangChainClient(
    provider='google',  # or 'anthropic'
)

print(f"✓ LLM client initialized: {llm_client.provider}")

✓ LLM client initialized: google


In [13]:
# Test enrichment on a single node first
enricher = SemanticEnricher(llm_client=llm_client)

# Pick a function node
test_node = None
for node_id, attrs in graph.nodes(data=True):
    if attrs.get('type') == 'function' and len(attrs.get('code', '')) > 50:
        test_node = (node_id, attrs)
        break

if test_node:
    node_id, attrs = test_node
    print(f"Testing enrichment on: {attrs['name']}\n")
    
    enriched = enricher.enrich_single(node_id, attrs)
    
    print("✓ Enrichment result:")
    print(f"  Summary: {enriched.summary}")
    print(f"  Description: {enriched.description}")
    print(f"  Tags: {', '.join(enriched.tags)}")
    print(f"  Complexity: {enriched.complexity}")
    print(f"  Purpose: {enriched.purpose}")

[32m2025-12-19 16:37:42.698[0m | [1mINFO    [0m | [36mindexing.semantic_enrichment[0m:[36m__init__[0m:[36m58[0m - [1mSemantic enricher initialized (batch_size=1)[0m


Testing enrichment on: fetch_hashed_password





ChatGoogleGenerativeAIError: Error calling model 'gemini-2.5-pro' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.5-pro\nPlease retry in 7.238933759s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerDay-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-pro', 'location': 'global'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '7s'}]}}

In [14]:
# Enrich the entire graph (this will take a few minutes)
print("Enriching all nodes... (this may take 2-5 minutes)\n")

enriched_graph = enricher.enrich_graph(graph, skip_existing=True)

print("\n✓ Enrichment complete!")

[32m2025-12-19 16:38:56.795[0m | [1mINFO    [0m | [36mindexing.semantic_enrichment[0m:[36menrich_graph[0m:[36m90[0m - [1mEnriching 5 nodes...[0m


Enriching all nodes... (this may take 2-5 minutes)



[32m2025-12-19 16:39:07.436[0m | [31m[1mERROR   [0m | [36mindexing.semantic_enrichment[0m:[36menrich_graph[0m:[36m114[0m - [31m[1mFailed to enrich function:d:\kaggle_project\GraphRAG\data\repositories\SMS-Spam-VotingClassifier-\file.py:fetch_hashed_password:17: Error calling model 'gemini-2.5-pro' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content


✓ Enrichment complete!


In [15]:
# Show enrichment statistics
stats = enricher.get_enrichment_stats(enriched_graph)

print(f"\n=== Enrichment Statistics ===")
print(f"Total nodes: {stats['total_nodes']}")
print(f"Enriched: {stats['enriched_nodes']} ({stats['enrichment_rate']:.1%})")
print(f"Unique tags: {stats['total_unique_tags']}")

print(f"\nTop tags:")
for tag, count in stats['top_tags'][:10]:
    print(f"  {tag}: {count}")


=== Enrichment Statistics ===
Total nodes: 28
Enriched: 0 (0.0%)
Unique tags: 0

Top tags:


In [16]:
# Visualize tag distribution
tag_data = stats['top_tags'][:15]
tags, counts = zip(*tag_data)

plt.figure(figsize=(12, 6))
plt.barh(tags, counts, color='skyblue')
plt.xlabel('Count')
plt.title('Top 15 Code Tags')
plt.tight_layout()
plt.show()

ValueError: not enough values to unpack (expected 2, got 0)

In [17]:
# LLM usage statistics
llm_stats = llm_client.get_stats()

print("\n=== LLM Usage Statistics ===")
print(f"Total requests: {llm_stats['total_requests']}")
print(f"Total tokens: {llm_stats['total_tokens']:,}")
print(f"Cache hits: {llm_stats['cache_hits']} ({llm_stats['cache_hit_rate']:.1%})")
print(f"Avg tokens/request: {llm_stats['avg_tokens_per_request']:.0f}")

# Estimate cost (rough)
if 'gpt-4' in llm_client.model:
    cost_per_1k = 0.01  # Approximate
    estimated_cost = (llm_stats['total_tokens'] / 1000) * cost_per_1k
    print(f"\nEstimated cost: ${estimated_cost:.2f}")

TypeError: LangChainClient.get_stats() missing 1 required positional argument: 'response'

## 3. Build Vector Index

In [None]:
# Initialize vector store
vector_store = VectorStore(
    model_name='mini',  # Fast and good quality
    index_type='Flat'   # Exact search
)

print(f"Vector store initialized:")
print(f"  Embedding dim: {vector_store.embedding_dim}")
print(f"  Index type: {vector_store.index_type}")

In [None]:
# Build index from enriched graph
print("Building vector index...\n")

vector_store.build_from_graph(
    enriched_graph,
    text_field='combined',  # Combines code, summary, tags
    batch_size=32
)

print(f"\n✓ Vector index built: {len(vector_store.node_ids)} vectors")

## 4. Test Semantic Search

In [18]:
def display_search_results(query, results):
    """Pretty print search results"""
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}\n")
    
    for i, result in enumerate(results, 1):
        print(f"{i}. {result.name} ({result.type})")
        print(f"   Score: {result.score:.3f}")
        if result.summary:
            print(f"   Summary: {result.summary}")
        if result.tags:
            print(f"   Tags: {', '.join(result.tags[:5])}")
        print()

In [19]:
# Test various queries
test_queries = [
    "function that validates user input",
    "code for reading and parsing data",
    "database connection or storage",
    "configuration and settings"
]

for query in test_queries:
    results = vector_store.search(query, top_k=3)
    display_search_results(query, results)

NameError: name 'vector_store' is not defined

In [20]:
# Interactive search
def interactive_search():
    print("\n🔍 Enter queries (or 'quit' to exit):\n")
    
    while True:
        query = input("Query > ").strip()
        
        if query.lower() in ['quit', 'exit', 'q']:
            break
        
        if not query:
            continue
        
        results = vector_store.search(query, top_k=5)
        display_search_results(query, results)

# Uncomment to use:
# interactive_search()

## 5. Analyze Search Quality

In [21]:
# Compare score distributions
sample_queries = [
    "validation function",
    "data processing",
    "configuration management"
]

all_scores = []
query_labels = []

for query in sample_queries:
    results = vector_store.search(query, top_k=10)
    scores = [r.score for r in results]
    all_scores.extend(scores)
    query_labels.extend([query[:20]] * len(scores))

# Plot
df = pd.DataFrame({'Query': query_labels, 'Score': all_scores})

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Query', y='Score')
plt.title('Search Score Distribution by Query Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

NameError: name 'vector_store' is not defined

In [22]:
# Find similar nodes
def find_and_display_similar(node_name, top_k=5):
    """Find nodes similar to a given node"""
    
    # Find node ID
    target_node_id = None
    for node_id, attrs in enriched_graph.nodes(data=True):
        if attrs.get('name') == node_name:
            target_node_id = node_id
            print(f"Found: {attrs['name']} ({attrs['type']})")
            print(f"Summary: {attrs.get('summary', 'N/A')}\n")
            break
    
    if not target_node_id:
        print(f"Node '{node_name}' not found")
        return
    
    # Find similar
    similar = vector_store.find_similar_nodes(target_node_id, top_k=top_k)
    
    print(f"Similar nodes:\n")
    for i, result in enumerate(similar, 1):
        print(f"{i}. {result.name} ({result.type}) - Score: {result.score:.3f}")
        if result.summary:
            print(f"   {result.summary}")
        print()

# Try it
find_and_display_similar('save_to_file', top_k=3)

Node 'save_to_file' not found


## 6. Save Everything

In [None]:
# Save enriched graph
enriched_path = '../data/graphs/code_graph_enriched.pkl'
with open(enriched_path, 'wb') as f:
    pickle.dump(enriched_graph, f)
print(f"✓ Enriched graph saved: {enriched_path}")

# Save vector store
vector_path = '../data/graphs/vector_store'
vector_store.save(vector_path)
print(f"✓ Vector store saved: {vector_path}")

## Summary

Week 2 Complete! You now have:
- ✅ LLM-generated summaries for all code entities
- ✅ Domain tags and complexity ratings
- ✅ Vector embeddings (384 dimensions)
- ✅ FAISS index for fast similarity search
- ✅ Semantic search capability

**Week 3 Preview:**
- Community detection (Louvain algorithm)
- Hierarchical summarization
- Global query support
- Query classification (global vs local)