In [0]:
# Cell 1: Install required libraries for embedding generation and vector storage
# sentence-transformers: Open-source embedding model (production choice)
# pinecone: Vector database client for semantic search
# The -q flag keeps pip's output quiet.
# NOTE(review): neither package is version-pinned, so a later re-run may resolve
# to different releases — consider pinning once known-good versions are confirmed.

%pip install sentence-transformers -q
%pip install pinecone -q

In [0]:
# Restart the Python process through the Databricks utility after the %pip
# installs in the previous cell.
# NOTE(review): presumably this is what makes the freshly installed packages
# importable and it discards Python state defined earlier — confirm against
# the Databricks documentation.
dbutils.library.restartPython()

In [0]:
# Configure Azure Storage access
#
# The account key is a secret and must not be pasted into the notebook source.
# It is read from the AZURE_STORAGE_KEY environment variable (on Databricks this
# can be populated from a secret scope in the cluster configuration). The key
# itself comes from Azure Portal -> Storage Accounts -> Access keys.
import os

storage_account = "sradatalake"
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Fail here, with a clear message, instead of letting an empty key surface
# later as an opaque authentication error on the first read.
if not storage_key:
    raise ValueError(
        "AZURE_STORAGE_KEY is not set; cannot authenticate against "
        f"storage account '{storage_account}'."
    )

spark.conf.set(
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    storage_key
)

print(" Azure Storage authentication configured")

In [0]:
# Read the pre-processed Wikipedia articles (parquet) from the data lake.
# storage_account is re-declared with the same value used by the storage
# configuration cell.
storage_account = "sradatalake"

print("üì• Loading Wikipedia articles...")
df = (
    spark.read
    .format("parquet")
    .load(f"abfss://processed-data@{storage_account}.dfs.core.windows.net/wikipedia_1000/")
)

# Report the row count, then preview 5 rows with cell contents cut at 50 characters.
print(f"‚úÖ Loaded {df.count()} articles")
df.show(n=5, truncate=50)

In [0]:
# Cell 5: Initialize sentence transformer model for embedding generation
# Model: all-MiniLM-L6-v2 (384 dimensions, optimized for semantic similarity)

from sentence_transformers import SentenceTransformer
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, FloatType
import pandas as pd

# Load pre-trained sentence transformer from HuggingFace
print("üì• Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print(" Model loaded successfully!")

@pandas_udf(ArrayType(FloatType()))
def generate_embeddings(texts: pd.Series) -> pd.Series:
    """Embed a batch of texts with the notebook-level sentence-transformer.

    Args:
        texts: One batch of the input column, as handed over by Spark.
            Non-string values are converted with ``str``; missing values
            (None / NaN) are embedded as the empty string rather than as
            the literal text "None" / "nan".

    Returns:
        A Series of the same length whose elements are the embedding
        vectors as plain Python lists of floats.
    """
    # Only the first max_chars characters of each text are embedded.
    max_chars = 5000

    def _as_text(value):
        # value != value is the NaN test; it avoids treating NaN as real text.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)[:max_chars]

    texts_list = [_as_text(t) for t in texts.tolist()]

    # Nothing to encode for an empty batch; return an empty result directly.
    if not texts_list:
        return pd.Series([], dtype=object)

    # `model` is the SentenceTransformer created above in this cell.
    embeddings = model.encode(texts_list, show_progress_bar=False)

    # Convert each vector to a list so it fits the ArrayType(FloatType()) schema.
    return pd.Series([emb.tolist() for emb in embeddings])

print(" Embedding function ready")

In [0]:
# Cell 6: Generate embeddings for all articles using distributed processing
print("üîÑ Generating embeddings for all articles...")
print("‚è≥ This will take 10-20 minutes for 1000 articles...")

# Add the embedding column and mark the frame for caching in one expression;
# cache() returns the same DataFrame, so df_embedded is unchanged by the chaining.
df_embedded = df.withColumn("embedding", generate_embeddings(col("text_clean"))).cache()

print("\n‚úÖ Embeddings generated!")
print("\nüìä Sample results:")
df_embedded.select("title", "text_length", "embedding").show(n=5, truncate=50)

# Inspect one embedding to report its dimension, then count all rows.
sample_embedding = df_embedded.select("embedding").first()["embedding"]
print(f"\nüìè Embedding dimension: {len(sample_embedding)}")
print(f"üìä Total articles with embeddings: {df_embedded.count()}")

In [0]:
# Persist the embedded articles as parquet, replacing any previous output.
output_path = f"abfss://embeddings@{storage_account}.dfs.core.windows.net/wikipedia_1000_embeddings/"

print(f"üíæ Saving embeddings to Azure Storage...")
print(f"üìç Location: {output_path}")

df_embedded.write.mode("overwrite").parquet(output_path)

print(f"\n‚úÖ Successfully saved {df_embedded.count()} embeddings!")

# Read the files back and count them as a sanity check on the write.
print("\nüîç Verifying saved data...")
df_verify = spark.read.format("parquet").load(output_path)
print(f"‚úÖ Verification complete: {df_verify.count()} records")

In [0]:
from pinecone import Pinecone, ServerlessSpec
import time

# Initialize Pinecone
#
# The API key is read from the PINECONE_API_KEY environment variable so that
# it is never stored in the notebook source or its exports.
import os

pinecone_api_key = os.environ.get("PINECONE_API_KEY", "")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set; cannot connect to Pinecone.")

pc = Pinecone(api_key=pinecone_api_key)

# Index configuration
index_name = "wikipedia-search"
embedding_dim = 384          # dimension for Sentence Transformers all-MiniLM-L6-v2
index_wait_timeout_s = 300   # upper bound (seconds) for each polling loop below

# Check if index exists
existing_indexes = pc.list_indexes().names()

# An existing index with this name is always dropped and recreated, whatever
# its current dimension is.
if index_name in existing_indexes:
    print(f" Deleting existing index with wrong dimension...")
    pc.delete_index(index_name)

    # Poll until the index is no longer listed instead of sleeping a fixed
    # time: creating an index while the old one still exists would fail.
    deadline = time.monotonic() + index_wait_timeout_s
    while index_name in pc.list_indexes().names():
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Index '{index_name}' was not deleted within {index_wait_timeout_s}s"
            )
        time.sleep(1)
    print(" Old index deleted")

# Create new index with correct dimension
print(f"üìù Creating Pinecone index: {index_name}")
pc.create_index(
    name=index_name,
    dimension=embedding_dim,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
print(f" Index created with dimension {embedding_dim}!")

# Wait for index to be ready, but give up after the timeout rather than
# spinning forever if the index never reports ready.
print("‚è≥ Waiting for index to initialize...")
deadline = time.monotonic() + index_wait_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready within {index_wait_timeout_s}s"
        )
    time.sleep(1)
print(" Index is ready!")

# Connect to index
index = pc.Index(index_name)

# Show index stats
stats = index.describe_index_stats()
print(f"\nüìä Pinecone Index Stats:")
print(f"Dimension: {embedding_dim}")
print(f"Total vectors: {stats.get('total_vector_count', 0)}")

In [0]:
# Cell 9: Upload Embeddings to Pinecone
import re

# Re-read the persisted embeddings from Azure Storage.
print(" Loading embeddings from Azure Storage...")
df_embedded = (
    spark.read
    .format("parquet")
    .load(f"abfss://embeddings@{storage_account}.dfs.core.windows.net/wikipedia_1000_embeddings/")
)
print(f" Loaded {df_embedded.count()} embeddings")

# Bring only the columns needed for the upload into a Pandas DataFrame.
print("\nüì¶ Converting to Pandas for upload...")
df_pandas = (
    df_embedded
    .select(["title", "text_clean", "text_length", "embedding"])
    .toPandas()
)
print(f" Converted {len(df_pandas)} records")

# Helper function to clean text
def clean_text(text):
    """Return an ASCII-only rendering of ``text``.

    Typographic dashes and quotes are mapped to their plain ASCII
    equivalents. The text is then NFKD-normalised so that accented letters
    keep their base letter (e.g. "é" -> "e") instead of being deleted, and
    any character that still has no ASCII form is dropped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned ASCII string; "" for None, NaN, or any other falsy input.
    """
    import unicodedata

    # NaN is the only float for which value != value; treat it as missing
    # rather than letting it become the literal text "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not text:
        return ""

    punctuation_map = {
        0x2013: "-",   # en dash
        0x2014: "-",   # em dash
        0x2018: "'",   # left single quote
        0x2019: "'",   # right single quote
        0x201C: '"',   # left double quote
        0x201D: '"',   # right double quote
    }
    text = str(text).translate(punctuation_map)

    # Decompose first, so the ASCII encode drops only the combining marks and
    # not the letters they were attached to.
    text = unicodedata.normalize("NFKD", text)
    return text.encode("ascii", "ignore").decode("ascii")

# Upload to Pinecone in batches
#
# Rows are turned into Pinecone records (id, values, metadata) and sent in
# groups of `batch_size`. Uploaded, skipped and failed rows are counted so the
# final message reflects what actually happened instead of always claiming
# success.
print(f"\nüîÑ Uploading {len(df_pandas)} vectors to Pinecone...")

batch_size = 100
expected_dim = 384  # dimension the rows are validated against before upload
vectors = []
uploaded_count = 0
skipped_count = 0
failed_count = 0

for idx, row in df_pandas.iterrows():
    vector_id = f"doc_{idx}"
    embedding = row['embedding']

    # A missing embedding is skipped explicitly; calling len() on it would
    # otherwise abort the whole upload.
    if embedding is None:
        print(f"‚ö†Ô∏è Skipping vector {idx}: wrong dimension missing")
        skipped_count += 1
        continue

    # Verify embedding dimension
    if len(embedding) != expected_dim:
        print(f"‚ö†Ô∏è Skipping vector {idx}: wrong dimension {len(embedding)}")
        skipped_count += 1
        continue

    # Clean text
    title_clean = clean_text(row['title'])
    text_clean = clean_text(row['text_clean'])

    # Metadata is truncated to keep each record small.
    metadata = {
        "title": title_clean[:200],
        "text": text_clean[:1000],
        "text_length": int(row['text_length'])
    }

    vectors.append({
        "id": vector_id,
        # Plain Python floats, independent of the container type that
        # toPandas() produced for the array column.
        "values": [float(v) for v in embedding],
        "metadata": metadata
    })

    # Upload in batches
    if len(vectors) >= batch_size:
        try:
            index.upsert(vectors=vectors)
            uploaded_count += len(vectors)
            print(f"‚úÖ Uploaded {uploaded_count}/{len(df_pandas)} vectors...")
        except Exception as e:
            # Best effort: keep going, but remember how many rows were lost.
            failed_count += len(vectors)
            print(f"‚ö†Ô∏è Error at index {idx}: {e}")
        vectors = []

# Upload remaining vectors
if vectors:
    try:
        index.upsert(vectors=vectors)
        uploaded_count += len(vectors)
        print(f"‚úÖ Uploaded final batch")
    except Exception as e:
        failed_count += len(vectors)
        print(f"‚ö†Ô∏è Error uploading final batch: {e}")
    vectors = []

if failed_count:
    print(f"\n‚ö†Ô∏è Upload finished with errors: {failed_count} vectors failed to upload")
else:
    print(f"\n‚úÖ Upload complete!")
print(f"Uploaded: {uploaded_count}, skipped: {skipped_count}, failed: {failed_count}")

# Verify
time.sleep(2)
stats = index.describe_index_stats()
print(f"\nüìä Final Pinecone Stats:")
print(f"Total vectors in index: {stats['total_vector_count']}")

In [0]:
# Cell 10: Validation query - Test semantic search functionality
query_text = "artificial intelligence and machine learning"

print(f"üîç Searching for: '{query_text}'")

# Embed the query with the same model object used in the rest of the notebook.
query_embedding = model.encode([query_text])[0].tolist()

# Ask Pinecone for the five nearest vectors, including their stored metadata.
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

print(f"\nüìä Top 5 Most Similar Articles:")
print("=" * 80)

for i, match in enumerate(results['matches'], start=1):
    hit_meta = match['metadata']
    print(f"\n{i}. {hit_meta['title']}")
    print(f"   Similarity Score: {match['score']:.4f}")
    print(f"   Text Length: {hit_meta['text_length']} characters")
    print(f"   Preview: {hit_meta['text'][:300]}...")
    print("-" * 80)

In [0]:
def search_wikipedia(query, top_k=5):
    """Run a semantic search for ``query`` and print the matches.

    The query is embedded with the notebook-level ``model`` and sent to the
    notebook-level Pinecone ``index``.

    Args:
        query: Free-text search string.
        top_k: Number of matches requested from the index.

    Returns:
        The raw response object returned by ``index.query``.
    """
    print(f"üîç Searching for: '{query}'")

    # Embed the single query and take its vector as a plain list
    encoded = model.encode([query])
    query_embedding = encoded[0].tolist()

    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    # Header, then one numbered entry per match
    print(f"\nüìä Top {top_k} Results:")
    print("=" * 80)

    rank = 0
    for hit in results['matches']:
        rank += 1
        hit_meta = hit['metadata']
        print(f"\n{rank}. {hit_meta['title']}")
        print(f"   Score: {hit['score']:.4f}")
        print(f"   {hit_meta['text'][:200]}...")
        print("-" * 80)

    return results

# Try different searches
# Example call: top_k is left at its default of 5; the matches are printed by
# the function.

search_wikipedia("climate change")