In [0]:
# Cell 1: Install the OpenAI SDK used for embedding generation.
# NOTE(review): the install is unpinned, so the SDK version that ends up on the
# cluster depends on what pip resolves (or what is already installed). Consider
# pinning a version so reruns are reproducible.
%pip install openai
# Restart the Python process so the newly installed package is importable.
dbutils.library.restartPython()

In [0]:
# Cell 2: Configure OpenAI embedding generation 
import os

import openai
import pandas as pd
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, FloatType

# Prefer the OPENAI_API_KEY environment variable (for example, set on the
# cluster from a secret) so the key does not have to be pasted into the
# notebook. The empty-string fallback keeps the original placeholder behaviour.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # Your OpenAI key
openai.api_key = OPENAI_API_KEY  # default for any driver-side openai calls

EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # texts sent per API request
# Character cap applied to each text before it is sent. The model's input limit
# is counted in tokens, not characters, so this is only a cheap, conservative
# proxy for that limit.
MAX_TEXT_CHARS = 8000


@pandas_udf(ArrayType(FloatType()))
def generate_embeddings(texts: pd.Series) -> pd.Series:
    """Generate OpenAI embeddings for a batch of Wikipedia article texts.

    Args:
        texts: Article texts for one Arrow batch. Every value is passed through
            ``str()`` and truncated to ``MAX_TEXT_CHARS`` characters, so a null
            value is embedded as the literal text ``"None"``.

    Returns:
        A Series with one entry per input row, in input order: the embedding
        (list of floats) on success, or ``None`` for every row of an API batch
        whose request failed.

    Notes:
        * The API key is taken from the ``OPENAI_API_KEY`` global, which is
          serialized together with this function. Setting ``openai.api_key`` on
          the driver alone is not enough, because the UDF body executes in
          Spark's Python worker processes, which import ``openai`` afresh.
        * Both SDK generations are supported: ``openai>=1.0`` removed
          ``openai.Embedding`` in favour of a client object, while older
          installs only offer the module-level interface.
        * Failures are best-effort: the error is printed (it ends up in the
          worker's output, not necessarily the notebook) and the affected rows
          get ``None`` so the output stays aligned with the input.
    """
    if hasattr(openai, "OpenAI"):
        # openai >= 1.0: explicit client object, attribute-style responses.
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        def embed(batch):
            response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
            return [item.embedding for item in response.data]
    else:
        # openai < 1.0: module-level API, dict-style responses.
        openai.api_key = OPENAI_API_KEY

        def embed(batch):
            response = openai.Embedding.create(input=batch, model=EMBEDDING_MODEL)
            return [item["embedding"] for item in response["data"]]

    embeddings = []
    for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
        # .iloc makes the slice explicitly positional, whatever the index is.
        chunk = texts.iloc[start:start + EMBEDDING_BATCH_SIZE]
        batch = [str(text)[:MAX_TEXT_CHARS] for text in chunk]

        try:
            embeddings.extend(embed(batch))
        except Exception as e:
            print(f"Error: {e}")
            # Keep row alignment: one null per text in the failed batch.
            embeddings.extend([None] * len(batch))

    return pd.Series(embeddings)

print(" Embedding function ready")

In [0]:
# Cell 3: Load processed Wikipedia articles and generate sample embeddings

# --- Configure Azure storage authentication ---
storage_account = "sradatalake"

# NOTE: do not commit a real key here; on Databricks prefer reading it from a
# secret scope (dbutils.secrets.get) instead of pasting it into the notebook.
storage_key = ""  # Azure storage key

spark.conf.set(
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    storage_key
)

# --- Load the processed articles ---
df = spark.read.parquet(f"abfss://processed-data@{storage_account}.dfs.core.windows.net/wikipedia_1000/")
print(f" Loaded {df.count()} articles")

# Take first 100 for testing
df_sample = df.limit(100)
print(f"🔄 Generating embeddings for {df_sample.count()} articles...")

# Generate embeddings. The result is cached because every Spark action on an
# uncached frame (show / count / first here, the write in the next cell) may
# re-run the UDF and therefore re-call the paid OpenAI API for the same rows.
df_embedded = df_sample.withColumn(
    "embedding",
    generate_embeddings(col("text_clean"))
).cache()

# Show results
print("\n📊 Sample results:")
df_embedded.select("title", "text_length", "embedding").show(5, truncate=50)

# Count only rows that actually received an embedding: the UDF returns null for
# every row of a failed API batch, and those must not be reported as generated.
df_with_embedding = df_embedded.filter(col("embedding").isNotNull())
print(f"\n✅ Generated {df_with_embedding.count()} embeddings")

# Check embedding dimensions (first() returns None when no row has an embedding)
sample_row = df_with_embedding.select("embedding").first()
sample_embedding = sample_row[0] if sample_row else None
if sample_embedding:
    print(f"📏 Embedding dimension: {len(sample_embedding)}")

In [0]:
# Cell 4: Persist sample embeddings to Azure Storage
# Saves development/test embeddings for evaluation
# (The assignment below had been fused into the comment line above, so this
# cell silently depended on the variable leaking in from an earlier cell.)
storage_account = "sradatalake"

output_path = f"abfss://embeddings@{storage_account}.dfs.core.windows.net/wikipedia_100/"

print(f"💾 Saving embeddings to: {output_path}")

# Row count of the frame being written; used below to verify the save.
expected_count = df_embedded.count()

(
    df_embedded.write.format("parquet")
    .mode("overwrite")
    .save(output_path)
)

print(f"✅ Saved {expected_count} embeddings!")
print(f"📍 Location: {output_path}")

# Verify the save by reading the data back and comparing row counts, so that
# "successful" is only printed when the stored data matches what was written.
print("\n🔍 Verifying saved data...")
df_verify = spark.read.parquet(output_path)
verified_count = df_verify.count()
assert verified_count == expected_count, (
    f"Verification failed: wrote {expected_count} rows but read back {verified_count}"
)
print(f"✅ Verification successful! Loaded {verified_count} records")
df_verify.select("title", "text_length", "embedding").show(3, truncate=50)

In [0]:
# Cell 5: Generate and persist embeddings for the full 1000-article dataset
storage_account = "sradatalake"

# Load all 1000 articles
print("📥 Loading all Wikipedia articles...")
df = spark.read.parquet(f"abfss://processed-data@{storage_account}.dfs.core.windows.net/wikipedia_1000/")
# Count once and reuse the number: every count() call is a separate Spark job.
article_count = df.count()
print(f"✅ Loaded {article_count} articles")

# Generate embeddings for ALL articles
print(f"\n🔄 Generating embeddings for ALL {article_count} articles...")
print("⏳ This will take 10-20 minutes...")

# Lazy: the UDF (and the OpenAI API) only runs when the write below executes.
df_embedded = df.withColumn(
    "embedding",
    generate_embeddings(col("text_clean"))
)

# Save
output_path = f"abfss://embeddings@{storage_account}.dfs.core.windows.net/wikipedia_1000_embeddings/"
print("\n💾 Saving embeddings...")

(
    df_embedded.write.format("parquet")
    .mode("overwrite")
    .save(output_path)
)

# Report what actually landed in storage rather than re-evaluating the lazy
# frame: read the output back and count only rows with a non-null embedding
# (the UDF writes null for every row of a failed API batch).
saved_embeddings = (
    spark.read.parquet(output_path)
    .filter(col("embedding").isNotNull())
    .count()
)
print(f"\n✅ Successfully saved {saved_embeddings} embeddings!")
if saved_embeddings < article_count:
    print(f"WARNING: {article_count - saved_embeddings} articles were saved without an embedding")
print(f"📍 Location: {output_path}")