# Embeddings and Similarity Search with Cortex AISQL

This notebook demonstrates:
- **AI_EMBED**: Generate vector embeddings for text and images
- **AI_SIMILARITY**: Calculate similarity between embeddings
- Semantic search and clustering use cases


In [None]:
import streamlit as st
import pandas as pd
import altair as alt
import numpy as np
from snowflake.snowpark.context import get_active_session

# Bind to the notebook's active Snowpark session, then pin the database,
# schema and warehouse that every later cell relies on.
session = get_active_session()
for context_stmt in (
    "USE DATABASE AISQL_DB",
    "USE SCHEMA AISQL_SCHEMA",
    "USE WAREHOUSE AISQL_WH",
):
    session.sql(context_stmt).collect()


## 1. AI_EMBED: Generate Embeddings

Create vector embeddings for up to 100 rows of the `emails` table and store them in `email_embeddings` for semantic search.


In [None]:
# Create embeddings for emails.
# Materialises up to 100 rows of `emails` together with an AI_EMBED vector of
# each row's `content`. There is no ORDER BY, so which rows are picked is up to
# the engine. CREATE OR REPLACE makes the cell safe to re-run.
sql = """
CREATE OR REPLACE TABLE email_embeddings AS
SELECT 
    ticket_id,
    user_id,
    content,
    AI_EMBED('snowflake-arctic-embed-m-v1.5', content) as embedding
FROM emails
LIMIT 100
"""
session.sql(sql).collect()

# View embeddings: a 10-row preview with the size of each stored vector.
# NOTE(review): ARRAY_SIZE is applied directly to the AI_EMBED output -- confirm
# it accepts that column's type.
sql_view = """
SELECT 
    ticket_id,
    SUBSTR(content, 1, 100) as content_preview,
    ARRAY_SIZE(embedding) as embedding_dimension
FROM email_embeddings
LIMIT 10
"""
df = session.sql(sql_view).to_pandas()
st.subheader("Email Embeddings Created")
st.dataframe(df)
if df.empty:
    # `.iloc[0]` raises IndexError on an empty frame (e.g. `emails` has no
    # rows), so report the situation instead of crashing the cell.
    st.warning("No embeddings were created - the emails table returned no rows.")
else:
    st.metric("Embedding Dimension", df['EMBEDDING_DIMENSION'].iloc[0])


## 2. AI_SIMILARITY: Find Similar Tickets

Score every unique pair of support tickets and list the 20 most similar pairs to surface related tickets.


In [None]:
# Find similar tickets.
# The self cross join scores every pair of rows in `email_embeddings`;
# `a.ticket_id < b.ticket_id` keeps each unordered pair once and drops
# self-pairs. Only the 20 highest-scoring pairs come back.
# NOTE(review): AI_SIMILARITY is called on the stored embedding column here --
# confirm it accepts precomputed embeddings rather than raw text/file inputs.
sql = """
WITH ticket_pairs AS (
    SELECT 
        a.ticket_id as ticket_a,
        b.ticket_id as ticket_b,
        SUBSTR(a.content, 1, 80) as content_a,
        SUBSTR(b.content, 1, 80) as content_b,
        AI_SIMILARITY(a.embedding, b.embedding) as similarity_score
    FROM email_embeddings a
    CROSS JOIN email_embeddings b
    WHERE a.ticket_id < b.ticket_id
)
SELECT * FROM ticket_pairs
ORDER BY similarity_score DESC
LIMIT 20
"""

df_similar = session.sql(sql).to_pandas()
st.subheader("Most Similar Ticket Pairs")
st.dataframe(df_similar)

# Visualize similarity distribution.
# The histogram covers only the 20 rows fetched above, not all pairs.
score_axis = alt.X('SIMILARITY_SCORE:Q', bin=alt.Bin(maxbins=20))
hist = (
    alt.Chart(df_similar)
    .mark_bar()
    .encode(score_axis, y='count()')
    .properties(height=300)
)
st.subheader("Similarity Score Distribution")
st.altair_chart(hist, use_container_width=True)


## 3. Semantic Search

Search for tickets similar to a query


In [None]:
# Semantic search: embed the user's query and rank the stored ticket
# embeddings by similarity to it.
search_query = st.text_input("Enter search query:", "I need a refund for my ticket")

# Skip empty or whitespace-only input: there is nothing meaningful to embed.
if search_query and search_query.strip():
    # The query text is passed as a bind variable (`?`) instead of being
    # spliced into the SQL string. With interpolation, an apostrophe in the
    # input (e.g. "can't log in") broke the statement, and arbitrary SQL could
    # be injected.
    # The model name matches the one used to build `email_embeddings`, so the
    # query vector and the stored vectors are comparable.
    sql = """
    WITH query AS (
        SELECT AI_EMBED('snowflake-arctic-embed-m-v1.5', ?) as query_embedding
    )
    SELECT 
        e.ticket_id,
        SUBSTR(e.content, 1, 150) as content_preview,
        AI_SIMILARITY(e.embedding, q.query_embedding) as relevance_score
    FROM email_embeddings e
    CROSS JOIN query q
    ORDER BY relevance_score DESC
    LIMIT 10
    """

    df_search = session.sql(sql, params=[search_query]).to_pandas()
    st.subheader(f"Search Results for: '{search_query}'")
    st.dataframe(df_search)
