In [None]:
from snowflake.snowpark.functions import *
import modin.pandas as pd
import snowflake.snowpark.modin.plugin
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Reuse the Snowpark session that is already active in this runtime instead of
# opening a new connection; every later cell talks to Snowflake through `session`.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
# Create the working database and make it the session's current database.
# `session.sql(...)` only builds a lazy DataFrame: the statement is not sent to
# Snowflake until an action such as `.collect()` runs, so each call ends with one.
# NOTE: CREATE OR REPLACE drops any existing S3_TO_SNOWFLAKE database on every run.
session.sql("create or replace database S3_TO_SNOWFLAKE").collect()
session.sql("use database S3_TO_SNOWFLAKE").collect();

In [None]:
# External stage over the S3 prefix that holds the images. DIRECTORY=(ENABLE=TRUE)
# turns on the stage's directory table, which later cells query via DIRECTORY(@Snow_stage).
# NOTE(review): the AWS key fields are blank in this notebook; supply real values
# from a secret store rather than committing them here.
stage_ddl = """
CREATE OR REPLACE STAGE Snow_stage
URL='s3://snwoflakeragtest/pets/'
credentials=(aws_key_id=''
aws_secret_key='')
DIRECTORY=(ENABLE=TRUE)
"""
session.sql(stage_ddl).collect()

In [None]:
# Load the stage's directory table (the listing of staged files) into a DataFrame.
files_df = pd.read_snowflake("SELECT * FROM DIRECTORY(@Snow_stage)")

In [None]:
# Preview the first rows of the file listing.
files_df.head()

In [None]:
# Embed every staged file with the 'voyage-multimodal-3' model via AI_EMBED and
# keep the embedding, converted with TO_ARRAY, next to the file's path and URL.
# The unquoted alias is referenced downstream as IMAGE_EMBEDDING.
embedding_query = """
SELECT RELATIVE_PATH, FILE_URL,
       TO_ARRAY(AI_EMBED('voyage-multimodal-3', TO_FILE(FILE_URL))) AS image_embedding
FROM DIRECTORY(@Snow_stage)
"""
embed_df = pd.read_snowflake(embedding_query)

In [None]:
# Preview the first rows of the embedding result.
embed_df.head()

In [None]:
def apply_pca_split_columns(df, embedding_col="IMAGE_EMBEDDING", n_components=3, prefix="pc"):
    """
    Project an embedding column onto its leading principal components.

    The input frame is left untouched; a copy is returned with one new numeric
    column per component, named ``f"{prefix}{k}"`` for ``k = 1..n_components``.

    Parameters:
        df (pd.DataFrame): DataFrame with one embedding per row.
        embedding_col (str): Column holding the embeddings. Each value is a
            list/array of numbers, or a JSON-encoded string of such a list.
        n_components (int): Number of PCA components to keep.
        prefix (str): Prefix for the new columns (default "pc").

    Returns:
        pd.DataFrame: Copy of ``df`` with the PCA component columns added.
            The fitted PCA model is not returned.

    Raises:
        ValueError: If ``df`` has no rows, or the embeddings cannot be stacked
            into a 2-D numeric matrix (e.g. rows of different lengths).
    """
    import json  # local import: only needed to decode JSON-encoded embeddings

    raw_vectors = df[embedding_col].to_list()
    if len(raw_vectors) == 0:
        raise ValueError(f"Column {embedding_col!r} has no rows to run PCA on.")

    # Array columns may surface as JSON text instead of Python lists; accept both.
    vectors = [json.loads(v) if isinstance(v, str) else v for v in raw_vectors]

    # Convert embeddings into an (n_rows, n_dims) matrix
    try:
        embeddings = np.asarray(vectors, dtype=float)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"Column {embedding_col!r} does not contain equal-length numeric embeddings."
        ) from exc
    if embeddings.ndim != 2:
        raise ValueError(
            f"Column {embedding_col!r} must hold one vector per row; "
            f"got an array of shape {embeddings.shape}."
        )

    # Fit PCA
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(embeddings)

    # Add the component columns to a copy so the caller's frame is not mutated
    # and re-running the calling cell always starts from the same input.
    result = df.copy()
    for i in range(reduced.shape[1]):
        result[f"{prefix}{i+1}"] = reduced[:, i]

    return result

In [None]:
# Reduce each image embedding to 3 principal components (columns pc1..pc3)
# and preview the result.
df_data = apply_pca_split_columns(embed_df, embedding_col="IMAGE_EMBEDDING",  n_components=3)
df_data.head()

In [None]:
def apply_kmeans(df, feature_cols=["pc1", "pc2", "pc3"], n_clusters = 2):
    """
    Cluster rows with KMeans on the given feature columns.

    The input frame is left untouched; a copy is returned with an integer
    ``cluster_id`` column holding each row's assigned cluster.

    Parameters:
        df (pd.DataFrame): DataFrame containing the feature columns.
        feature_cols (list): Columns to use for clustering. Any iterable of
            column names is accepted; the default list is never modified.
        n_clusters (int): Number of clusters.

    Returns:
        pd.DataFrame: Copy of ``df`` with a ``cluster_id`` column added.
            The fitted KMeans model is not returned.

    Raises:
        KeyError: If any of ``feature_cols`` is not a column of ``df``.
    """
    # Normalise to a list: a tuple passed to df[...] would be read as one column key.
    cols = list(feature_cols)
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise KeyError(f"Feature columns not found in DataFrame: {missing}")

    # Select features
    X = df[cols].values

    # Fit KMeans (fixed random_state keeps cluster assignments reproducible)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    # Add the cluster column to a copy so the caller's frame is not mutated
    result = df.copy()
    result["cluster_id"] = clusters

    return result


In [None]:
# Split the images into 2 clusters using the three PCA columns and preview the
# result; `df_data` is rebound to the frame that carries the `cluster_id` column.
df_data = apply_kmeans(df_data, feature_cols=["pc1", "pc2", "pc3"], n_clusters=2)
df_data.head()

In [None]:
# Write results back into Snowflake.
# auto_create_table=True lets the call create IMAGE_CLUSTER_TABLE if it is missing;
# overwrite=True replaces whatever the table held before.
# NOTE(review): the table name is unqualified, so it presumably lands in the session's
# current database/schema, while the query cell reads S3_TO_SNOWFLAKE.PUBLIC; confirm they match.
session.write_pandas(df_data, "IMAGE_CLUSTER_TABLE",auto_create_table=True, overwrite=True)

In [None]:
-- Show the clustered images, ordered by their assigned cluster.
-- "cluster_id" is double-quoted because the column name is lower-case.
SELECT *
FROM S3_TO_SNOWFLAKE.PUBLIC.IMAGE_CLUSTER_TABLE
ORDER BY "cluster_id";