In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
from snowflake.snowpark.context import get_active_session

# Get the current active Snowflake session.
# The cells below use `session` to upload the sample table and to run the
# Cortex embedding query.
session = get_active_session()

In [None]:
# Step 1: Sample corpus.
# 38 sentences about machine learning / AI, with two deliberately off-topic
# sentences (tagged "#outlier") mixed in at list positions 9 and 23.
texts = [
    "Machine learning algorithms learn patterns from training data to make predictions",
    "Supervised learning uses labeled datasets to train predictive models accurately",
    "Neural networks are machine learning models inspired by the human brain structure",
    "Deep learning is a subset of machine learning using multiple neural network layers",
    "Train machine learning models using gradient descent optimization techniques",
    "Machine learning classification predicts categorical output variables from features",
    "Regression models in machine learning predict continuous numerical values",
    "Machine learning requires feature engineering and careful data preprocessing",
    "Overfitting in machine learning occurs when models memorize training data",
    "The best chocolate chip cookie recipe uses brown sugar and vanilla extract for flavor",  #outlier
    "Cross-validation technique evaluates machine learning model performance reliably",
    "Natural language processing enables computers to understand human language",
    "Convolutional neural networks excel at image recognition and computer vision tasks",
    "Recurrent neural networks process sequential data like text and time series",
    "Transfer learning reuses pre-trained models for new machine learning tasks",
    "Ensemble methods combine multiple models to improve prediction accuracy",
    "Decision trees split data based on features to make predictions",
    "Random forests use multiple decision trees for robust predictions",
    "Support vector machines find optimal hyperplanes for classification problems",
    "K-means clustering groups similar data points into clusters automatically",
    "Principal component analysis reduces data dimensionality while preserving variance",
    "Reinforcement learning trains agents through rewards and penalties",
    "Backpropagation algorithm trains neural networks by updating weights",
    "The Great Barrier Reef coral ecosystem faces threats from ocean warming and acidification",  #outlier
    "Activation functions introduce non-linearity in neural network layers",
    "Batch normalization improves neural network training stability and speed",
    "Dropout technique prevents overfitting by randomly deactivating neurons",
    "Learning rate controls how quickly neural networks update weights",
    "Loss functions measure the difference between predictions and actual values",
    "Hyperparameter tuning optimizes model configuration for better performance",
    "Data augmentation artificially increases training dataset size and variety",
    "Word embeddings represent words as dense vectors in semantic space",
    "Attention mechanisms help neural networks focus on relevant input parts",
    "Transformer architecture revolutionized natural language processing tasks",
    "BERT model uses bidirectional context for language understanding",
    "GPT models generate coherent text using autoregressive language modeling",
    "Fine-tuning adapts pre-trained models to specific downstream tasks",
    "Object detection identifies and localizes objects within images",
    "Semantic segmentation classifies each pixel in an image",
    "Generative adversarial networks create realistic synthetic data samples",
]

# Step 2: Build the pandas DataFrame.
# The default 0..n-1 row index is named 'id' and promoted to a regular column,
# so each text's id is its position in `texts`.
df = (
    pd.DataFrame({'text': texts})
    .rename_axis('id')
    .reset_index()
)

print("Original DataFrame:")
print(df.head())
print(f"\nTotal texts: {len(df)}")

In [None]:
# Push the pandas frame to Snowflake as a temporary table (written in
# "overwrite" mode) so the SQL cells below can read it by name.
(
    session.create_dataframe(df)
    .write.mode("overwrite")
    .save_as_table(
        "TEMP_TEXT_DATA_FOR_ANOMALY_DETECTION",
        table_type='temporary',
    )
)

In [None]:
-- Preview the rows that were uploaded to the temporary table
SELECT
    "id",
    "text"
FROM TEMP_TEXT_DATA_FOR_ANOMALY_DETECTION

In [None]:
# Step 3: Generate embeddings using Snowflake Cortex.
# The query calls SNOWFLAKE.CORTEX.EMBED_TEXT_768 with the 'e5-base-v2' model on
# each row's "text". ORDER BY "id" pins the result order (SQL leaves it
# unspecified otherwise), so the frame comes back in the same row order on
# every run.
embedding_query = """
SELECT
    "id",
    "text",
    SNOWFLAKE.CORTEX.EMBED_TEXT_768('e5-base-v2', "text") AS embedding
FROM TEMP_TEXT_DATA_FOR_ANOMALY_DETECTION
ORDER BY "id"
"""

# Execute the query and pull the result into a pandas DataFrame
embed_df = session.sql(embedding_query).to_pandas()

In [None]:
# Preview only the first rows: every EMBEDDING cell holds a whole vector, so
# rendering the full frame produces a very large output.
embed_df.head()

In [None]:
# Step 4: Stack the per-row embedding vectors into one 2-D array
embeddings = np.array([np.array(emb) for emb in embed_df['EMBEDDING']])

# Fail early with a readable message if the column did not yield one
# equal-length numeric vector per row (otherwise shape[1] below would raise an
# opaque IndexError).
if embeddings.ndim != 2:
    raise ValueError(
        "Expected a 2-D (texts x features) embedding matrix, "
        f"got shape {embeddings.shape}"
    )

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Each text is represented by {embeddings.shape[1]} features")

# Step 5: Normalization (Standardization)
scaler = StandardScaler()
embeddings_normalized = scaler.fit_transform(embeddings)
print(f"\nNormalized embeddings shape: {embeddings_normalized.shape}")

# Step 6: Apply PCA
n_components = 2
pca = PCA(n_components=n_components, random_state=42)
embeddings_pca = pca.fit_transform(embeddings_normalized)

print(f"\nPCA reduced embeddings shape: {embeddings_pca.shape}")
# Report at most the first five ratios and label the count actually shown,
# so the message stays accurate whatever n_components is set to.
shown_ratios = pca.explained_variance_ratio_[:5]
print(f"Explained variance ratio (first {len(shown_ratios)} components): {shown_ratios}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.4f}")

# Step 7: Inverse PCA (Reconstruction)
# Map the low-dimensional projection back into the standardized feature space.
embeddings_reconstructed = pca.inverse_transform(embeddings_pca)
print(f"\nReconstructed embeddings shape: {embeddings_reconstructed.shape}")

# Step 8: Calculate Reconstruction Error
# Mean squared difference per row (axis=1): rows that the retained components
# capture poorly end up with a larger error.
reconstruction_error = np.mean((embeddings_normalized - embeddings_reconstructed) ** 2, axis=1)
print(f"\nReconstruction error shape: {reconstruction_error.shape}")

In [None]:
# Percentile of the reconstruction-error distribution used as the anomaly
# cut-off. With the 40 sample texts, the 96th percentile falls between the
# 2nd- and 3rd-largest errors, so (barring ties) exactly two rows are flagged.
ANOMALY_PERCENTILE = 96

embed_df['reconstruction_error'] = reconstruction_error

threshold = np.percentile(reconstruction_error, ANOMALY_PERCENTILE)
# A row is anomalous only when its error is strictly above the cut-off
embed_df['is_anomaly'] = reconstruction_error > threshold

In [None]:
# Show the scored rows without the bulky EMBEDDING vectors, highest
# reconstruction error first so the flagged texts appear at the top.
embed_df.drop(columns=['EMBEDDING']).sort_values('reconstruction_error', ascending=False)