# BigQuery Data Preparation and Feature Extraction for Liquid Neural Networks

This notebook demonstrates how to:

1. Extract and prepare data from BigQuery tables using BigFrames
2. Process features through Restricted Boltzmann Machines (RBMs)
3. Feed the RBM output into a CfC-based liquid neural network with LSTM neurons for gating
4. Implement a motor neuron that outputs a value to trigger deeper exploration

The pipeline is designed to handle terabyte-sized tables efficiently through chunked processing.

## Setup and Imports

In [None]:
# Install required packages if needed (use %pip so the install targets this kernel's environment)
# %pip install google-cloud-bigquery bigframes

In [None]:
import os
import bigframes.bigquery as bf
import matplotlib.pyplot as plt
import logging
import time
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union, Any, Generator

# Import ember_ml instead of NumPy and TensorFlow
from ember_ml import ops
from ember_ml.nn import tensor
from ember_ml.ops import get_backend

# Report which compute backend ember_ml resolved to for this session
current_backend = get_backend()
print("Using {} backend".format(current_backend))

# Send INFO-and-above records through a timestamped layout, then grab the
# named logger used by this notebook
_log_layout = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_log_layout, level=logging.INFO)
logger = logging.getLogger('bigquery_pipeline')

# Import our components
from ember_ml.nn.features.terabyte_feature_extractor_bigframes import TerabyteFeatureExtractor, TerabyteTemporalStrideProcessor
from ember_ml.models.optimized_rbm import OptimizedRBM
from ember_ml.models.stride_aware_cfc import (
    create_liquid_network_with_motor_neuron,
    create_lstm_gated_liquid_network,
    create_multi_stride_liquid_network
)

## BigQuery Connection Setup

Set up the connection to BigQuery. You can use service account credentials or application default credentials.

In [None]:
# --- Connection settings: edit these for your environment ---

# GCP project that owns the BigQuery datasets
PROJECT_ID = "your-project-id"  # Replace with your project ID

# Service-account key file (optional)
CREDENTIALS_PATH = 'path/to/your/credentials.json'  # Replace with path to credentials.json if needed

# BigQuery location
LOCATION = "US"

# Build the feature extractor. The numeric settings are forwarded unchanged;
# presumably chunk_size is rows per chunk and max_memory_gb a memory ceiling --
# confirm against TerabyteFeatureExtractor.
extractor_settings = {
    "project_id": PROJECT_ID,
    "location": LOCATION,
    "chunk_size": 100000,
    "max_memory_gb": 16.0,
    "verbose": True,
}
feature_extractor = TerabyteFeatureExtractor(**extractor_settings)

# Open the BigQuery connection using the configured credentials path
feature_extractor.setup_bigquery_connection(CREDENTIALS_PATH)

# Echo the effective connection settings
for status_line in (
    f"Connected to BigQuery project: {PROJECT_ID}",
    f"Using location: {LOCATION}",
    "Feature extractor initialized with BigFrames support",
):
    print(status_line)

## Explore Available Tables

Let's explore the available tables in your BigQuery project.

In [None]:
# Import BigQuery client
from google.cloud import bigquery

# Create a client bound to the configured project
client = bigquery.Client(project=PROJECT_ID)

# List datasets
datasets = list(client.list_datasets())
print(f"Datasets in project {PROJECT_ID}:")
for dataset in datasets:
    print(f"- {dataset.dataset_id}")

# Explore the first dataset (if the project has any)
if datasets:
    dataset_id = datasets[0].dataset_id
    print(f"\nTables in dataset {dataset_id}:")
    tables = list(client.list_tables(dataset_id))
    # Separate loop name so `table` below only ever refers to the full Table
    # object returned by get_table, not a list item
    for table_item in tables:
        print(f"- {table_item.table_id}")

    # Get more details about the first table
    if tables:
        first_table = tables[0]
        table_ref = f"{dataset_id}.{first_table.table_id}"
        table = client.get_table(table_ref)

        print(f"\nDetails for table {table_ref}:")
        print(f"Description: {table.description}")
        print(f"Row count: {table.num_rows}")
        print(f"Created: {table.created}")
        print(f"Last modified: {table.modified}")

        print("\nSchema:")
        for field in table.schema:
            print(f"- {field.name} ({field.field_type})")

        # Preview data
        print("\nPreview data:")
        if table.table_type == "TABLE":
            # Read the first rows directly from storage. A `SELECT * ... LIMIT 5`
            # query still scans (and bills) the whole table, which is expensive
            # on the terabyte-sized tables this notebook targets.
            results = client.list_rows(table, max_results=5)
        else:
            # Views and external tables cannot be listed directly, so fall
            # back to a query for those table types.
            query = f"SELECT * FROM `{PROJECT_ID}.{table_ref}` LIMIT 5"
            query_job = client.query(query)
            results = query_job.result()

        for row in results:
            print(row)
else:
    print("No datasets found in this project.")

## Extract Features from BigQuery

Now let's extract features from a BigQuery table. Set `TABLE_ID` to the table you want to use (in `dataset.table` form) and, if you have one, `TARGET_COLUMN` to its target column.

In [None]:
# Set the table ID
TABLE_ID = "your-dataset.your-table"  # Replace with your table ID

# Set the target column (optional)
TARGET_COLUMN = 'target_column'  # Replace with your target column if needed

# Set a limit for testing (remove for full dataset)
LIMIT = 10000


# Defined at cell level (not inside the success branch) so that later cells
# can always reference it, even when extraction fails.
def df_to_tensor(df, columns):
    """
    Convert DataFrame to ember_ml tensor.

    Args:
        df: DataFrame
        columns: Columns to include

    Returns:
        ember_ml tensor
    """
    # We need to convert to numpy array first, then to ember_ml tensor
    array_data = df[columns].to_numpy()
    return tensor.convert_to_tensor(array_data)


# Extract features
result = feature_extractor.prepare_data(
    table_id=TABLE_ID,
    target_column=TARGET_COLUMN,
    limit=LIMIT,
    force_categorical_columns=[
        # Add your categorical columns here
        "category1", "category2"
    ]
)

if result is not None:
    train_df, val_df, test_df, train_features, val_features, test_features, scaler, imputer = result

    print(f"Train shape: {train_df.shape}")
    print(f"Validation shape: {val_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Features: {train_features}")

    # Now we can use these DataFrames directly with BigFrames operations
    # For example, to get the first few rows of the training data:
    print("\nFirst few rows of training data:")
    print(train_df.head())

    # Or to get summary statistics:
    print("\nSummary statistics for training data:")
    print(train_df[train_features].describe())

    # Convert DataFrames to ember_ml tensors
    print("\nConverting DataFrames to ember_ml tensors...")
    train_tensor = df_to_tensor(train_df, train_features)
    val_tensor = df_to_tensor(val_df, val_features)
    test_tensor = df_to_tensor(test_df, test_features)

    print("\nConverted to ember_ml tensors for acceleration")
    print(f"Train tensor shape: {tensor.shape(train_tensor)}")
    print(f"Validation tensor shape: {tensor.shape(val_tensor)}")
    print(f"Test tensor shape: {tensor.shape(test_tensor)}")

    # Perform some basic operations with ember_ml ops
    print("\nBasic statistics using ember_ml ops:")
    # Compute the per-feature mean once and reuse it for the standard deviation
    train_feature_means = ops.stats.mean(train_tensor, axis=0)
    train_feature_stds = ops.sqrt(
        ops.stats.mean(ops.square(train_tensor - train_feature_means), axis=0)
    )
    print(f"Mean of train features: {train_feature_means[:5]}...")  # Show first 5 means
    print(f"Standard deviation of train features: {train_feature_stds[:5]}...")  # Show first 5 stds
    # min/max are reached through ops.stats, like mean above; a bare `stats`
    # name is never imported in this notebook and would raise NameError
    print(f"Min of train features: {ops.stats.min(train_tensor, axis=0)[:5]}...")  # Show first 5 mins
    print(f"Max of train features: {ops.stats.max(train_tensor, axis=0)[:5]}...")  # Show first 5 maxs

else:
    print("Feature extraction failed")
    # Create empty variables to avoid NameError in subsequent cells
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    test_df = pd.DataFrame()
    train_features = []
    val_features = []
    test_features = []
    train_tensor = tensor.zeros((0, 0))
    val_tensor = tensor.zeros((0, 0))
    test_tensor = tensor.zeros((0, 0))
    # The success branch also binds these two names; keep them defined here too
    scaler = None
    imputer = None

## Apply Temporal Stride Processing

Now let's apply temporal stride processing to the extracted features.

In [None]:
# Create temporal processor; every setting is forwarded unchanged to the
# TerabyteTemporalStrideProcessor constructor
temporal_settings = dict(
    window_size=10,
    stride_perspectives=[1, 3, 5],
    pca_components=32,
    batch_size=10000,
    use_incremental_pca=True,
    verbose=True,
)
temporal_processor = TerabyteTemporalStrideProcessor(**temporal_settings)

# Generator that feeds the temporal processor one batch at a time
def data_generator(df, features, batch_size=10000):
    """
    Yield consecutive row slices of a DataFrame as ember_ml tensors.

    Args:
        df: DataFrame to read from (sliced positionally with .iloc)
        features: Columns to include in every batch
        batch_size: Maximum number of rows per yielded batch

    Yields:
        The result of df_to_tensor for each slice, in row order; the last
        slice may hold fewer than batch_size rows
    """
    batch_starts = range(0, len(df), batch_size)
    yield from (
        df_to_tensor(df.iloc[start:start + batch_size], features)
        for start in batch_starts
    )

# Process data - needs the training frame and feature list from the extraction cell
if len(train_df) == 0 or len(train_features) == 0:
    print("Cannot process data: train_df or train_features is empty")
else:
    # Stream the training batches through the temporal stride processor
    stride_perspectives = temporal_processor.process_large_dataset(
        data_generator(train_df, train_features, batch_size=10000)
    )

    # Report the shape obtained for each stride
    for stride_len, stride_data in stride_perspectives.items():
        print(f"Stride {stride_len}: shape {tensor.shape(stride_data)}")

    # Collect and plot the explained variance reported for each stride
    explained_variances = []
    for stride_len in stride_perspectives.keys():
        explained_variances.append(temporal_processor.get_explained_variance(stride_len))

    variance_fig, variance_ax = plt.subplots(figsize=(10, 6))
    variance_ax.bar(stride_perspectives.keys(), explained_variances)
    variance_ax.set_xlabel('Stride Length')
    variance_ax.set_ylabel('Explained Variance Ratio')
    variance_ax.set_title('Explained Variance by Stride Length')
    plt.show()

    # Feature importance is shown for the first stride only
    first_stride = list(stride_perspectives.keys())[0]
    feature_importance = temporal_processor.get_feature_importance(first_stride)

    if feature_importance is not None:
        # matplotlib needs a NumPy array
        feature_importance_np = tensor.to_numpy(feature_importance)

        importance_fig, importance_ax = plt.subplots(figsize=(12, 6))
        importance_ax.bar(range(len(feature_importance_np)), feature_importance_np)
        importance_ax.set_xlabel('Feature Index')
        importance_ax.set_ylabel('Importance')
        importance_ax.set_title(f'Feature Importance (Stride {first_stride})')
        importance_fig.tight_layout()
        plt.show()

    # Apply temporal processing to create multi-stride features
    print("\nApplying temporal processing to create multi-stride features...")

    # Keep a per-stride copy of the transformed features and report each shape
    multi_stride_features = dict(stride_perspectives.items())
    for stride_len, stride_data in multi_stride_features.items():
        print(f"Stride {stride_len} features shape: {tensor.shape(stride_data)}")

    # Demonstrate how to combine multi-stride features
    print("\nCombining multi-stride features...")

    # Order the per-stride arrays by ascending stride before joining them
    ordered_strides = sorted(multi_stride_features.keys())
    stride_features_list = [multi_stride_features[key] for key in ordered_strides]

    # Concatenate along feature dimension (axis 1)
    # NOTE(review): this requires every stride's array to have the same number
    # of rows -- confirm that process_large_dataset guarantees it
    combined_features = ops.concatenate(stride_features_list, axis=1)

    print(f"Combined multi-stride features shape: {tensor.shape(combined_features)}")

    print("\nCalculating correlation between stride features...")

    # Covariance of the combined features with ember_ml ops:
    # center each column, then take X^T X divided by the row count
    column_means = ops.stats.mean(combined_features, axis=0)
    centered_features = combined_features - column_means
    sample_count = tensor.shape(combined_features)[0]
    cov_matrix = ops.matmul(ops.transpose(centered_features), centered_features) / sample_count

    # Convert to numpy only for visualization with matplotlib
    cov_matrix_np = tensor.to_numpy(cov_matrix)

    # Heat map of the covariance matrix
    cov_fig, cov_ax = plt.subplots(figsize=(10, 8))
    cov_image = cov_ax.imshow(cov_matrix_np, cmap='coolwarm')
    cov_fig.colorbar(cov_image, ax=cov_ax)
    cov_ax.set_title('Multi-stride Feature Covariance')
    plt.show()

    print("\nMulti-stride temporal processing complete!")

## Train Restricted Boltzmann Machine

Now let's train an RBM on the extracted features.

In [None]:
# Batch generators used by the RBM. They are defined at cell level so they
# exist regardless of whether training runs below.
def rbm_data_generator(df, features, batch_size=100):
    """
    Yield shuffled mini-batches of ``df[features]`` as NumPy arrays.

    Args:
        df: DataFrame to sample rows from (indexed positionally with .iloc)
        features: Columns to include in every batch
        batch_size: Maximum number of rows per yielded batch

    Yields:
        NumPy array for each batch; the last batch may be smaller
    """
    # Get total size
    total_size = len(df)

    # Random permutation of row positions: argsort of uniform noise
    indices = tensor.to_numpy(tensor.argsort(tensor.random_uniform((total_size,))))

    # Process in batches
    for i in range(0, total_size, batch_size):
        end_idx = min(i + batch_size, total_size)
        batch_indices = indices[i:end_idx]

        # Get batch from DataFrame
        batch = df.iloc[batch_indices]

        # Convert directly to numpy array for RBM
        yield batch[features].to_numpy()


def feature_generator(df, features, batch_size=1000):
    """
    Yield consecutive (unshuffled) batches of ``df[features]`` as NumPy arrays.

    Args:
        df: DataFrame to read from (sliced positionally with .iloc)
        features: Columns to include in every batch
        batch_size: Maximum number of rows per yielded batch

    Yields:
        NumPy array for each batch, in row order; the last batch may be smaller
    """
    # Get total size
    total_size = len(df)

    # Process in batches
    for i in range(0, total_size, batch_size):
        end_idx = min(i + batch_size, total_size)

        # Get batch from DataFrame
        batch = df.iloc[i:end_idx]

        # Convert directly to numpy array for RBM
        yield batch[features].to_numpy()


# Create RBM
if len(train_features) > 0:
    # Initialize RBM with one visible unit per input feature
    rbm = OptimizedRBM(
        n_visible=len(train_features),
        n_hidden=64,
        learning_rate=0.01,
        momentum=0.5,
        weight_decay=0.0001,
        batch_size=100,
        use_binary_states=False,
        use_gpu=True,
        verbose=True
    )

    # Train RBM
    # NOTE(review): a generator object can only be consumed once. If
    # train_in_chunks re-iterates its input on every epoch, epochs after the
    # first would receive no data -- confirm against OptimizedRBM.train_in_chunks.
    training_errors = rbm.train_in_chunks(
        rbm_data_generator(train_df, train_features, batch_size=100),
        epochs=10,
        k=1
    )

    # Plot training errors
    plt.figure(figsize=(10, 6))
    plt.plot(training_errors)
    plt.xlabel('Epoch')
    plt.ylabel('Reconstruction Error')
    plt.title('RBM Training Error')
    plt.show()

    # Extract features from RBM
    train_rbm_features = rbm.transform_in_chunks(
        feature_generator(train_df, train_features, batch_size=1000)
    )

    val_rbm_features = rbm.transform_in_chunks(
        feature_generator(val_df, val_features, batch_size=1000)
    )

    test_rbm_features = rbm.transform_in_chunks(
        feature_generator(test_df, test_features, batch_size=1000)
    )

    # Convert to ember_ml tensors
    train_rbm_tensor = tensor.convert_to_tensor(train_rbm_features)
    val_rbm_tensor = tensor.convert_to_tensor(val_rbm_features)
    test_rbm_tensor = tensor.convert_to_tensor(test_rbm_features)

    print(f"Train RBM features shape: {tensor.shape(train_rbm_tensor)}")
    print(f"Validation RBM features shape: {tensor.shape(val_rbm_tensor)}")
    print(f"Test RBM features shape: {tensor.shape(test_rbm_tensor)}")

    # Visualize RBM feature distributions
    plt.figure(figsize=(12, 8))

    # Plot histograms for first 16 RBM features
    for i in range(min(16, tensor.shape(train_rbm_tensor)[1])):
        plt.subplot(4, 4, i+1)
        # Convert to numpy for matplotlib
        feature_np = tensor.to_numpy(train_rbm_tensor[:, i])
        plt.hist(feature_np, bins=30, alpha=0.7)
        plt.title(f'Feature {i+1}')

    # Lay out the grid once, after all subplots exist, instead of per iteration
    plt.tight_layout()
    plt.suptitle('RBM Feature Distributions', y=1.02)
    plt.show()

    # Visualize feature correlations
    plt.figure(figsize=(10, 8))
    # Center each column (ops.stats.mean, as used elsewhere in this notebook)
    centered_features = train_rbm_tensor - ops.stats.mean(train_rbm_tensor, axis=0)
    # Scale each column by its standard deviation so that X^T X / N is a
    # correlation matrix rather than a covariance matrix. The small epsilon
    # keeps constant (zero-variance) hidden units from dividing by zero.
    feature_std = ops.sqrt(ops.stats.mean(ops.square(centered_features), axis=0))
    standardized_features = centered_features / (feature_std + 1e-8)
    corr_matrix = ops.matmul(
        ops.transpose(standardized_features),
        standardized_features
    ) / tensor.shape(train_rbm_tensor)[0]

    # Convert to numpy only for visualization with matplotlib
    corr_matrix_np = tensor.to_numpy(corr_matrix)

    # Correlations lie in [-1, 1]; fix the colour scale so 0 maps to the
    # midpoint of the diverging colormap
    plt.imshow(corr_matrix_np, cmap='coolwarm', vmin=-1, vmax=1)
    plt.colorbar()
    plt.title('RBM Feature Correlations')
    plt.show()

    print("RBM feature extraction complete!")
else:
    print("Cannot train RBM: train_features is empty")