In [1]:
from google.colab import drive

# Mount the user's Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')
# Base directory of the project on the mounted drive. The trailing slash matters:
# later cells build paths by plain concatenation (f'{file_path}Data/...').
file_path = '/content/drive/My Drive/Inf8245/'

Mounted at /content/drive


In [2]:
# Single seed value handed to set_seed() at the end of this cell.
RANDOM_SEED = 42
import os

# Setting this environment variable fixes the CuBLAS non-determinism issue.
# It is assigned before `import torch`; presumably so that CUDA/cuBLAS picks it up at
# initialisation — keep this ordering (TODO confirm against the PyTorch reproducibility notes).
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import torch
import numpy as np
import random

def set_seed(seed_value):
    """Seed every RNG used in this notebook and request deterministic GPU kernels.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, plus
    every CUDA device when a GPU is available). On CUDA machines it additionally
    puts cuDNN in deterministic mode, turns off the cuDNN benchmark autotuner and
    asks PyTorch to use deterministic algorithms.

    Args:
        seed_value: Integer seed applied to all generators.

    Returns:
        None. A confirmation line is printed once seeding is done.
    """
    random.seed(seed_value)
    # NOTE: setting PYTHONHASHSEED here cannot re-seed string hashing of the already
    # running interpreter; it only affects processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    np.random.seed(seed_value)

    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        try:
            torch.use_deterministic_algorithms(True)
        except Exception as err:
            # Best effort only (e.g. the function is missing on old PyTorch releases).
            # Catching Exception instead of a bare `except:` keeps Ctrl-C / SystemExit
            # working and lets us report what actually went wrong.
            print(f"Note: torch.use_deterministic_algorithms(True) failed ({err!r}). Check PyTorch version.")

    print(f"Global seed set to {seed_value}.")
set_seed(RANDOM_SEED)

Global seed set to 42.


### Utils

In [3]:
import numpy as np
import pandas as pd

def load_meta_data():
    """Load the train and test metadata tables from ``{file_path}Data/``.

    Returns:
        tuple: ``(meta_train, meta_test)`` as pandas DataFrames.
    """
    meta_train = pd.read_csv(f'{file_path}Data/metadata_train.csv')
    # NOTE(review): the previous version read metadata_train.csv a second time here
    # (copy-paste slip) and returned nothing. The test file name below assumes the
    # same naming scheme as the train file — confirm it matches the file on Drive.
    meta_test = pd.read_csv(f'{file_path}Data/metadata_test.csv')
    return meta_train, meta_test

def load_data():
    """Read the train and test arrays from the ``.npz`` archives under ``{file_path}Data/``.

    The archives are opened with a context manager so their file handles are
    closed as soon as the needed arrays have been read (the previous version left
    both archives open). The original comments suggest each archive also holds an
    ``ids`` entry; it is not loaded here.

    Returns:
        tuple: ``(X_train, y_train, X_test)`` taken from the ``"X_train"`` and
        ``"y_train"`` entries of ``train.npz`` and the ``"X_test"`` entry of ``test.npz``.
    """
    with np.load(f'{file_path}Data/train.npz') as train_archive:
        # Indexing materialises the array, so it stays valid after the archive closes.
        X_train = train_archive["X_train"]
        y_train = train_archive["y_train"]

    with np.load(f'{file_path}Data/test.npz') as test_archive:
        X_test = test_archive["X_test"]

    return X_train, y_train, X_test

def merge_train_test(X_train, X_test, debug=False):
    """Append the test rows after the training rows along axis 0.

    Args:
        X_train: Array of training samples.
        X_test: Array of test samples, concatenable with ``X_train`` on axis 0.
        debug: When true, print the shape of the combined array.

    Returns:
        The concatenated array (training rows first).
    """
    stacked = np.concatenate((X_train, X_test), axis=0)
    if debug:
        print(stacked.shape)
    return stacked

def remove_null_variance_column(X_train, X_test, debug=False):
    """Drop every column whose value is constant over the training rows.

    A column is kept when its minimum and maximum over ``X_train`` differ; the
    same column selection is then applied to ``X_test``.

    Args:
        X_train: 2-D training array used to decide which columns to keep.
        X_test: 2-D test array with the same number of columns.
        debug: When true, print the column count before and after filtering.

    Returns:
        tuple: ``(X_train, X_test)`` restricted to the non-constant columns.
    """
    lowest = X_train.min(axis=0)
    highest = X_train.max(axis=0)
    keep_column = lowest != highest

    if debug:
        print(f'Initial Column number: {X_train.shape[1]}')

    train_kept = X_train[:, keep_column]
    test_kept = X_test[:, keep_column]

    if debug:
        print(f'After removed null variance: {train_kept.shape[1]}')

    return train_kept, test_kept

def StandardNormalization(X_train, X_test):
    """Standardise both splits with a ``StandardScaler`` fitted on the training split only.

    Args:
        X_train: Training array used to fit the scaler.
        X_test: Test array transformed with the training statistics.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import StandardScaler

    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled

def MinMaxNormalization(X_train, X_test):
    """Rescale both splits with a ``MinMaxScaler`` fitted on the training split only.

    Args:
        X_train: Training array used to fit the scaler.
        X_test: Test array transformed with the training minima/maxima.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import MinMaxScaler

    fitted_scaler = MinMaxScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled


### PCA

Run incremental PCA for successive epochs until the change in total explained-variance ratio between two consecutive epochs falls below `tolerance`, for at most `max_epochs` epochs, feeding the data in mini-batches of `batch_size` samples.

In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

def incremental_pca(data, n_components, tolerance=1e-4, max_epochs=10, batch_size=500, debug=True):
  """Fit an ``IncrementalPCA`` on ``data`` over several shuffled epochs.

  Each epoch shuffles the rows (NumPy global RNG) and feeds them to
  ``partial_fit`` in chunks of ``batch_size``. Training stops early once the
  total explained-variance ratio changes by less than ``tolerance`` between two
  consecutive epochs, otherwise after ``max_epochs`` epochs.

  Args:
      data: 2-D array of samples to fit on.
      n_components: Number of principal components to keep.
      tolerance: Convergence threshold on the epoch-to-epoch change of the
          summed explained-variance ratio.
      max_epochs: Maximum number of passes over ``data``.
      batch_size: Number of rows per ``partial_fit`` call.
      debug: When true, print progress messages (all messages are now gated by
          this flag; two of them used to be printed unconditionally).

  Returns:
      The fitted ``IncrementalPCA`` instance.

  Raises:
      ValueError: If ``data`` contains no rows.
  """
  ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
  # Fix: size the epoch from the `data` argument. The previous version used the
  # notebook-global `X_train`, so with combined train+test input the rows beyond
  # len(X_train) were never visited.
  n_samples = len(data)
  if n_samples == 0:
      raise ValueError("incremental_pca requires at least one sample")

  prev_variance_sum = None

  if debug: print(f"Starting Incremental PCA with {max_epochs} epochs and tolerance {tolerance}")

  for epoch in range(1, max_epochs + 1):
      if debug: print(f"Epoch {epoch}: Starting...")

      shuffled = data[np.random.permutation(n_samples)]

      # NOTE(review): depending on the scikit-learn version, partial_fit may reject a
      # trailing batch that has fewer rows than n_components — verify for your sizes.
      for start in range(0, n_samples, batch_size):
          ipca.partial_fit(shuffled[start:start + batch_size])

      # Only the value after the last batch of the epoch is used for convergence.
      current_variance_sum = ipca.explained_variance_ratio_.sum()
      if debug: print(f"Epoch {epoch} finished. Total Explained Variance: {current_variance_sum:.4f}")

      if prev_variance_sum is not None:
          variance_diff = abs(current_variance_sum - prev_variance_sum)
          if debug: print(f"Variance change from last epoch: {variance_diff:.6f}")

          if variance_diff < tolerance:
              if debug: print("\n **Converged!** Variance change is below tolerance.")
              break

      prev_variance_sum = current_variance_sum

  else:
      # for/else: reached only when the loop finished without hitting `break`.
      if debug: print("\n **Stopped!** Maximum number of epochs reached without convergence.")

  return ipca

### SVD

In [5]:
from sklearn.decomposition import TruncatedSVD

def truncated_svd(data, n_components, tolerance=1e-5):
  """Fit a ``TruncatedSVD`` (ARPACK solver) on ``data`` and return the fitted model.

  Args:
      data: Samples to decompose.
      n_components: Number of singular vectors to keep.
      tolerance: Passed to the estimator as ``tol``.

  Returns:
      The fitted ``TruncatedSVD`` instance.
  """
  estimator = TruncatedSVD(
      n_components=n_components,
      algorithm='arpack',
      tol=tolerance,
  )
  return estimator.fit(data)

### Autoencoder


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np

class LinearAutoencoder(nn.Module):
    """Autoencoder made of one linear encoder layer and one linear decoder layer.

    ``forward`` returns the reconstruction of its input; ``transform`` returns the
    latent encoding of a NumPy-compatible array as a NumPy array.
    """

    def __init__(self, input_dim, n_components):
        """Create the two linear maps.

        Args:
            input_dim: Number of input features.
            n_components: Size of the latent space.
        """
        super().__init__()
        # Input -> latent space (plays the role of the PCA components).
        self.encoder = nn.Linear(in_features=input_dim, out_features=n_components, bias=True)
        # Latent space -> reconstruction in the input space.
        self.decoder = nn.Linear(in_features=n_components, out_features=input_dim, bias=True)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.decoder(self.encoder(x))

    def transform(self, X):
        """Encode ``X`` without tracking gradients.

        Switches the module to eval mode, converts ``X`` to a float32 tensor on the
        device that holds the parameters, and returns the encoder output as a
        NumPy array on the CPU.
        """
        self.eval()
        target_device = next(self.parameters()).device
        inputs = torch.tensor(X, dtype=torch.float32).to(target_device)
        with torch.no_grad():
            latent = self.encoder(inputs)
        return latent.cpu().numpy()

def train_linear_ae(data, n_components, epochs=100, batch_size=500, lr=1e-3, tolerance=1e-5, patience=5):
    """Train a ``LinearAutoencoder`` on ``data`` with MSE reconstruction loss and early stopping.

    Uses Adam with learning rate ``lr`` on shuffled mini-batches. After each epoch
    the mean of the per-batch losses is recorded; training stops when that value
    has not improved by more than ``tolerance`` for ``patience`` epochs in a row,
    or after ``epochs`` epochs. A loss curve is plotted before returning.

    Args:
        data: 2-D array of samples; the second dimension is the input size.
        n_components: Latent dimension of the autoencoder.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size for the data loader.
        lr: Adam learning rate.
        tolerance: Minimum decrease of the epoch loss that counts as an improvement.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        The trained ``LinearAutoencoder`` (weights as of the last epoch run).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on {device}...")

    model = LinearAutoencoder(data.shape[1], n_components).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The dataset yields (input, target) pairs where the target is the input itself.
    features = torch.tensor(data, dtype=torch.float32)
    loader = DataLoader(
        TensorDataset(features, features),
        batch_size=batch_size,
        shuffle=True,
    )

    loss_history = []
    best_loss = float('inf')
    stale_epochs = 0  # consecutive epochs without a sufficient improvement

    for epoch_index in range(epochs):
        epoch_number = epoch_index + 1
        model.train()
        running_loss = 0

        for inputs, _ in loader:
            inputs = inputs.to(device)

            optimizer.zero_grad()
            reconstruction = model(inputs)
            batch_loss = criterion(reconstruction, inputs)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_loss = running_loss / len(loader)
        loss_history.append(avg_loss)

        # Early stopping: an epoch counts as progress only if it beats the best
        # loss so far by more than `tolerance`.
        made_progress = avg_loss < best_loss - tolerance
        if made_progress:
            best_loss, stale_epochs = avg_loss, 0
        else:
            stale_epochs += 1

        if stale_epochs >= patience:
            print(f"Converged at Epoch {epoch_number} with Loss: {avg_loss:.6f}")
            break

        if epoch_number % 5 == 0:
            print(f'Epoch [{epoch_number}/{epochs}], Loss: {avg_loss:.6f}')

    # Loss curve for this run.
    plt.figure(figsize=(8, 4))
    plt.plot(loss_history, label='Reconstruction Loss')
    plt.xlabel('Epochs')
    plt.ylabel('MSE Loss')
    plt.title(f'Convergence: {n_components} Components')
    plt.legend()
    plt.grid(True)
    plt.show()  # renders the figure in the notebook output

    return model

### Save & Load

In [7]:
def save_transform(ipca, X_train, X_test, transform, name):
  """Project both splits with a fitted model and save the results as ``.npy`` files.

  Files are written to ``{file_path}Data/{transform}/{name}train.npy`` and
  ``{file_path}Data/{transform}/{name}test.npy``. The target directory is
  created if it does not exist yet, so a long fit is not lost to a missing
  folder (``np.save`` does not create directories).

  Args:
      ipca: Any fitted model exposing ``transform(X)`` (the name is historical;
          the run cell also passes SVD and autoencoder models).
      X_train: Training array to project.
      X_test: Test array to project.
      transform: Sub-directory name under ``Data/`` (e.g. ``'PCA'``).
      name: File-name prefix.

  Returns:
      None.
  """
  import os  # local import so the function does not rely on another cell's imports

  out_dir = f'{file_path}Data/{transform}'
  os.makedirs(out_dir, exist_ok=True)

  X_train_pca = ipca.transform(X_train)
  X_test_pca = ipca.transform(X_test)

  np.save(f'{out_dir}/{name}train.npy', X_train_pca)
  np.save(f'{out_dir}/{name}test.npy', X_test_pca)

def load_transform(transform, n_components, is_combined=True, tolerance=1e-4):
  """Load a previously saved pair of projected arrays.

  The file prefix is rebuilt from the arguments as
  ``nc_{n_components}_combined_{true|false}_tol_{tolerance}`` inside
  ``{file_path}Data/{transform}/``.

  Args:
      transform: Sub-directory name under ``Data/``.
      n_components: Component count used in the file name.
      is_combined: Flag used in the file name (lower-cased ``True``/``False``).
      tolerance: Tolerance value used in the file name.

  Returns:
      tuple: ``(X_train, X_test)`` loaded from the two ``.npy`` files.
  """
  combined_tag = str(is_combined).lower()
  prefix = f'{file_path}Data/{transform}/nc_{n_components}_combined_{combined_tag}_tol_{tolerance}'
  train_projected = np.load(f'{prefix}train.npy')
  test_projected = np.load(f'{prefix}test.npy')
  return train_projected, test_projected

def load_pca(n_components, combined, tolerance=1e-4):
  """Load the saved PCA projections; thin wrapper around ``load_transform('PCA', ...)``."""
  return load_transform(
      'PCA',
      n_components,
      is_combined=combined,
      tolerance=tolerance,
  )

def load_svd(n_components, combined, tolerance=1e-4):
  """Load the saved SVD projections; thin wrapper around ``load_transform('SVD', ...)``.

  Args:
      n_components: Component count used in the file name.
      combined: Whether the model was fitted on train+test (used in the file name).
      tolerance: Tolerance value used in the file name.

  Returns:
      tuple: ``(X_train, X_test)`` as returned by ``load_transform``.
  """
  return load_transform('SVD', n_components, combined, tolerance)

# Backward-compatible alias: the function was originally published under the
# misspelled name `load_sdv`; keep it so existing callers continue to work.
load_sdv = load_svd


### Run

In [None]:
# --- Run configuration -------------------------------------------------------
# is_combined: fit the reducer on train+test rows (True) or on train rows only.
is_combined = True
n_components = [96, 128, 192, 256, 512,] # 64, 96, 128, 192, 256, 512, 1024  Max seems to be ~600 with 51GB RAM (for PCA)
# tolerance is part of the saved file name, so it is forwarded to every reducer.
tolerance = 1e-5
batch_size = 256

# One of 'PCA', 'SVD', 'AE'; also used as the output sub-directory under Data/.
transform = 'AE'
if __name__ == '__main__':
    # Fail before the (slow) data load if the transform name is not supported.
    if transform not in ('PCA', 'SVD', 'AE'):
        raise ValueError("Unknown transform type")

    X_train, y_train, X_test = load_data()
    X_train, X_test = remove_null_variance_column(X_train, X_test)

    # The model is fitted on `data`, but the projections are always computed and
    # saved for X_train and X_test separately.
    data = merge_train_test(X_train, X_test) if is_combined else X_train
    for n in n_components:
        name = f'nc_{n}_combined_{str(is_combined).lower()}_tol_{tolerance}'

        if transform == 'PCA':
            model = incremental_pca(data, n, tolerance, batch_size=batch_size)
        elif transform == 'SVD':
            model = truncated_svd(data, n, tolerance)
        else:  # transform == 'AE'
            # Fix: pass `tolerance` through. It was previously omitted, so the
            # "tol_..." tag in the file name only matched the training setup by
            # coincidence with train_linear_ae's default.
            model = train_linear_ae(data, n, epochs=100, batch_size=batch_size, tolerance=tolerance)

        save_transform(model, X_train, X_test, transform, name)
        print(f'Saved: {name}')

Training on cuda...
Epoch [5/100], Loss: 111.021280
Epoch [10/100], Loss: 1.296625
Epoch [15/100], Loss: 1.054853
Epoch [20/100], Loss: 1.044140
Epoch [25/100], Loss: 1.035530
Epoch [30/100], Loss: 1.027152
Epoch [35/100], Loss: 1.018735
Epoch [40/100], Loss: 1.010402
Epoch [45/100], Loss: 1.001764
Epoch [50/100], Loss: 0.992828
Epoch [55/100], Loss: 0.984007
Epoch [60/100], Loss: 0.974281
Epoch [65/100], Loss: 0.964197
Epoch [70/100], Loss: 0.954350
Epoch [75/100], Loss: 0.944162
