<a href="https://colab.research.google.com/github/VortexOsxo/Chess/blob/master/data_transform/pca-svd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Base folder on the mounted drive; the load/save helpers in this notebook
# prefix their 'Data/...' paths with it (note the trailing slash).
file_path = '/content/drive/My Drive/Inf8245/'

Mounted at /content/drive


### Utils

In [2]:
import numpy as np
import pandas as pd

def load_meta_data():
    """Load the train and test metadata tables.

    Both CSV files are read from ``{file_path}Data/``.

    Returns:
        tuple: ``(meta_train, meta_test)`` as pandas DataFrames.
    """
    meta_train = pd.read_csv(f'{file_path}Data/metadata_train.csv')
    # NOTE(review): the test file name is assumed to mirror the train one
    # (metadata_test.csv, by analogy with train.npz / test.npz) - confirm
    # against the actual contents of the Data folder.
    meta_test = pd.read_csv(f'{file_path}Data/metadata_test.csv')
    return meta_train, meta_test

def load_data():
    """Load the train and test arrays from the ``.npz`` archives.

    Reads ``train.npz`` (keys ``X_train`` and ``y_train``) and ``test.npz``
    (key ``X_test``) from ``{file_path}Data/``.

    Returns:
        tuple: ``(X_train, y_train, X_test)`` as NumPy arrays.
    """
    data_dir = f'{file_path}Data/'

    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; extract the arrays inside a `with` so the handle is released.
    with np.load(f'{data_dir}train.npz') as data_train:
        X_train, y_train = data_train["X_train"], data_train["y_train"]
    with np.load(f'{data_dir}test.npz') as data_test:
        X_test = data_test["X_test"]

    return X_train, y_train, X_test

def merge_train_test(X_train, X_test, debug=False):
    """Stack the test rows underneath the train rows.

    Args:
        X_train: Array whose rows come first in the result.
        X_test: Array appended after ``X_train`` along axis 0.
        debug: When true, print the shape of the stacked array.

    Returns:
        The concatenation of ``X_train`` and ``X_test`` along axis 0.
    """
    stacked = np.concatenate((X_train, X_test), axis=0)
    if debug:
        print(stacked.shape)
    return stacked

def remove_null_variance_column(X_train, X_test, debug=False):
    """Drop the columns that are constant across the training rows.

    A column is kept when its minimum and maximum over ``X_train`` differ.
    The same column selection is applied to ``X_test``.

    Args:
        X_train: 2-D training array used to decide which columns to keep.
        X_test: 2-D array with the same column layout as ``X_train``.
        debug: When true, print the column count before and after filtering.

    Returns:
        tuple: ``(X_train, X_test)`` restricted to the kept columns.
    """
    col_min = X_train.min(axis=0)
    col_max = X_train.max(axis=0)
    keep = (col_min != col_max)

    if debug:
        print(f'Initial Column number: {X_train.shape[1]}')
    X_train, X_test = X_train[:, keep], X_test[:, keep]
    if debug:
        print(f'After removed null variance: {X_train.shape[1]}')

    return X_train, X_test

def StandardNormalization(X_train, X_test):
    """Scale both splits with a scikit-learn ``StandardScaler``.

    The scaler is fitted on ``X_train`` only and then applied to both
    splits, so the test split never influences the fitted statistics.

    Returns:
        tuple: ``(scaled_train, scaled_test)``.
    """
    from sklearn.preprocessing import StandardScaler

    fitted = StandardScaler().fit(X_train)
    train_scaled = fitted.transform(X_train)
    test_scaled = fitted.transform(X_test)
    return train_scaled, test_scaled

def MinMaxNormalization(X_train, X_test):
    """Scale both splits with a scikit-learn ``MinMaxScaler``.

    The scaler is fitted on ``X_train`` only and then applied to both
    splits, so the test split never influences the fitted ranges.

    Returns:
        tuple: ``(scaled_train, scaled_test)``.
    """
    from sklearn.preprocessing import MinMaxScaler

    fitted = MinMaxScaler().fit(X_train)
    train_scaled = fitted.transform(X_train)
    test_scaled = fitted.transform(X_test)
    return train_scaled, test_scaled


### PCA

Run incremental PCA for up to `max_epochs` epochs, feeding the data in mini-batches of `batch_size` samples. Training stops early once the change in total explained-variance ratio between two consecutive epochs falls below `tolerance`.

In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

def incremental_pca(data, n_components, tolerance=1e-4, max_epochs=10, batch_size=500, debug=True):
    """Fit an ``IncrementalPCA`` by streaming shuffled mini-batches over several epochs.

    Every epoch draws a fresh random permutation of the rows of ``data`` and
    passes them to ``partial_fit`` in chunks of ``batch_size``. After each
    epoch the summed ``explained_variance_ratio_`` is compared with the value
    from the previous epoch; training stops early once the absolute change is
    below ``tolerance``.

    Args:
        data: Array of samples (rows) to fit; must support ``len()`` and
            indexing with an integer index array.
        n_components: Number of principal components to keep.
        tolerance: Early-stopping threshold on the epoch-to-epoch change of
            the total explained-variance ratio.
        max_epochs: Maximum number of passes over ``data``.
        batch_size: Number of rows per ``partial_fit`` call.
        debug: When true, print progress messages.

    Returns:
        The ``IncrementalPCA`` instance in its state after the last epoch run.

    Raises:
        ValueError: If ``data`` contains no rows.
    """
    # Size the epochs from the array actually being fitted (not from a
    # notebook-level global), so every row of `data` is visited.
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError("incremental_pca requires at least one sample")

    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

    if debug:
        print(f"Starting Incremental PCA with {max_epochs} epochs and tolerance {tolerance}")

    prev_variance_sum = None
    for epoch in range(1, max_epochs + 1):
        if debug:
            print(f"Epoch {epoch}: Starting...")

        # Index batch by batch instead of materialising a full shuffled copy
        # of `data`; the batches are identical but peak memory stays low.
        shuffled_indices = np.random.permutation(n_samples)
        for start in range(0, n_samples, batch_size):
            batch = data[shuffled_indices[start:start + batch_size]]
            ipca.partial_fit(batch)

        current_variance_sum = ipca.explained_variance_ratio_.sum()
        if debug:
            print(f"Epoch {epoch} finished. Total Explained Variance: {current_variance_sum:.4f}")

        # The first epoch has nothing to compare against.
        if prev_variance_sum is not None:
            variance_diff = abs(current_variance_sum - prev_variance_sum)
            if debug:
                print(f"Variance change from last epoch: {variance_diff:.6f}")

            if variance_diff < tolerance:
                if debug:
                    print("\n **Converged!** Variance change is below tolerance.")
                break

        prev_variance_sum = current_variance_sum
    else:
        # Reached only when the loop was not left through `break`.
        if debug:
            print("\n **Stopped!** Maximum number of epochs reached without convergence.")

    return ipca

### SVD

In [4]:
from sklearn.decomposition import TruncatedSVD

def truncated_svd(data, n_components, tolerance=1e-5):
    """Fit a ``TruncatedSVD`` with the ARPACK solver on ``data``.

    Args:
        data: Samples to fit.
        n_components: Number of components to keep.
        tolerance: Passed to the estimator as ``tol``.

    Returns:
        The fitted ``TruncatedSVD`` estimator.
    """
    model = TruncatedSVD(
        n_components=n_components,
        algorithm='arpack',
        tol=tolerance,
    )
    model.fit(data)
    return model

### Save & Load

In [12]:
def save_transform(ipca, X_train, X_test, transform, name):
    """Project both splits with a fitted model and save them as ``.npy`` files.

    The outputs are written to ``{file_path}Data/{transform}/{name}train.npy``
    and ``{file_path}Data/{transform}/{name}test.npy``.

    Args:
        ipca: Fitted model exposing ``transform``. Despite the parameter
            name, the driver cell also passes the SVD model here.
        X_train: Training samples to project.
        X_test: Test samples to project.
        transform: Sub-folder name under ``Data/`` (e.g. ``'PCA'``, ``'SVD'``).
        name: File-name prefix identifying the run configuration.
    """
    import os

    out_dir = f'{file_path}Data/{transform}'
    # np.save does not create missing folders; make sure the target exists so
    # a long fit is not lost to a FileNotFoundError at save time.
    os.makedirs(out_dir, exist_ok=True)

    X_train_pca = ipca.transform(X_train)
    X_test_pca = ipca.transform(X_test)

    np.save(f'{out_dir}/{name}train.npy', X_train_pca)
    np.save(f'{out_dir}/{name}test.npy', X_test_pca)

def load_transform(transform, n_components, is_combined=True, tolerance=1e-4):
    """Load a previously saved pair of projected train/test arrays.

    The file-name prefix is rebuilt from the run configuration using the same
    pattern as the driver cell that saves the arrays.

    Args:
        transform: Sub-folder name under ``Data/`` (e.g. ``'PCA'``, ``'SVD'``).
        n_components: Component count used in the file name.
        is_combined: Combined flag used in the file name (lower-cased).
        tolerance: Tolerance value used in the file name.

    Returns:
        tuple: ``(X_train, X_test)`` loaded with ``np.load``.
    """
    name = f'nc_{n_components}_combined_{str(is_combined).lower()}_tol_{tolerance}'
    base = f'{file_path}Data/{transform}/{name}'
    train_arr = np.load(f'{base}train.npy')
    test_arr = np.load(f'{base}test.npy')
    return train_arr, test_arr

def load_pca(n_components, combined, tolerance=1e-4):
    """Load the saved train/test projections from the ``'PCA'`` folder."""
    return load_transform(
        'PCA',
        n_components,
        is_combined=combined,
        tolerance=tolerance,
    )

def load_sdv(n_components, combined, tolerance=1e-4):
    """Load the saved train/test projections from the ``'SVD'`` folder.

    The function name is spelled ``sdv``; it reads the ``'SVD'`` outputs.
    """
    return load_transform(
        'SVD',
        n_components,
        is_combined=combined,
        tolerance=tolerance,
    )


### Run

In [None]:
# --- Run configuration -------------------------------------------------------
# When True the model is fitted on train + test rows, otherwise on train only.
is_combined = True
n_components = [] # 64, 96, 128, 192, 256, 512, 1024  Max seems to be ~600 with 51GB RAM (for PCA)
tolerance = 1e-5
batch_size = 600

# Which decomposition to fit: 'PCA' (incremental) or 'SVD' (truncated).
transform = 'SVD'
if __name__ == '__main__':
    X_train, y_train, X_test = load_data()
    X_train, X_test = remove_null_variance_column(X_train, X_test)

    # is_combined is a bool, so test its truthiness; comparing it with the
    # string 'true' never matches and would silently fit on train rows only
    # while the saved file name still says "combined_true".
    data = merge_train_test(X_train, X_test) if is_combined else X_train
    for n in n_components:
        # Same naming pattern that load_transform rebuilds when reading back.
        name = f'nc_{n}_combined_{str(is_combined).lower()}_tol_{tolerance}'
        if transform == 'PCA':
            model = incremental_pca(data, n, tolerance, batch_size=batch_size)
        elif transform == 'SVD':
            model = truncated_svd(data, n, tolerance)
        else:
            # Raise a real exception: `raise 1` is itself a TypeError.
            raise ValueError(f"Unknown transform {transform!r}; expected 'PCA' or 'SVD'")

        save_transform(model, X_train, X_test, transform, name)
        print(f'Saved: {name}')