In [10]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from OGMM import OGMM, init_cov_and_T_split
from sklearn.mixture import GaussianMixture
from scipy.special import logsumexp

from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [11]:
# Load the CSTR array. Columns [:-4] are used as features, column -4 as the
# class label and column -3 as the domain id; the last two columns are not
# read in this cell.
CSTR_data = np.load('./data/cstr_acfmeandata.npy')
CSTR_features, CSTR_labels, CSTR_domains = (
    CSTR_data[:, :-4],
    CSTR_data[:, -4],
    CSTR_data[:, -3],
)

# Seeded 80/20 hold-out split, plus torch views of both feature matrices.
# NOTE(review): the MGR loading cell assigns the same X_train/X_test names, so
# whichever of the two cells ran last decides what later cells see.
X_train, X_test, y_train, y_test = train_test_split(
    CSTR_features,
    CSTR_labels,
    test_size=0.2,
    random_state=0,
)
X_train_t, X_test_t = torch.from_numpy(X_train), torch.from_numpy(X_test)

In [3]:
# Load the MGR array. Columns [:-2] are used as features, column -2 as the
# class label and column -1 as the domain id.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# will not run elsewhere until it is made relative/configurable.
MGR_data = np.load(r'C:\Users\SL276123\Documents\Online DaDiL\MGR_WBTransport\MGR_WBTransport\data\MGR100.npy')
MGR_features, MGR_labels, MGR_domains = (
    MGR_data[:, :-2],
    MGR_data[:, -2],
    MGR_data[:, -1],
)

# Seeded 80/20 hold-out split, plus torch views of both feature matrices.
# NOTE(review): this overwrites the X_train/X_test names that the CSTR loading
# cell also assigns.
X_train, X_test, y_train, y_test = train_test_split(
    MGR_features,
    MGR_labels,
    test_size=0.2,
    random_state=0,
)
X_train_t, X_test_t = torch.from_numpy(X_train), torch.from_numpy(X_test)

## Log likelihood function

In [3]:
# Reference model: a 26-component scikit-learn Gaussian mixture fitted on the
# current training split with a fixed seed.
# NOTE(review): X_train is assigned by both data-loading cells; confirm which
# dataset was loaded last before trusting the saved output.
GMM_sklearn = GaussianMixture(n_components=26, random_state=0).fit(X_train)

In [4]:
# Total test-set log-likelihood under the scikit-learn mixture (sum of the
# per-sample scores); used as the reference value for the hand-written
# log_likelihood below.
np.sum(GMM_sklearn.score_samples(X_test))

27961.289158377625

In [5]:
def normal_pdf(x, mean, cov, bib='numpy'):
    """Evaluate the multivariate normal density N(x; mean, cov).

    The density is assembled in log-space (log-determinant via ``slogdet`` and
    the quadratic form via a linear solve) and exponentiated once at the end.
    The previous form ``det(cov)**(-1/2)`` under/overflowed as soon as the
    determinant left the floating-point range, producing ``inf``/``nan`` even
    for well-conditioned covariances.

    Parameters
    ----------
    x, mean : 1-D vectors (or column vectors) of the same length.
    cov : square covariance matrix matching ``len(x)``.
    bib : ``'numpy'`` to use NumPy; any other value uses torch.

    Returns
    -------
    The density value, as a NumPy scalar or a torch tensor depending on ``bib``.
    A covariance with a negative determinant yields ``nan``.
    """
    diff = x - mean
    if bib == 'numpy':
        sign, logdet = np.linalg.slogdet(cov)
        if sign < 0:
            # A negative determinant is not a valid covariance; keep the nan
            # that det(cov)**(-1/2) used to produce.
            logdet = np.nan
        # Solve cov @ z = diff instead of forming the explicit inverse.
        maha = diff.T @ np.linalg.solve(cov, diff)
        log_density = -len(x)/2*np.log(2*np.pi) - 1/2*(logdet + maha)
        return np.exp(log_density)
    else:
        sign, logdet = torch.linalg.slogdet(cov)
        logdet = torch.where(sign >= 0, logdet, torch.full_like(logdet, float('nan')))
        # .t() is a no-op for 1-D tensors and a transpose for column vectors.
        maha = diff.t() @ torch.linalg.solve(cov, diff)
        log_density = -len(x)/2*float(np.log(2*np.pi)) - 1/2*(logdet + maha)
        return torch.exp(log_density)

def log_normal_pdf(x, mean, cov, bib='numpy'):
    """Evaluate the log-density of the multivariate normal N(x; mean, cov).

    Uses ``slogdet`` for the log-determinant and a linear solve for the
    quadratic form. The previous ``log(det(cov))`` returned ``-inf`` whenever
    the determinant underflowed, and the torch branch computed ``log(2*pi)``
    in float32, which limited the precision of float64 inputs.

    Parameters
    ----------
    x, mean : 1-D vectors (or column vectors) of the same length.
    cov : square covariance matrix matching ``len(x)``.
    bib : ``'numpy'`` to use NumPy; any other value uses torch.

    Returns
    -------
    The log-density, as a NumPy scalar or a torch tensor depending on ``bib``.
    A covariance with a negative determinant yields ``nan``.
    """
    diff = x - mean
    if bib == 'numpy':
        sign, logdet = np.linalg.slogdet(cov)
        if sign < 0:
            # log of a negative determinant is undefined; keep the nan result.
            logdet = np.nan
        # Solve cov @ z = diff instead of forming the explicit inverse.
        maha = diff.T @ np.linalg.solve(cov, diff)
        return -len(x)/2*np.log(2*np.pi) - 1/2*(logdet + maha)
    else:
        sign, logdet = torch.linalg.slogdet(cov)
        logdet = torch.where(sign >= 0, logdet, torch.full_like(logdet, float('nan')))
        # .t() is a no-op for 1-D tensors and a transpose for column vectors.
        maha = diff.t() @ torch.linalg.solve(cov, diff)
        # Python-float constant so the result keeps the dtype of the inputs.
        return -len(x)/2*float(np.log(2*np.pi)) - 1/2*(logdet + maha)

In [6]:
def log_likelihood(mixture, X, bib='numpy'):
    """Total log-likelihood of the rows of ``X`` under a Gaussian mixture.

    Parameters
    ----------
    mixture : sequence of components; for each component, index 0 is read as
        the mixing weight, index 2 as the mean and index 3 as the covariance
        (index 1 is not used here).
    X : 2-D array or tensor, one sample per row.
    bib : forwarded to ``log_normal_pdf`` (``'numpy'`` or torch otherwise).

    Returns
    -------
    Sum over samples of ``log(sum_j w_j * N(x_i; mu_j, Sigma_j))``, computed
    with ``logsumexp`` for numerical stability.
    """
    n_samples, n_components = X.shape[0], len(mixture)

    # The log-weights do not depend on the sample, so compute them once.
    # float() makes torch scalars (including ones that track gradients) safe
    # to hand to NumPy.
    log_weights = [np.log(float(component[0])) for component in mixture]

    log_pdf = np.zeros((n_samples, n_components))
    for i in range(n_samples):
        for j, component in enumerate(mixture):
            log_density = log_normal_pdf(X[i], component[2], component[3], bib)
            log_pdf[i, j] = log_weights[j] + float(log_density)

    return np.sum(logsumexp(log_pdf, axis=1))

In [7]:
# Repackage the fitted scikit-learn mixture as one [weight, 1, mean, covariance]
# list per component, i.e. the positions (0, 2, 3) that log_likelihood reads.
weights = GMM_sklearn.weights_
means = GMM_sklearn.means_
cov = GMM_sklearn.covariances_
GMM_sklearn_list = [
    [weight, 1, mean, covariance]
    for weight, mean, covariance in zip(weights, means, cov)
]

In [8]:
# Sanity check of the hand-written log_likelihood: on the scikit-learn mixture
# it should reproduce the summed score_samples value computed earlier.
ll_sklearn = log_likelihood(GMM_sklearn_list, X_test)
ll_sklearn

27961.289158377615

In [9]:
# Same component budget as the scikit-learn reference mixture.
K_max = 26

# Average, over features, of the (max - min) spread of the training data.
feature_spans = X_train.max(axis=0) - X_train.min(axis=0)
range_data = feature_spans.mean()

# Initial covariance and split threshold come from the project helper; the
# fourth OGMM argument (0.1) is passed positionally.
# NOTE(review): its meaning is defined in OGMM.py, not visible here.
cov_init, T_split = init_cov_and_T_split(K_max, range_data, X_train.shape[1])
GMM_online = OGMM(X_train_t, K_max, cov_init, 0.1, T_split)

  return (2*np.pi)**(-len(x)/2) * np.linalg.det(cov)**(-1/2) * np.exp(-1/2 * (x - mean).T @ np.linalg.inv(cov) @ (x - mean))


In [16]:
# Inspect the fitted online mixture.
# NOTE(review): the saved output is entirely NaN, and the fitting cell emitted
# a RuntimeWarning from a determinant-based normal pdf expression; the online
# fit presumably breaks down numerically there -- confirm inside OGMM.
GMM_online

[[tensor(nan),
  tensor(nan),
  tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         dtype=torch.float64),
  tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, na

In [10]:
# Test-set log-likelihood of the online mixture, evaluated through the torch
# code path. The saved result is nan because the mixture parameters are NaN.
ll_online = log_likelihood(GMM_online, X_test_t, 'torch')
ll_online

nan

In [11]:
# Relative gap between the online and the scikit-learn log-likelihoods
# (signed; negative when the online mixture scores lower).
(ll_online - ll_sklearn)/ll_sklearn

nan

## PCA

In [12]:
def approximate_data(data_features, n_neighbors, n_components):
    """Replace each sample by a local low-rank PCA reconstruction.

    For every row of ``data_features`` the ``n_neighbors`` nearest rows of the
    same array are looked up, a PCA with ``n_components`` components is fitted
    on that neighbourhood, and the reconstruction of the first neighbour is
    stored. (When querying the fitted points themselves the first neighbour is
    normally the sample itself -- not guaranteed with duplicate rows.)

    Returns
    -------
    data_approx : array with the same shape as ``data_features``.
    explained_variance_ratio_ : taken from the PCA fitted on the *last*
        neighbourhood only, not aggregated over all samples.
    """
    neighbor_model = NearestNeighbors(n_neighbors=n_neighbors)
    neighbor_model.fit(data_features)
    _, neighbor_idx = neighbor_model.kneighbors(data_features)

    # Shape (n_samples, n_neighbors, n_features): one neighbourhood per sample.
    neighborhoods = data_features[neighbor_idx]

    # A single PCA object is re-fitted from scratch on every neighbourhood.
    pca = PCA(n_components=n_components)
    pca.whiten = False

    data_approx = np.zeros(data_features.shape)
    for row, neighborhood in enumerate(neighborhoods):
        projected = pca.fit_transform(neighborhood)
        reconstructed = pca.inverse_transform(projected)
        data_approx[row] = reconstructed[0].reshape(-1,)

    return data_approx, pca.explained_variance_ratio_

In [17]:
def approx_classification_error(data_features, data_labels, data_approx):
    """Relative change in SVC test accuracy caused by approximating the data.

    Two default ``SVC`` classifiers are trained on the same seeded 80/20 split:
    one on ``data_features``, one on ``data_approx`` (rows are assumed to be
    aligned with ``data_labels`` in both). Both test accuracies are printed.

    Returns
    -------
    ``|acc_approx - acc_raw| / acc_raw``.
    """
    # One joint call applies the same shuffled indices to all three arrays,
    # which is what two separate calls with the same random_state produce.
    (X_train, X_test,
     X_train_approx, X_test_approx,
     y_train, y_test) = train_test_split(
        data_features, data_approx, data_labels,
        test_size=0.2, random_state=0,
    )

    classifier_raw = SVC()
    classifier_raw.fit(X_train, y_train)
    classifier_approx = SVC()
    classifier_approx.fit(X_train_approx, y_train)

    accuracy_raw = accuracy_score(y_test, classifier_raw.predict(X_test))
    accuracy_approx = accuracy_score(y_test, classifier_approx.predict(X_test_approx))
    print(accuracy_raw, accuracy_approx)

    return np.mean([np.abs(accuracy_approx - accuracy_raw) / accuracy_raw])

PCA MGR

In [18]:
# Split MGR into the held-out target domain and the remaining source domains.
target_domain = 4
mgr_target_mask = MGR_domains == target_domain

target_features = MGR_features[mgr_target_mask, :]
target_labels = MGR_labels[mgr_target_mask]
target_domains = MGR_domains[mgr_target_mask]

# Sources are every row that is NOT in the target domain. The previous version
# selected them with `==`, which made the source set a copy of the target set.
sources_features = MGR_features[~mgr_target_mask, :]
sources_labels = MGR_labels[~mgr_target_mask]
sources_domains = MGR_domains[~mgr_target_mask]

In [19]:
# Local PCA approximation of all MGR samples: 10 neighbours, 1 component.
# ex_var is the explained-variance ratio of the last neighbourhood's PCA only.
data_approx, ex_var = approximate_data(MGR_features, 10, 1)
print(ex_var)

# Relative change in SVC test accuracy between raw and approximated features.
approx_classification_error(MGR_features, MGR_labels, data_approx)

[0.80543947]
0.247 0.245


0.008097165991902841

PCA CSTR

In [20]:
# Split CSTR into the held-out target domain and the remaining source domains.
target_domain = 6
cstr_target_mask = CSTR_domains == target_domain
cstr_source_mask = CSTR_domains != target_domain

target_features = CSTR_features[cstr_target_mask]
target_labels = CSTR_labels[cstr_target_mask]
target_domains = CSTR_domains[cstr_target_mask]

sources_features = CSTR_features[cstr_source_mask]
sources_labels = CSTR_labels[cstr_source_mask]
sources_domains = CSTR_domains[cstr_source_mask]


In [21]:
# Local PCA approximation of all CSTR samples: 10 neighbours, 1 component.
# ex_var is the explained-variance ratio of the last neighbourhood's PCA only.
data_approx, ex_var = approximate_data(CSTR_features, 10, 1)
print(ex_var)

# Relative change in SVC test accuracy between raw and approximated features.
approx_classification_error(CSTR_features, CSTR_labels, data_approx)

[0.31433579]
0.9230769230769231 0.9195804195804196


0.00378787878787884

In [16]:
# Source rows keep their raw features; label and domain id are appended as the
# last two columns.
cstr_sources = np.concatenate(
    [sources_features, sources_labels[:, None], sources_domains[:, None]],
    axis=1,
)

# Target rows get the local PCA approximation (10 neighbours, 1 component).
# approximate_data returns (approximation, explained-variance ratio); only the
# approximation is written out.
cstr_features_target_approx = approximate_data(target_features, 10, 1)
target_approx_features = cstr_features_target_approx[0]
cstr_target_approx = np.concatenate(
    [target_approx_features, target_labels[:, None], target_domains[:, None]],
    axis=1,
)

# Final layout: all source rows first, then the approximated target rows.
cstr_with_target_approx = np.concatenate([cstr_sources, cstr_target_approx], axis=0)

np.save('../CSTR/data/cstr_with_target_approx1.npy', cstr_with_target_approx)

Incremental PCA

In [28]:
# Incremental PCA over the target stream: the model is updated with a sliding
# window of `n_components` consecutive samples and the newest sample of each
# window is replaced by its reconstruction.
n_components = 5
ipca = IncrementalPCA(n_components=n_components)
ipca.whiten = False

n_target_samples = target_features.shape[0]
target_features_iapprox = np.zeros(target_features.shape)

# The first n_components - 1 samples never appear as the newest element of a
# full window, so they are kept unchanged.
target_features_iapprox[:n_components-1, :] = target_features[:n_components-1, :]

# `window_end` is the exclusive end of the window, so the newest sample is row
# window_end - 1. The range must reach n_target_samples inclusive; stopping one
# step earlier left the last target row as all zeros.
# NOTE(review): consecutive windows overlap, so each sample is passed to
# partial_fit several times -- confirm this weighting is intended.
for window_end in range(n_components, n_target_samples + 1):
    X_batch = target_features[window_end-n_components:window_end, :]
    ipca.partial_fit(X_batch)
    approx_x = ipca.inverse_transform(ipca.transform(X_batch[[-1], :]))
    target_features_iapprox[window_end-1, :] = approx_x

# Re-attach label and domain columns, then append to the source rows.
cstr_target_iapprox = np.concatenate([target_features_iapprox, target_labels.reshape(-1, 1), target_domains.reshape(-1, 1)], axis=1)
cstr_with_target_iapprox = np.concatenate([cstr_sources, cstr_target_iapprox], axis=0)

In [29]:
# Persist the source rows plus the incrementally approximated target rows.
# NOTE(review): the "5" in the file name presumably mirrors n_components = 5;
# keep the two in sync when changing either.
np.save('../CSTR/data/cstr_with_target_iapprox5.npy', cstr_with_target_iapprox)

In [30]:
# Compare SVC accuracy on the raw CSTR features against the array whose target
# rows were replaced by the incremental-PCA approximation ([:, :-2] drops the
# appended label and domain columns).
# NOTE(review): cstr_with_target_iapprox is ordered "sources, then target",
# while CSTR_features / CSTR_labels keep the file order. Rows only line up if
# the target-domain rows are already last in the file -- confirm.
approx_classification_error(CSTR_features, CSTR_labels, cstr_with_target_iapprox[:, :-2])

0.9230769230769231 0.9248251748251748


0.0018939393939392997

Online PCA, batch size = 1

In [None]:
# Specify the number of components for PCA
n_components = 3
n_features = target_features.shape[1]

# Running statistics of the stream seen so far. Both live in feature space:
# the mean has one entry per feature and the scatter matrix is
# (n_features, n_features). scikit-learn's PCA object cannot be updated sample
# by sample, so the principal axes are derived directly from these statistics.
mean_running = np.zeros(n_features)
cov_running = np.zeros((n_features, n_features))

target_features_online_approx = np.zeros(target_features.shape)
# Perform online learning with batch size 1
for i, x in enumerate(target_features):
    # Welford update of the running mean and of the (unnormalised) scatter
    # matrix. Normalising by the sample count would not change the eigenvectors.
    delta = x - mean_running
    mean_running = mean_running + delta / (i + 1)
    cov_running = cov_running + np.outer(delta, x - mean_running)

    # Principal axes are the eigenvectors with the largest eigenvalues; eigh
    # returns eigenvalues in ascending order, so take the last columns.
    _, eigenvectors = np.linalg.eigh(cov_running)
    components = eigenvectors[:, -n_components:].T

    # Project the centred sample onto the current axes and map it back.
    centered = x - mean_running
    x_approx = mean_running + (centered @ components.T) @ components
    target_features_online_approx[i, :] = x_approx

cstr_target_online_approx = np.concatenate([target_features_online_approx, target_labels.reshape(-1, 1), target_domains.reshape(-1, 1)], axis=1)
cstr_with_target_online_approx = np.concatenate([cstr_sources, cstr_target_online_approx], axis=0)

approx_classification_error(CSTR_features, CSTR_labels, cstr_with_target_online_approx[:, :-2])