In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from src.data import *
from src.utils import *
from src.main import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from scipy.linalg import sqrtm
from numpy import real
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import randomized_svd

In [2]:
# Experiment configuration shared by the cells below.
cfg = {
    # Regularisation weights: "beta" enters the diagonal of B_full,
    # "gamma" is forwarded to solve / solve_sparse.
    "beta":0.001,
    "gamma":0.001,
    # Settings forwarded to get_W_k_means when building the co-association matrix.
    "n_clusters":50,
    "n_models":2,
    "verbose":True,
    # Fractions of the data used for the train / weak / test partitions.
    "train_size": 0.01,
    "weak_size": 0.1,
    "test_size": 0.89,
    # Single source of truth for every seed used in the notebook.
    "random_state":42,
    # Directory containing the gt_<year>.csv files.
    "path":"./data/pp_gas_emission/",
    # Target columns; every other column is used as a feature.
    "y":["CO", "NOX"],
    # Number of components kept by the low-rank decompositions.
    "rank":100,
}

# Seed NumPy's global RNG from the config so that changing "random_state"
# cannot leave a stale hard-coded seed behind.
np.random.seed(cfg["random_state"])

In [3]:
# Load the 2015 measurements and separate the target columns from the features.
csv_2015 = cfg["path"]+"gt_2015.csv"
target_columns = cfg["y"]

data_2015 = pd.read_csv(csv_2015)
X_data_2015 = data_2015.drop(columns=target_columns).to_numpy()
y_data_2015 = data_2015[target_columns].to_numpy()

In [4]:
# Share of the non-test pool (train + weak) that is assigned to the weak split.
size_weak = cfg["weak_size"]/(cfg["train_size"] + cfg["weak_size"])

# Stage 1: hold out the test partition.
X_pool, X_test, y_pool, y_test = train_test_split(
    X_data_2015, y_data_2015,
    test_size=cfg["test_size"],
    random_state=cfg["random_state"],
)

# Stage 2: divide the remaining pool into the train and weak partitions.
X_train, X_weak, y_train, y_weak = train_test_split(
    X_pool, y_pool,
    test_size=size_weak,
    random_state=cfg["random_state"],
)

# Row order train -> weak -> test; the later slicing by partition size relies on it.
X_all = np.concatenate((X_train, X_weak, X_test))
y_all = np.concatenate((y_train, y_weak, y_test))

### Regression experiments

#### Linear Regression

In [5]:
# Ordinary least-squares baseline fitted on the train partition only.
reg = LinearRegression().fit(X_train, y_train)

# Report the mean squared L2 norm of the residual on each partition.
# The weak partition gets its own "Weak" label so it is not confused with
# the test partition.
for template, X_part, y_part in (
    ("Train L2 {:.5f}", X_train, y_train),
    ("Weak L2 {:.5f}", X_weak, y_weak),
    ("Test L2 {:.5f}", X_test, y_test),
):
    residual = y_part - reg.predict(X_part)
    print(template.format((np.linalg.norm(residual, axis=1)**2).mean()))

Train L2 39.89840
Test L2 56.09307
Test L2 57.60034


#### Co-association Matrix

In [6]:
# Build the co-association matrix W from min-max scaled features.
ScaleX = MinMaxScaler()
X_all_scaled = ScaleX.fit_transform(X_all)
W = get_W_k_means(X_all_scaled,
                  n_clusters=cfg["n_clusters"],
                  n_models=cfg["n_models"],
                  verbose=cfg["verbose"])

# For every row of W keep the indices of its 100 largest entries,
# ordered from largest to smallest.
neigh_100 = W.argsort()[:, -100:][:, ::-1]

# For each sample: covariance of the targets over its 100 neighbours, then the
# lower-triangular Cholesky factor flattened in np.tril_indices_from order.
L = []
for i in tqdm(range(neigh_100.shape[0])):
    cov = np.cov(y_all[neigh_100[i]], rowvar=False)
    try:
        L_i = np.linalg.cholesky(cov)
    except np.linalg.LinAlgError:
        # Reference the exception through numpy: a bare `LinAlgError` is not
        # imported in this notebook and would raise NameError instead.
        # NOTE: stopping here leaves L with fewer rows than X_all.
        print("Cholesky Failed")
        break
    L_idx = np.tril_indices_from(L_i)
    L.append(L_i[L_idx])

Fitting 2 KMeans models: 1 2 

100%|█████████████████████████████████████| 7384/7384 [00:00<00:00, 9576.40it/s]


In [7]:
# Stack the per-sample flattened Cholesky factors; A_* are the target values.
L_all = np.stack(L)
A_all = y_all

# Rows are ordered train -> weak -> test (see the construction of y_all).
n_train = X_train.shape[0]
n_labeled = n_train + X_weak.shape[0]
train_rows = slice(None, n_train)
weak_rows = slice(n_train, n_labeled)
test_rows = slice(n_labeled, None)

L_train, A_train = L_all[train_rows], A_all[train_rows]
L_weak, A_weak = L_all[weak_rows], A_all[weak_rows]
L_test, A_test = L_all[test_rows], A_all[test_rows]

# Solver inputs: the test rows of the targets and of the Cholesky factors
# are replaced by zeros.
X_full = np.concatenate((X_train, X_weak, X_test))
Y_full = np.concatenate((y_train, y_weak, np.zeros_like(y_test)))
L_full = np.concatenate((L_train, L_weak, np.zeros_like(L_test)))

#### Correlated WSR-LRCM
Correlated Weakly Supervised Regression with a Low-Rank Co-association Matrix

In [8]:
# Diagonal weight matrix: beta on every row, plus 1 on the train and weak rows.
n_train = X_train.shape[0]
n_labeled = n_train + X_weak.shape[0]
n_total = n_labeled + X_test.shape[0]

b_diag = np.ones(n_total)*cfg["beta"]
b_diag[:n_labeled] += 1
B_full = np.diag(b_diag)

# Solve with the full co-association matrix W.
A_star, L_star = solve(W, B_full, Y_full, L_full, gamma=cfg["gamma"])

# Split the solution back into the train / weak / test partitions.
train_rows = slice(None, n_train)
weak_rows = slice(n_train, n_labeled)
test_rows = slice(n_labeled, None)

A_star_gt_train, L_star_gt_train = A_star[train_rows], L_star[train_rows]
A_star_gt_weak, L_star_gt_weak = A_star[weak_rows], L_star[weak_rows]
A_star_gt_test, L_star_gt_test = A_star[test_rows], L_star[test_rows]

In [9]:
# Mean squared L2 error of the solved targets on each partition.
print("L2")
for template, target, estimate, line_end in (
    ("Train L2 {:.5f}", A_train, A_star_gt_train, "\n"),
    ("Weak L2 {:.5f}", A_weak, A_star_gt_weak, "\n"),
    ("Test L2 {:.5f}", A_test, A_star_gt_test, "\n\n"),
):
    sq_err = np.linalg.norm(target - estimate, axis=1)**2
    print(template.format(sq_err.mean()), end=line_end)

# Value returned by get_Wasserstain on each partition, printed under the "MWD" header.
print("MWD")
for template, a_ref, l_ref, a_hat, l_hat in (
    ("Train L2 {:.5f}", A_train, L_train, A_star_gt_train, L_star_gt_train),
    ("Weak L2 {:.5f}", A_weak, L_weak, A_star_gt_weak, L_star_gt_weak),
    ("Test L2 {:.5f}", A_test, L_test, A_star_gt_test, L_star_gt_test),
):
    print(template.format(get_Wasserstain(a_ref, l_ref, a_hat, l_hat)))

L2
Train L2 1.80473
Weak L2 2.26503
Test L2 48.56968

MWD
Train L2 1.93929
Weak L2 2.34095
Test L2 50.63598


#### WSR-LRCM
Weakly Supervised Regression with a Low-Rank Co-association Matrix (the off-diagonal Cholesky term is set to zero, so the correlation between the targets is ignored)

In [10]:
# Zero column 1 of the solved factors. With the np.tril_indices_from ordering
# used when L was built for two targets, this is presumably the off-diagonal
# (1, 0) entry - TODO confirm that solve() keeps the same column layout.
# NOTE(review): this mutates L_star in place; the L_star_gt_* arrays sliced
# from it earlier appear to pick up the change (the MWD values printed below
# differ from the previous cell), so re-running the previous report cell
# afterwards would give different numbers.
L_star[:, 1] = 0

# Mean squared L2 error of the solved targets on each partition.
print("L2")
for template, target, estimate, line_end in (
    ("Train L2 {:.5f}", A_train, A_star_gt_train, "\n"),
    ("Weak L2 {:.5f}", A_weak, A_star_gt_weak, "\n"),
    ("Test L2 {:.5f}", A_test, A_star_gt_test, "\n\n"),
):
    sq_err = np.linalg.norm(target - estimate, axis=1)**2
    print(template.format(sq_err.mean()), end=line_end)

# Value returned by get_Wasserstain on each partition, printed under the "MWD" header.
print("MWD")
for template, a_ref, l_ref, a_hat, l_hat in (
    ("Train L2 {:.5f}", A_train, L_train, A_star_gt_train, L_star_gt_train),
    ("Weak L2 {:.5f}", A_weak, L_weak, A_star_gt_weak, L_star_gt_weak),
    ("Test L2 {:.5f}", A_test, L_test, A_star_gt_test, L_star_gt_test),
):
    print(template.format(get_Wasserstain(a_ref, l_ref, a_hat, l_hat)))

L2
Train L2 1.80473
Weak L2 2.26503
Test L2 48.56968

MWD
Train L2 12.94202
Weak L2 10.35490
Test L2 58.12554


### Low Rank Decomposition

#### Truncated SVD

In [12]:
# Rank-limited factorisation of W via truncated SVD: W ~ C_1 @ C_2,
# with C_1 = u and C_2 = diag(s) @ vt.
u, s, vt = svds(W, k=cfg["rank"])
C_1_T_svd = u
C_2_T_svd = np.diag(s)@vt

# Diagonal matrix holding the row sums of the full W.
D = np.diag(W.sum(axis=1))

A_star, L_star = solve_sparse(
    D, C_1_T_svd, C_2_T_svd, B_full, Y_full, L_full, gamma=cfg["gamma"]
)

# Split the solution back into the train / weak / test partitions.
n_train = X_train.shape[0]
n_labeled = n_train + X_weak.shape[0]
train_rows = slice(None, n_train)
weak_rows = slice(n_train, n_labeled)
test_rows = slice(n_labeled, None)

A_star_T_train, L_star_T_train = A_star[train_rows], L_star[train_rows]
A_star_T_weak, L_star_T_weak = A_star[weak_rows], L_star[weak_rows]
A_star_T_test, L_star_T_test = A_star[test_rows], L_star[test_rows]

get_statistics(
    A_train, L_train, A_star_T_train, L_star_T_train,
    A_weak, L_weak, A_star_T_weak, L_star_T_weak,
    A_test, L_test, A_star_T_test, L_star_T_test,
)

L2
Train L2 1.80473
Weak L2 2.26503
Test L2 48.56968

MWD
Train L2 1.93929
Weak L2 2.34095
Test L2 50.63598


#### Random SVD

In [13]:
# Rank-limited factorisation of W via randomized SVD: W ~ C_1 @ C_2,
# with C_1 = u and C_2 = diag(s) @ vt.
u, s, vt = randomized_svd(
    W, n_components=cfg["rank"], random_state=cfg["random_state"]
)
C_1_R_svd = u
C_2_R_svd = np.diag(s)@vt

# Diagonal matrix holding the row sums of the full W.
D = np.diag(W.sum(axis=1))

A_star, L_star = solve_sparse(
    D, C_1_R_svd, C_2_R_svd, B_full, Y_full, L_full, gamma=cfg["gamma"]
)

# Split the solution back into the train / weak / test partitions.
n_train = X_train.shape[0]
n_labeled = n_train + X_weak.shape[0]
train_rows = slice(None, n_train)
weak_rows = slice(n_train, n_labeled)
test_rows = slice(n_labeled, None)

A_star_R_train, L_star_R_train = A_star[train_rows], L_star[train_rows]
A_star_R_weak, L_star_R_weak = A_star[weak_rows], L_star[weak_rows]
A_star_R_test, L_star_R_test = A_star[test_rows], L_star[test_rows]

get_statistics(
    A_train, L_train, A_star_R_train, L_star_R_train,
    A_weak, L_weak, A_star_R_weak, L_star_R_weak,
    A_test, L_test, A_star_R_test, L_star_R_test,
)

L2
Train L2 1.80473
Weak L2 2.26503
Test L2 48.56968

MWD
Train L2 1.93929
Weak L2 2.34095
Test L2 50.63598


#### Nystrom Method

In [14]:
# Two-factor approximation of W produced by the project's Nystrom helper,
# configured with cfg["rank"].
nystrom = Nystrom(cfg["rank"])
C_1_N, C_2_N = nystrom(W)

# Diagonal matrix holding the row sums of the full W.
D = np.diag(W.sum(axis=1))

A_star, L_star = solve_sparse(
    D, C_1_N, C_2_N, B_full, Y_full, L_full, gamma=cfg["gamma"]
)

# Split the solution back into the train / weak / test partitions.
n_train = X_train.shape[0]
n_labeled = n_train + X_weak.shape[0]
train_rows = slice(None, n_train)
weak_rows = slice(n_train, n_labeled)
test_rows = slice(n_labeled, None)

A_star_N_train, L_star_N_train = A_star[train_rows], L_star[train_rows]
A_star_N_weak, L_star_N_weak = A_star[weak_rows], L_star[weak_rows]
A_star_N_test, L_star_N_test = A_star[test_rows], L_star[test_rows]

get_statistics(
    A_train, L_train, A_star_N_train, L_star_N_train,
    A_weak, L_weak, A_star_N_weak, L_star_N_weak,
    A_test, L_test, A_star_N_test, L_star_N_test,
)

L2
Train L2 11.82160
Weak L2 21.08466
Test L2 603.86608

MWD
Train L2 12.02805
Weak L2 21.36203
Test L2 612.38094
