In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch
from torch import nn
from torch import optim
import matplotlib.pyplot as plt
from ActiveLearning import GAL, GNN
import numpy.linalg as nla
import pickle
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans


In [2]:
def sim_mat(X):
	"""Return the absolute pairwise cosine similarity between the rows of X.

	Parameters
	----------
	X : array-like of shape (n_samples, n_features)
		One sample per row.

	Returns
	-------
	numpy.ndarray of shape (n_samples, n_samples)
		Entry (i, j) is |cos(X[i], X[j])|. The diagonal is exactly 0 so a
		sample is never reported as similar to itself. Rows that are all
		zeros have similarity 0 to every other row instead of NaN.
	"""
	X = np.asarray(X, dtype=float)
	norms = nla.norm(X, axis=-1).reshape(-1, 1)
	# An all-zero row has norm 0; dividing by it would fill its row and
	# column with NaN. Dividing by 1 keeps the row at zero instead.
	norms = np.where(norms == 0, 1.0, norms)
	X = X / norms
	cos_sim_mat = np.absolute(X @ X.T)
	# Remove self-similarity explicitly rather than subtracting the identity,
	# which leaves rounding noise (and |0 - 1| = 1 for all-zero rows).
	np.fill_diagonal(cos_sim_mat, 0.0)
	return cos_sim_mat

def construct_graph(X, thresh=.8):
	"""Build an edge list joining rows of X whose similarity exceeds thresh.

	Similarities are taken from sim_mat(X). The result is a 2-row integer
	array: column k holds the (row, column) indices of the k-th entry of the
	similarity matrix that is strictly greater than thresh.
	"""
	above_thresh = sim_mat(X) > thresh
	row_idx, col_idx = np.nonzero(above_thresh)
	return np.vstack((row_idx, col_idx))

def GNN_embed(X, edges, gnn):
	"""Call gnn on the node features X and the edge list, returning its output unchanged."""
	embedding = gnn(X, edges)
	return embedding
	

In [3]:
# Load the pickled dataset dictionary from the working directory.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('dataset_q1.pkl', 'rb') as f:
	dataset = pickle.load(f)


In [4]:
# Number of leading pool samples kept when the pool is sliced with [:K].
K = 1000

In [5]:
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
dataset
# Unpack the labelled training set, the test set and the first K pool samples.
train_samples, train_labels = dataset['train_samples'], dataset['train_labels']
test_samples, test_labels = dataset['test_samples'], dataset['test_labels']
available_pool_samples, available_pool_labels = dataset['available_pool_samples'][:K], dataset['available_pool_labels'][:K]

In [6]:
# Unfitted 4-cluster KMeans. No random_state is passed, so fits may differ between runs.
kmeans = KMeans(n_clusters=4)


In [None]:
import matplotlib.pyplot as plt

# 3-D scatter of the pool subset, coloured by label.
# assumes every sample has exactly three features (one per axis) — TODO confirm
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.scatter(*available_pool_samples.T, c=available_pool_labels)


In [8]:
# Fit the clustering on the pool subset; fit() returns the estimator, so the name is simply rebound.
kmeans = kmeans.fit(available_pool_samples)

In [None]:
# KMeans.transform yields one distance per cluster centre; the row-wise minimum
# (kept as a column via keepdims) is each sample's distance to its nearest centre.
kmeans.transform(available_pool_samples).min(axis=-1, keepdims=True)

In [10]:
# Build the GAL active-learning object from the ActiveLearning module.
# NOTE(review): every argument is positional and the GAL signature is not visible
# in this notebook; the meaning of the two None values and of .9 should be
# confirmed against ActiveLearning.GAL (keyword arguments would make this explicit).
iterations = 40
budget_per_iter = 50
train_limit = 2010
gal = GAL(
	dataset,
	None,
	iterations,
	budget_per_iter,
	train_limit,
	.9,
	None,
	LogisticRegression,
	GNN
)
# gnn = GNN(3, 4, 5)

In [11]:
# Calls GAL's underscore-prefixed training method directly and unpacks its two return values.
# presumably (classifier, GNN) given the names — verify against ActiveLearning.GAL
model, gnn_model = gal._train_model()

In [12]:
# Similarity matrix over GAL's current training samples.
A = gal.sim_mat(gal.train_samples)
# NOTE(review): A was computed from gal.train_samples but is paired with
# gal.available_pool_samples here; other cells also unpack two values from
# gal.construct_graph, whereas G receives the whole return value — confirm intent.
G = gal.construct_graph(A, gal.available_pool_samples)

In [None]:
# len(gal.entropy(gal.available_pool_samples, model)) == len(gal.available_pool_samples)
# gal.density_score(gal.available_pool_samples)
# gal.entropy(gal.available_pool_samples, model)
# nx.pagerank(G)

In [39]:
import numpy as np


# for n in range(1, 11):
# 	coef_vector = np.random.beta(1, [1/n, 1/n, n], size=(3))
# 	print(coef_vector)

# np.random.beta(1, 300)


# NOTE(review): bare float literal with no effect; looks like a pasted cell output — confirm and remove.
0.0009117808967824611

In [44]:
# U_idx = gal.select_points(G, model)

In [106]:
# Similarity matrix and graph over the pool subset.
A = gal.sim_mat(available_pool_samples)
# E keeps the whole return value of gal.construct_graph; later cells index it as E[0].
E = gal.construct_graph(A, available_pool_samples)
# list(zip(E))
# list(zip(*E))

In [None]:
from sklearn.linear_model import LogisticRegression
# Rebinds `model` (previously assigned from gal._train_model()) to a fresh
# LogisticRegression fitted on the labelled training set.
model = LogisticRegression()
model.fit(train_samples, train_labels)

# E[0].nodes

In [145]:
def entropy(X):
	"""Return the min-max normalised Shannon entropy of each row of X.

	Parameters
	----------
	X : torch.Tensor or array-like
		Probabilities with one distribution per row (last dimension).

	Returns
	-------
	numpy.ndarray
		One score per row in [0, 1]; 1 marks the most uncertain row and 0 the
		most certain one. When every row has the same entropy the result is
		all zeros. An empty input yields an empty array.
	"""
	if not isinstance(X, torch.Tensor):
		X = torch.Tensor(X)
	# Treat 0 * log2(0) as 0: swap zero probabilities for 1 inside the log so
	# the term vanishes instead of becoming NaN.
	safe_X = torch.where(X > 0, X, torch.ones_like(X))
	# Shannon entropy carries a leading minus sign; without it the ranking
	# is inverted and confident predictions receive the highest score.
	ENT = -(X * torch.log2(safe_X)).sum(dim=-1)
	if ENT.numel() == 0:
		return ENT.detach().numpy()
	span = ENT.max() - ENT.min()
	if span == 0:
		# All rows are equally uncertain; avoid a 0 / 0 division.
		return torch.zeros_like(ENT).detach().numpy()
	return ((ENT - ENT.min()) / span).detach().numpy()

In [None]:
# Normalised entropy score of the classifier's predicted probabilities for each pool sample.
entropy(model.predict_proba(available_pool_samples))

In [147]:
# Map pool index -> entropy score of the model's predicted probabilities.
ENT_DICT = dict(zip(range(len(available_pool_samples)), entropy(model.predict_proba(available_pool_samples))))
# PageRank of the first element returned by gal.construct_graph.
# presumably a networkx graph whose nodes are the pool indices — verify
PR_DICT = nx.pagerank(E[0])

In [None]:
# Combined per-index score: entropy score plus PageRank, for every key of ENT_DICT.
s = {k: ENT_DICT[k] + PR_DICT[k] for k in ENT_DICT}
s

In [None]:
# Display the PageRank dictionary of the first element returned by gal.construct_graph.
nx.pagerank(E[0])

In [76]:
# Stack labelled and pool data; the first len(train_samples) rows are the labelled ones.
D_samples = np.concatenate([train_samples, available_pool_samples], axis=0)
D_labels = np.concatenate([train_labels, available_pool_labels], axis=0)
gnn_labeled_idx = list(range(len(train_samples)))

# NOTE(review): other cells call gal.sim_mat; confirm that gal.sim_metric exists.
A = gal.sim_metric(D_samples)
D_gnn, E_gnn = gal.construct_graph(A, D_samples)

D_samples_torch = torch.Tensor(D_samples)
E_gnn_torch = torch.tensor(E_gnn)
# NOTE(review): `gnn` is only assigned in a commented-out line of this notebook
# (the trained GNN is bound to `gnn_model`); this relies on leftover kernel state.
D_embed = gnn(D_samples_torch, E_gnn_torch)
# D_embed


In [None]:
# Shape check: concatenating a (2, 3) and a (2, 5) tensor along dim=1 gives a (2, 8) tensor.
x = torch.rand(2, 3)
x_emb = torch.rand(2, 5)
print(x, x_emb)
torch.cat([x, x_emb], dim=1)

In [78]:
# One manual selection round over the pool subset.
V = available_pool_samples
A = gal.sim_mat(V)

G, E = gal.construct_graph(A, V)

V = torch.Tensor(V)
E = torch.tensor(E)

U_idx = gal.select_points(G)
# Pool indices are shifted by len(train_samples) to address rows of the stacked D_samples.
gnn_labeled_idx.extend([e + len(train_samples) for e in U_idx])

gal.label_update(U_idx)
# NOTE(review): `gnn` is never assigned in the visible cells, and `model` is a
# scikit-learn LogisticRegression in an earlier cell, which has no train() method —
# confirm which objects these two calls are meant to use.
gnn.train(gnn_labeled_idx)
model.train()



In [82]:
# NOTE(review): identical to the extend() in the selection cell above; running both
# appends the same shifted indices twice.
gnn_labeled_idx.extend([e + len(train_samples) for e in U_idx])

In [74]:
# gnn_labeled_idx

In [None]:

# Display the current value of E.
E

In [None]:
# Run gnn on (V, E) and keep the rows at the selected indices.
# NOTE(review): `gnn` is not assigned in the visible cells — relies on kernel state.
gnn(V, E)[U_idx]

In [None]:

# Draw the current graph, select points from it and pass them to gal.label_update.
nx.draw(G, with_labels=True)
U_idx = gal.select_points(G)
# V_emb = gnn(V, E)[U_idx]
gal.label_update(U_idx)
# gal._train_model()


# R = gal.unceartinty_score(G)
# R


In [None]:
# Tiny hand-made matrix for inspecting pairwise_distances (default metric) on known rows.
MAT = [
	[0, 1, 0],
	[1, 0, 0],
	[0, -2, 0]
]

pairwise_distances(MAT, MAT)
# sim_mat(MAT)

In [None]:
# Cosine distance matrix between the first 100 pool samples.
pairwise_distances(available_pool_samples[:100], available_pool_samples[:100], metric='cosine')
# A

In [None]:
# Absolute cosine similarity between training samples, rounded to one decimal for display.
sim_mat(train_samples).round(1)

In [None]:
# Build a similarity graph over the training samples and rank its nodes with PageRank.
G = nx.Graph()
# One node per sample, labelled with its feature values rounded to two decimals.
# NOTE(review): nodes come from gal.train_samples while edges come from
# train_samples; confirm both hold the same rows.
for i, e in enumerate(gal.train_samples):
	G.add_node(i, label=','.join(str(round(x, 2)) for x in e))

# Reuse the shared helper instead of repeating the threshold logic inline;
# `edges` is a 2-row array of (row, column) index pairs.
edges = construct_graph(train_samples, thresh=.8)
G.add_edges_from(edges.T.tolist())

nx.draw(G, with_labels=True)

R = nx.pagerank(G)
# The five node ids with the highest PageRank, best first.
D = sorted(R, key=R.get, reverse=True)[:5]
R, D

In [4]:
# NOTE(review): `classifier` is neither defined nor imported in the visible cells;
# this only runs if the kernel still holds it from elsewhere.
model = classifier(2, 3)

In [None]:
# Check whether the model object exposes a `forward` attribute.
hasattr(model, 'forward')

In [None]:
# dir(model).__contains__('forward')

In [None]:
# Check that model.register_backward_hook exists and is callable.
callable(model.register_backward_hook)

In [2]:
# Re-create the model, discarding any earlier `model` object.
# NOTE(review): `classifier` is not defined or imported in the visible cells.
model = classifier(2, 3)

In [3]:
# Synthetic 2-D data: ppc points per cluster centre. np.random.random() draws one
# standard deviation in [0, 1) per cluster. No seed is set, so the data differs
# on every run.
cluster_means = [(7, 3), (1, 1), (6, 10)]
ppc = 30
X = np.vstack([np.random.normal(c, np.random.random(), size=(ppc, 2)) for c in cluster_means])
X = torch.tensor(X).type(torch.float)
# Integer class ids 0..len(cluster_means)-1, ppc of each, as a column of dtype long.
Y = torch.cat([(torch.ones(ppc) * i) for i in range(len(cluster_means))]).reshape(-1, 1).type(torch.long)
# print(X, Y)


In [None]:
# Display the labels flattened to one dimension.
Y.reshape(-1)

In [None]:
# Scatter of the synthetic points, coloured by their class id.
plt.scatter(X[:, 0], X[:, 1], c=Y.reshape(-1))

In [None]:
# Call the model on the whole synthetic set and display the raw output.
o = model(X)
o

In [7]:
# Loss and optimiser for the toy classifier. `criterion` is read by the training
# loop cell, so it must still hold this loss object when that cell runs.
lr = .001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train the toy classifier one sample at a time for EPOCHS passes over the data.
EPOCHS = 5
model.train()

for epoch in range(EPOCHS):

	total_loss = 0
	for x, y in zip(X, Y):
	
		# unsqueeze adds a leading batch dimension of size 1 to the single sample.
		o = model(x.unsqueeze(dim=0))

		loss = criterion(o, y)

		# Clear old gradients, backpropagate, then take one optimiser step.
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		total_loss += loss.item()

	# Mean per-sample loss for this epoch.
	print(f'[{epoch + 1} / {EPOCHS}]: loss - {total_loss / len(X)}')




In [None]:
# Prediction for a single sample; with ppc = 30, index 61 lies in the third cluster's block.
model.predict(X[61])

In [None]:
# Scatter of the synthetic points, coloured by the argmax over the last dimension of model.predict(X).
plt.scatter(X[:, 0], X[:, 1], c=model.predict(X).argmax(dim=-1))

In [None]:
import pickle
from collections import defaultdict
from ActiveLearning import ActiveLearningPipeline, generate_plot

# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('dataset_q1.pkl', 'rb') as f:
	dataset = pickle.load(f)

# Run the active-learning pipeline once per selection criterion and plot the
# accuracy curves returned by run_pipeline().
iterations = 40
budget_per_iter = 50
train_limit = 2010
selection_criteria = ['random', 'custom']
accuracy_scores_dict = defaultdict(list)
# The loop variable is deliberately not named `criterion`: that name holds the
# nn.CrossEntropyLoss used by the training-loop cell, and overwriting it with a
# string would break that cell if it were re-run afterwards.
for selection_criterion in selection_criteria:
	AL_class = ActiveLearningPipeline(
		dataset=dataset,
		selection_criterion=selection_criterion,
		iterations=iterations,
		budget_per_iter=budget_per_iter,
		train_limit=train_limit,
	)
	accuracy_scores_dict[selection_criterion] = AL_class.run_pipeline()
generate_plot(accuracy_scores_dict)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as nla

# Project the whole pool onto its first two principal components, then plot
# the first 100 projected points coloured by label.
# Note: X and Y are rebound here and no longer hold the synthetic torch data.
pca = PCA(n_components=2)
X = pca.fit_transform(dataset['available_pool_samples'])
Y = dataset['available_pool_labels']

plt.scatter(X[:100, 0], X[:100, 1], c=Y[:100])
# plt.plot() with no arguments draws nothing and only echoes an empty list;
# plt.show() renders the figure.
plt.show()


In [None]:
# First 100 pool samples, each row scaled to unit L2 norm.
# Note: this rebinds A, which earlier cells used for similarity matrices.
A = dataset['available_pool_samples'][:100]
A = A / nla.norm(A, axis=-1).reshape(-1, 1)
A

In [59]:
# cos_sim_mat = D @ D.T
# Cosine similarity of the unit-norm rows; subtracting the identity removes the self-similarity of 1.
cos_sim_mat = (A @ A.T) - np.eye(A.shape[0])

In [54]:
# Sort each row ascending, then read it backwards starting from the second-largest
# entry, i.e. descending order with the largest value per row dropped.
# NOTE(review): the diagonal has already been zeroed by subtracting the identity, so
# the dropped entry is probably the nearest neighbour rather than self — confirm intent.
sorted_sim = np.sort(cos_sim_mat, axis=1)[:, -2::-1]

In [61]:
# Index pairs whose similarity exceeds the threshold; np.where returns a
# (row_indices, column_indices) tuple here.
thresh = .8
edges = np.where(cos_sim_mat > thresh)