In [14]:
import sys
import torch
from datetime import date
import numpy
import matplotlib.pyplot  as plt
import seaborn as sns
import umap

sys.path.append('../')
from sage_unsup import SAGE_Unsup

feature_names = ['alexa_rank','daily_pageviews_per_visitor','daily_time_on_site','total_sites_linking_in','bounce_rate']
syn_labels = [f'x{i}' for i in range(32)]
label_features = [f'labelf{i}' for i in range(4)] 


coeffs = { 'model_type': 'sage',
          'epoch':1000,
          'num_layers':4,
          'dim':128,
          'outer_batch_size':5000,
          'inner_batch_size':40,
          'train_percentage':0.8,
          'seed':0,
          'experiment_id':2,
          'gpu_id':0,
          'extra':'s1',
          'labelfeature_names':['feat_label_ben','feat_label_mal','feat_label_unknown'],
          # 'labelfeature_names':['feat_pred_ben_def','feat_pred_mal_def'],
          # 'use_syn':True,
          'syn_labels':syn_labels,
          'syn_file':'../../fakenews/data/fakenews_xavier.csv'
}
ds_names = ["acl2020", "emnlp2018"]
tasks = ['fact', 'bias']
model_names = ['gcn', 'sagesup', 'sageunsup']
level = 3
ds_name = ds_names[1]
task = tasks[1]
model_name = model_names[2]

nodes_file = '../data/features_{}_level{}.txt4'.format(ds_name, level)
edges_file = '../data/edges_{}_level{}.txt2'.format(ds_name, level)

model_file = f"""../model/fakenews_{coeffs['model_type']}.pkl"""


In [15]:
exp = SAGE_Unsup(nodes_file, edges_file, feature_names, **coeffs)
for epoch in range(1, coeffs['epoch']):
    loss = exp.train()
    if epoch % 100 == 0: 
        val_acc, test_acc = exp.test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
            f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')
        


Train tensor(848) Test tensor(212)
Data converted to undirected: True
Data(x=[78428, 5], edge_index=[2, 232529], y=[78428], train_mask=[78428], test_mask=[78428], validation_mask=[78428])
Epoch: 100, Loss: 1.2004, Val: 0.6073, Test: 0.5755
Epoch: 200, Loss: 1.1822, Val: 0.5991, Test: 0.5943
Epoch: 300, Loss: 1.1677, Val: 0.6050, Test: 0.5802
Epoch: 400, Loss: 1.1623, Val: 0.6014, Test: 0.5943
Epoch: 500, Loss: 1.1565, Val: 0.6014, Test: 0.5896
Epoch: 600, Loss: 1.1519, Val: 0.6073, Test: 0.5660
Epoch: 700, Loss: 1.1385, Val: 0.6061, Test: 0.5755
Epoch: 900, Loss: 1.1295, Val: 0.6167, Test: 0.5755


In [16]:
x, edge_index = exp.data.x, exp.data.edge_index
with torch.no_grad():
    exp.model.eval()
    val_acc, test_acc = exp.test()
    print(f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')
            
    emb = exp.model.full_forward(x, edge_index).cpu()
emb.size()

    

Val: 0.6097, Test: 0.5943


torch.Size([78428, 128])

In [17]:
import pandas as pd
import json
nodes_df = pd.read_csv(nodes_file)
print(nodes_df.shape)
emb_dict = dict(zip(nodes_df.node.tolist(), emb.tolist()))
emb_filename = "../data/{}_{}_level{}_{}.json".format(ds_name, task, level, model_name)
print(emb_filename)
with open(emb_filename, 'w') as emb_file:
    json.dump(emb_dict, emb_file)

(78428, 9)
../data/emnlp2018_bias_level3_sageunsup.json


In [9]:
from sklearn.linear_model import LogisticRegression

data = exp.data.cpu()
# out = data.x
clf = LogisticRegression()
clf.fit(emb[data.validation_mask], data.y[data.validation_mask])

val_acc = clf.score(emb[data.validation_mask], data.y[data.validation_mask])
test_acc = clf.score(emb[data.test_mask], data.y[data.test_mask])
val_acc, test_acc

(0.6049528301886793, 0.5943396226415094)

In [None]:
# data = exp.data.cpu()
palette = {}

for n, y in enumerate(set(data.y.cpu().numpy())):
    palette[y] = f'C{n}'

embd = umap.UMAP().fit_transform(out.cpu().numpy())

print(type(embd),len(embd[0]))
# numpy.savetxt(f"./embeddings/{test_name}.csv", embd, delimiter=",")
plt.figure(figsize=(10, 10))
sns.scatterplot(x=embd.T[0][data.test_mask], y=embd.T[1][data.test_mask], hue=data.y.cpu()[data.test_mask].numpy(), palette=palette)
plt.legend(bbox_to_anchor=(1,1), loc='upper left')
plt.savefig(f"embeddings/{test_name}.png", dpi=120)

In [None]:
print(data.x.size())

embd_x = umap.UMAP().fit_transform(data.x.cpu().numpy())
plt.figure(figsize=(10, 10))
sns.scatterplot(x=embd_x.T[0][data.test_mask], y=embd_x.T[1][data.test_mask], hue=data.y[data.test_mask].cpu().numpy(), palette=palette)
plt.legend(bbox_to_anchor=(1,1), loc='upper left')
plt.savefig(f"./embeddings/{test_name}_data.png", dpi=120)

In [None]:
from src.embedding import visualize_embedding

visualize_embedding(data, data.x)

In [None]:
visualize_embedding(data, out)