In [1]:
import os.path as osp
import matplotlib.pyplot as plt
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
from torch.utils.data import Dataset
from torch_geometric.nn import GNNExplainer
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.loader import NeighborSampler #as RawNeighborSampler
import torch.nn as nn
import torch
from torch_cluster import random_walk
import torch.nn.functional as F

def create_dataset(edges, features, labels, train_mask, test_mask):
    edge_index = torch.tensor(edges, dtype=torch.long)
    x = torch.tensor(features, dtype = torch.float)
    y = torch.tensor(labels, dtype = torch.long)

    data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)
    
    #supervised setting
    if train_mask != None:
        data.train_mask = torch.tensor(train_mask, dtype = torch.bool)
        if test_mask == None:
            data.test_mask = ~data.train_mask
        
    #semi-supervised setting
    if test_mask != None:
        data.test_mask = torch.tensor(test_mask, dtype = torch.bool)
    return data

In [2]:
class NeighborSamplerX(NeighborSampler):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        # For each node in `batch`, we sample a direct neighbor (as positive
        # example) and a random node (as negative example):
        pos_batch = random_walk(row, col, batch, walk_length=1,
                                coalesced=False)[:, 1]

        neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(), ),
                                  dtype=torch.long)

        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        batch1 = super(NeighborSamplerX, self).sample(batch)
        return batch1
    
    
class SAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers):
        super(SAGE, self).__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels))

    def forward(self, x, adjs):
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    def full_forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x
    
def prepare(data, hidden_channels = 128, num_layers = 2, batch_size = 256):
    train_loader = NeighborSamplerX(data.edge_index, sizes=[10, 10], batch_size = batch_size,
                              shuffle=True, num_nodes=data.num_nodes)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("#features = {}".format(data.num_node_features))
    #print("Num layers ", num_layers)
    model = SAGE(data.num_node_features, hidden_channels=hidden_channels, num_layers=num_layers)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    x, edge_index = data.x.to(device), data.edge_index.to(device)
    return model, optimizer, x, edge_index, train_loader, device

def train(model, optimizer, x, train_loader, device):
    model.train()

    total_loss = 0
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        out = model(x[n_id], adjs)
        out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)
        pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * out.size(0)

    return total_loss / data.num_nodes

from sklearn.manifold import TSNE
import numpy as np

def plot_tsne(out, data, p = 50, nitr = 1000):
    trans = TSNE(n_components=2, perplexity = p, n_iter = nitr)
    node_embeddings_2d = trans.fit_transform(out.cpu().numpy())
    alpha = 1
    label_map = {l: i for i, l in enumerate(np.unique(data.y))}
    node_colors = [label_map[target] for target in data.y.numpy()]

    plt.figure(figsize=(5, 5))
    plt.axes().set(aspect='equal')
    plt.scatter(
        node_embeddings_2d[:, 0],
        node_embeddings_2d[:, 1],
        c=node_colors,
        cmap='viridis',
        alpha=alpha
    )
    plt.show()
    
def train_all(data, nepochs):
    model, optimizer, x, edge_index, train_loader, device = prepare(data)
    print(model)
    for epoch in range(1, nepochs+1):
        loss = train(model, optimizer, x, train_loader, device)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    return model, x, edge_index

def predict_all(model, x, edge_index):
    with torch.no_grad():
        model.eval()
        out = model.full_forward(x, edge_index).cpu()
    return out

def plot_tsne_out(data, out):
    palette = {}

    for n, y in enumerate(set(data.y.numpy())):
        palette[y] = f'C{n}'
    
    embd = umap.UMAP().fit_transform(out.cpu().numpy())

    plt.figure(figsize=(5, 5))
    sns.scatterplot(x=embd.T[0], y=embd.T[1], hue=data.y.cpu().numpy(), palette=palette)
    plt.legend(bbox_to_anchor=(1,1), loc='upper left')
    plt.savefig("umap_embd_sage.png", dpi=120)

## Audiance Overlap - Corpus 2020

In [3]:
import pandas as pd
import sys
import os
sys.path.insert(0, '../../../')

from notebooks.utils import _ALEXA_DATA_PATH, load_node_features, load_level_data, create_nodes, export_model_as_feature
from train import run_experiment

In [4]:
audience_overlap_sites = load_level_data(os.path.join(_ALEXA_DATA_PATH, 'corpus_2020_audience_overlap_sites_scrapping_result.json'), level=1)
audience_overlap_sites_NODES = create_nodes(audience_overlap_sites)

print(audience_overlap_sites_NODES[:5])

01-23 12:35:26 notebooks.utils INFO     Loaded 3489 nodes with records level <= 1 and child size:16981


[('crooked.com', 'votesaveamerica.com'), ('crooked.com', 'art19.com'), ('crooked.com', 'promocodeportal.com'), ('crooked.com', 'mediamatters.org'), ('crooked.com', 'actblue.com')]


In [5]:
edge_df = pd.DataFrame(audience_overlap_sites_NODES, columns=['source', 'target'])
edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


In [6]:
edge_df.shape

(17010, 2)

In [7]:
edge_df2 = pd.DataFrame()
edge_df2['source'] = edge_df['target']
edge_df2['target'] = edge_df['source']

edge_df = pd.concat([edge_df, edge_df2]).drop_duplicates(keep = "first").reset_index()
edge_df.shape

(28779, 3)

In [8]:
nodes_in_edges = list(set(edge_df.source.unique().tolist() + edge_df.target.unique().tolist()))
print('Number of unique nodes in edges:', len(nodes_in_edges), 'Sample:', nodes_in_edges[:5])

Number of unique nodes in edges: 10161 Sample: ['mintmedicaleducation.com', 'abqjournal.com', 'bradford-delong.com', 'freedomsfinalstand.com', 'scdmvonline.com']


In [9]:
node_features_df = load_node_features()
node_features_df = node_features_df.set_index('site')
node_features_df.head()

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whistleblowersandrelators.com,,,,,
geokov.com,2238341.0,1.0,,60.0,0.9
trainingandfacilitation.ca,,,,,
plumsolutions.com.au,1023533.0,1.0,138.0,60.0,0.813
dbdailyupdate.com,145283.0,1.7,179.0,64.0,0.756


In [10]:
node_features_df = node_features_df.loc[nodes_in_edges]
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, mintmedicaleducation.com to reverbpress.news
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   7465 non-null   float64
 1   daily_pageviews_per_visitor  7466 non-null   float64
 2   daily_time_on_site           5566 non-null   float64
 3   total_sites_linking_in       9861 non-null   float64
 4   bounce_rate                  5179 non-null   float64
dtypes: float64(5)
memory usage: 476.3+ KB


In [11]:
node_features_df.alexa_rank = node_features_df.alexa_rank.fillna(1000000)
node_features_df.total_sites_linking_in = node_features_df.total_sites_linking_in.fillna(0)
node_features_df.daily_pageviews_per_visitor  = node_features_df.daily_pageviews_per_visitor.fillna(0)
node_features_df.daily_time_on_site = node_features_df.daily_time_on_site.fillna(0)
node_features_df.bounce_rate = node_features_df.bounce_rate.fillna(0)
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, mintmedicaleducation.com to reverbpress.news
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   10161 non-null  float64
 1   daily_pageviews_per_visitor  10161 non-null  float64
 2   daily_time_on_site           10161 non-null  float64
 3   total_sites_linking_in       10161 non-null  float64
 4   bounce_rate                  10161 non-null  float64
dtypes: float64(5)
memory usage: 476.3+ KB


In [12]:
import math

node_features_df['normalized_alexa_rank'] = node_features_df['alexa_rank'].apply(lambda x: 1/x if x else 0)
node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_in'].apply(lambda x: math.log2(x) if x else 0)

In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

node_features_df[['alexa_rank', 'daily_pageviews_per_visitor', 'daily_time_on_site',
       'total_sites_linking_in', 'bounce_rate', 'normalized_alexa_rank',
       'normalized_total_sites_linked_in']] = scaler.fit_transform(node_features_df[['alexa_rank', 'daily_pageviews_per_visitor', 'daily_time_on_site',
       'total_sites_linking_in', 'bounce_rate', 'normalized_alexa_rank',
       'normalized_total_sites_linked_in']])
node_features_df.head(5)

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate,normalized_alexa_rank,normalized_total_sites_linked_in
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mintmedicaleducation.com,0.092413,0.0,0.0,4e-06,0.0,9.075871e-07,0.175892
abqjournal.com,0.005778,0.044444,0.024913,0.000736,0.707,1.589991e-05,0.519191
bradford-delong.com,0.105116,0.027778,0.018224,8.6e-05,1.0,7.867358e-07,0.376267
freedomsfinalstand.com,0.092413,0.0,0.0,4e-06,0.0,9.075871e-07,0.170953
scdmvonline.com,0.008211,0.113889,0.050058,0.000165,0.32,1.116251e-05,0.419454


In [14]:
node_map = {dom:i for i, dom in enumerate(node_features_df.index)}

In [15]:
edge_df3 = pd.DataFrame()
edge_df3['source'] = edge_df['source'].map(node_map)    
edge_df3['target'] = edge_df['target'].map(node_map)
edge_df3.head(5)

Unnamed: 0,source,target
0,3705,3598
1,3705,9479
2,3705,8802
3,3705,547
4,3705,5931


In [16]:
data = create_dataset(list(zip(edge_df3['source'], edge_df3['target'])),
                     list(zip(node_features_df['alexa_rank'], 
                              node_features_df['daily_pageviews_per_visitor'], 
                              node_features_df['daily_time_on_site'],
                              node_features_df['total_sites_linking_in'], 
                              node_features_df['bounce_rate'], 
                              node_features_df['normalized_alexa_rank'],
                              node_features_df['normalized_total_sites_linked_in'])),
                     [1] * node_features_df.shape[0], None, None)

In [17]:
model, x, edge_index = train_all(data, 20)

#features = 7
SAGE(
  (convs): ModuleList(
    (0): SAGEConv(7, 128)
    (1): SAGEConv(128, 128)
  )
)
Epoch: 001, Loss: 1.7311
Epoch: 002, Loss: 1.1843
Epoch: 003, Loss: 1.1585
Epoch: 004, Loss: 1.0824
Epoch: 005, Loss: 1.0410
Epoch: 006, Loss: 0.9831
Epoch: 007, Loss: 0.9638
Epoch: 008, Loss: 0.9574
Epoch: 009, Loss: 0.9596
Epoch: 010, Loss: 0.9400
Epoch: 011, Loss: 0.9449
Epoch: 012, Loss: 0.9299
Epoch: 013, Loss: 0.9236
Epoch: 014, Loss: 0.9232
Epoch: 015, Loss: 0.9236
Epoch: 016, Loss: 0.9263
Epoch: 017, Loss: 0.9257
Epoch: 018, Loss: 0.9347
Epoch: 019, Loss: 0.9187
Epoch: 020, Loss: 0.9172


In [18]:
node_embeddings = predict_all(model, x, edge_index)
print("Shape of the embeddings: {}".format(node_embeddings.shape))

Shape of the embeddings: torch.Size([10161, 128])


In [19]:
embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings.tolist()))

In [20]:
print('Sample:', embeddings_wv['crooked.com'][:64])

Sample: [-0.012321801856160164, 0.060122665017843246, 0.038811758160591125, -0.04482794553041458, -0.015558433718979359, -0.136915385723114, 0.07201770693063736, 0.03606337308883667, -0.06561314314603806, 0.10874821245670319, -0.08791086822748184, 0.04509411379694939, 0.04942229017615318, -0.02511897310614586, -0.07731199264526367, -0.009159734472632408, 0.07530989497900009, -0.06520503014326096, 0.0943731814622879, 0.046760424971580505, 0.0345228910446167, -0.06776133179664612, 0.02670087292790413, 0.05625873804092407, -0.0027347393333911896, 0.010001253336668015, 0.039486102759838104, 0.09770621359348297, 0.05271473526954651, 0.022758491337299347, 0.09141097962856293, -0.026808716356754303, -0.048878610134124756, -0.0656418651342392, 0.06087236478924751, -0.09873615205287933, 0.05775177478790283, 0.07382437586784363, 0.07166735082864761, -0.020174100995063782, -0.08460912108421326, -0.028153546154499054, -0.0056526437401771545, -0.08874216675758362, 0.04250780865550041, -0.1152621805

In [21]:
export_model_as_feature(embeddings_wv, 'graph_sage_audience_overlap_level_pyg')

'/export/sec02/nabeel/News-Media-Peers/data/acl2020/features/graph_sage_audience_overlap_level_pyg.json'

In [22]:
run_experiment(features="graph_sage_audience_overlap_level_pyg", normalize_features=False)

+------+---------------------+---------------+--------------------+---------------------------------------+
| task | classification_mode | type_training | normalize_features |                features               |
+------+---------------------+---------------+--------------------+---------------------------------------+
| fact |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_pyg |
+------+---------------------+---------------+--------------------+---------------------------------------+


01-23 12:40:50 train        INFO     Start training...
01-23 12:40:50 train        INFO     Fold: 0
01-23 12:40:58 train        INFO     Fold: 1
01-23 12:41:04 train        INFO     Fold: 2
01-23 12:41:10 train        INFO     Fold: 3
01-23 12:41:16 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+---------------------------------------+--------------------+-------------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                features               |      Macro-F1      |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+---------------------------------------+--------------------+-------------------+-------------------+--------------------+
| fact |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_pyg | 45.853223425533955 | 54.48195576251456 | 11.40861466821886 | 0.5692665890570431 |
+------+---------------------+---------------+--------------------+---------------------------------------+--------------------+-------------------+-------------------+--------------------+
