In [21]:
import os.path as osp
import matplotlib.pyplot as plt
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
from torch.utils.data import Dataset
from torch_geometric.nn import GNNExplainer
from torch_geometric.loader import NeighborLoader
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.loader import NeighborSampler #as RawNeighborSampler
import torch.nn as nn
import torch
from torch_cluster import random_walk
import torch.nn.functional as F

def create_dataset(edges, features, labels, train_mask, test_mask):
    edge_index = torch.tensor(edges, dtype=torch.long)
    x = torch.tensor(features, dtype = torch.float)
    y = torch.tensor(labels, dtype = torch.long)

    data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)
    
    #supervised setting
    if train_mask != None:
        data.train_mask = torch.tensor(train_mask, dtype = torch.bool)
        if test_mask == None:
            data.test_mask = ~data.train_mask
        
    #semi-supervised setting
    if test_mask != None:
        data.test_mask = torch.tensor(test_mask, dtype = torch.bool)
    return data

In [22]:
class NeighborLoaderX(NeighborLoader):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        # For each node in `batch`, we sample a direct neighbor (as positive
        # example) and a random node (as negative example):
        pos_batch = random_walk(row, col, batch, walk_length=1,
                                coalesced=False)[:, 1]

        neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(), ),
                                  dtype=torch.long)

        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        batch1 = super(NeighborLoaderX, self).sample(batch)
        return batch1

In [23]:
class GCN(nn.Module):
    def __init__(self, 
                 in_channels,
                 hidden_channels,
                 #out_dim,
                 num_layers):
        super(GCN, self).__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(GCNConv(in_channels, hidden_channels))
        # # post-message-passing
        # self.post_mp = nn.Sequential(
        #     nn.Linear(hidden_channels, hidden_channels), nn.Dropout(0.25), 
        #     nn.Linear(hidden_channels, out_dim))

    def forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)

        #x = self.post_mp(x)
        return x

In [24]:
def prepare(data, hidden_channels = 128, num_layers = 3, batch_size = 256):
    print("Entered prepare()")
    train_mask = torch.ones(10161, dtype=torch.bool)
    train_loader = NeighborLoaderX(data, input_nodes=train_mask, num_neighbors=[10]*2,
                            shuffle=True,  batch_size = 256)
    print("train_loader= {}".format(train_loader))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("#features = {}".format(data.num_node_features))
    #print("Num layers ", num_layers)
    model = GCN(data.num_node_features, hidden_channels=hidden_channels, num_layers=num_layers)
    #model = GCN(data.num_node_features,64, 64)
    #model = Encoder(data.num_node_features, 64)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    x, edge_index = data.x.to(device), data.edge_index.to(device)
    return model, optimizer, x, edge_index, train_loader, device

In [33]:
def train(model, optimizer, train_loader, device):
    print("Entered train()")
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch.x.to(device), batch.edge_index.to(device))
        out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)
        pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * out.size(0)
        total_loss += float(loss)


    return total_loss / data.num_nodes

In [34]:
def train_all(data, nepochs):
    print("Entered train_all()")
    model, optimizer, x, edge_index, train_loader, device = prepare(data)
    print(model)

    for epoch in range(1, nepochs+1):
        loss = train(model, optimizer, train_loader, device)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    return model, x, edge_index


def predict_all(model, x, edge_index):
    with torch.no_grad():
        model.eval()
        #out = model.full_forward(x, edge_index).cpu()
        out = model(x, edge_index).cpu()
    return out

In [None]:
model, x, edge_index = train_all(data, 1)
### This code is giving error "too many values to unpack" at line "out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)" because it was accepting exactly 3* batchsize as was Neigbour sampler

In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, '../../../')

from notebooks.utils import _ALEXA_DATA_PATH, load_node_features, load_level_data, create_audience_overlap_nodes, export_model_as_feature
from train import run_experiment

2023-02-22 17:54:22.321332: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-22 17:54:22.328966: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [2]:
audience_overlap_sites = load_level_data(os.path.join(_ALEXA_DATA_PATH, 'corpus_2020_audience_overlap_sites_scrapping_result.json'), level=1)
audience_overlap_sites_NODES = create_audience_overlap_nodes(audience_overlap_sites)

print(audience_overlap_sites_NODES[:5])

02-22 17:54:23 notebooks.utils INFO     Loaded 3489 nodes with records level <= 1 and child size:16981


[('crooked.com', 'votesaveamerica.com'), ('crooked.com', 'art19.com'), ('crooked.com', 'promocodeportal.com'), ('crooked.com', 'mediamatters.org'), ('crooked.com', 'actblue.com')]


In [3]:
edge_df = pd.DataFrame(audience_overlap_sites_NODES, columns=['source', 'target'])
edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


In [5]:
edge_df2 = pd.DataFrame()
edge_df2['source'] = edge_df['target']
edge_df2['target'] = edge_df['source']

edge_df = pd.concat([edge_df, edge_df2]).drop_duplicates(keep = "first").reset_index()
edge_df.shape

(28779, 3)

In [6]:
nodes_in_edges = list(set(edge_df.source.unique().tolist() + edge_df.target.unique().tolist()))
print('Number of unique nodes in edges:', len(nodes_in_edges), 'Sample:', nodes_in_edges[:5])

Number of unique nodes in edges: 10161 Sample: ['leftvoice.org', 'windquest.com', 'iowahouserepublicans.com', 'uni-muenchen.de', 'everydayinbox.com']


In [7]:
node_features_df = load_node_features()
node_features_df = node_features_df.set_index('site')
node_features_df.head()

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whistleblowersandrelators.com,,,,,
geokov.com,2238341.0,1.0,,60.0,0.9
trainingandfacilitation.ca,,,,,
plumsolutions.com.au,1023533.0,1.0,138.0,60.0,0.813
dbdailyupdate.com,145283.0,1.7,179.0,64.0,0.756


In [8]:
node_features_df = node_features_df.loc[nodes_in_edges]
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, leftvoice.org to sfhoardingcleanup.com
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   7465 non-null   float64
 1   daily_pageviews_per_visitor  7466 non-null   float64
 2   daily_time_on_site           5566 non-null   float64
 3   total_sites_linking_in       9861 non-null   float64
 4   bounce_rate                  5179 non-null   float64
dtypes: float64(5)
memory usage: 476.3+ KB


In [9]:
node_features_df.alexa_rank = node_features_df.alexa_rank.fillna(1000000)
node_features_df.total_sites_linking_in = node_features_df.total_sites_linking_in.fillna(0)
node_features_df.daily_pageviews_per_visitor  = node_features_df.daily_pageviews_per_visitor.fillna(0)
node_features_df.daily_time_on_site = node_features_df.daily_time_on_site.fillna(0)
node_features_df.bounce_rate = node_features_df.bounce_rate.fillna(0)
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, leftvoice.org to sfhoardingcleanup.com
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   10161 non-null  float64
 1   daily_pageviews_per_visitor  10161 non-null  float64
 2   daily_time_on_site           10161 non-null  float64
 3   total_sites_linking_in       10161 non-null  float64
 4   bounce_rate                  10161 non-null  float64
dtypes: float64(5)
memory usage: 476.3+ KB


In [10]:
import math

node_features_df['normalized_alexa_rank'] = node_features_df['alexa_rank'].apply(lambda x: 1/x if x else 0)
node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_in'].apply(lambda x: math.log2(x) if x else 0)

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

node_features_df[['alexa_rank', 'daily_pageviews_per_visitor', 'daily_time_on_site',
       'total_sites_linking_in', 'bounce_rate', 'normalized_alexa_rank',
       'normalized_total_sites_linked_in']] = scaler.fit_transform(node_features_df[['alexa_rank', 'daily_pageviews_per_visitor', 'daily_time_on_site',
       'total_sites_linking_in', 'bounce_rate', 'normalized_alexa_rank',
       'normalized_total_sites_linked_in']])
node_features_df.head(5)

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate,normalized_alexa_rank,normalized_total_sites_linked_in
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
leftvoice.org,0.012169,0.041667,0.030219,7.3e-05,0.819,7.501467e-06,0.365282
windquest.com,0.092413,0.0,0.0,1.1e-05,0.0,9.075871e-07,0.23884
iowahouserepublicans.com,0.092413,0.0,0.0,3.3e-05,0.0,9.075871e-07,0.312062
uni-muenchen.de,0.000897,0.072222,0.034602,0.002267,0.616,0.0001029154,0.594138
everydayinbox.com,0.092413,0.0,0.0,3e-06,0.0,9.075871e-07,0.159819


In [12]:
node_map = {dom:i for i, dom in enumerate(node_features_df.index)}


In [13]:
edge_df3 = pd.DataFrame()
edge_df3['source'] = edge_df['source'].map(node_map)    
edge_df3['target'] = edge_df['target'].map(node_map)
edge_df3.head(5)

Unnamed: 0,source,target
0,4187,9957
1,4187,9775
2,4187,2812
3,4187,6849
4,4187,3867


In [28]:
data = create_dataset(list(zip(edge_df3['source'], edge_df3['target'])),
                     list(zip(node_features_df['alexa_rank'], 
                              node_features_df['daily_pageviews_per_visitor'], 
                              node_features_df['daily_time_on_site'],
                              node_features_df['total_sites_linking_in'], 
                              node_features_df['bounce_rate'], 
                              node_features_df['normalized_alexa_rank'],
                              node_features_df['normalized_total_sites_linked_in'])),
                     [1] * node_features_df.shape[0], None, None)

In [38]:
data

Data(x=[10161, 7], edge_index=[2, 28779], y=[10161])