In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, '../../../')

from notebooks.utils import _ALEXA_DATA_PATH, load_node_features, load_level_data, create_overlap_nodes, export_model_as_feature
from train import run_experiment

2022-01-30 12:27:09.814478: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load audience overlap edges for level 2

In [2]:
level=2
audience_overlap_sites = load_level_data(os.path.join(_ALEXA_DATA_PATH, 'corpus_2020_audience_overlap_sites_scrapping_result.json'), level=level)
audience_overlap_sites_NODES = create_overlap_nodes(audience_overlap_sites)

print(audience_overlap_sites_NODES[:5])

01-30 12:27:11 notebooks.utils INFO     Loaded 10161 nodes with records level <= 2 and child size:49342


[('crooked.com', 'votesaveamerica.com'), ('crooked.com', 'art19.com'), ('crooked.com', 'promocodeportal.com'), ('crooked.com', 'mediamatters.org'), ('crooked.com', 'actblue.com')]


In [3]:
edge_df = pd.DataFrame(audience_overlap_sites_NODES, columns=['source', 'target'])

edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


### Find all unique nodes in edges

In [4]:
nodes_in_edges = list(set(edge_df.source.unique().tolist() + edge_df.target.unique().tolist()))
print('Number of unique nodes in edges:', len(nodes_in_edges), 'Sample:', nodes_in_edges[:5])

Number of unique nodes in edges: 26573 Sample: ['absolutadventure.com', 'jalopnik.com', 'longbeachclothing.com', 'tel-avivtimes.com', 'survivingcipro.com']


### 1. Load all node features

In [5]:
node_features_df = load_node_features()
node_features_df = node_features_df.set_index('site')
node_features_df.head()

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whistleblowersandrelators.com,,,,,
geokov.com,2238341.0,1.0,,60.0,0.9
trainingandfacilitation.ca,,,,,
plumsolutions.com.au,1023533.0,1.0,138.0,60.0,0.813
dbdailyupdate.com,145283.0,1.7,179.0,64.0,0.756


# Subset node_features

In [6]:
node_features_df = node_features_df.loc[nodes_in_edges]

In [7]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26573 entries, absolutadventure.com to aidc-iacl.org
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   17677 non-null  float64
 1   daily_pageviews_per_visitor  17684 non-null  float64
 2   daily_time_on_site           11799 non-null  float64
 3   total_sites_linking_in       25446 non-null  float64
 4   bounce_rate                  10665 non-null  float64
dtypes: float64(5)
memory usage: 1.2+ MB


### 2. Fill all missing alexa_rank and total_sites_linking_in with 0 

In [8]:
node_features_df.alexa_rank = node_features_df.alexa_rank.fillna(0)
node_features_df.total_sites_linking_in = node_features_df.total_sites_linking_in.fillna(0)
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26573 entries, absolutadventure.com to aidc-iacl.org
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   26573 non-null  float64
 1   daily_pageviews_per_visitor  17684 non-null  float64
 2   daily_time_on_site           11799 non-null  float64
 3   total_sites_linking_in       26573 non-null  float64
 4   bounce_rate                  10665 non-null  float64
dtypes: float64(5)
memory usage: 1.2+ MB


### 3. Normalizing features

In [9]:
import math

node_features_df['normalized_alexa_rank'] = node_features_df['alexa_rank'].apply(lambda x: 1/x if x else 0)
node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_in'].apply(lambda x: math.log2(x) if x else 0)

# Create Graph

In [10]:
import stellargraph as sg

G = sg.StellarGraph(nodes=node_features_df.loc[nodes_in_edges, ['normalized_alexa_rank', 'normalized_total_sites_linked_in']], edges=edge_df)

print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 26573, Edges: 49372

 Node types:
  default: [26573]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [49372]
        Weights: all 1 (default)
        Features: none


# Unsupervised GraphSAGE
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

In [11]:
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

# 1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.

nodes = list(G.nodes())
number_of_walks = 1
length = 5

# 2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.
unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)

# 3. Create a node pair generator:
batch_size = 128
epochs = 4
num_samples = [10, 5]

graphsage_link_generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_graphsage_link_gen = graphsage_link_generator.flow(unsupervised_samples)

layer_sizes = [128, 512]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=graphsage_link_generator, bias=True, dropout=0.0, normalize="l2"
)

# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.in_out_tensors()

prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

graphsage_model = keras.Model(inputs=x_inp, outputs=prediction)

graphsage_model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.categorical_crossentropy,
    metrics=['acc'],
)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [12]:
history = graphsage_model.fit(
    train_graphsage_link_gen,
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)


# Epoch 1/4
#    2/5047 [..............................] - ETA: 3:09 - loss: 0.7847 - binary_accuracy: 0.515 - ETA: 9:20 - loss: 0.7903 - binary_accuracy: 0.5039WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0129s vs `on_train_batch_end` time: 0.2059s). Check your callbacks.
# 5047/5047 [==============================] - 548s 109ms/step - loss: 0.5881 - binary_accuracy: 0.7075 8:59 - ETA: 8:3 - ETA: 8:01 - - ETA: 7:50 - loss: 0.6312  - ETA: 7:35 - loss: 0.6251 - bin - ETA: 7:13 - loss: 0.6196 - binary_acc - ETA: 7:11 - loss: 0.6192 -  - ETA: 7:08 - loss: 0.6188 -  - ETA: 7:04 - loss: 0.6183 - binary_accu - ETA: 7:04 - loss: 0.6179 - binary_a - ETA: 7:01 - loss: 0.6174 -  - ETA: 6:52 - los - ETA: 6:48 - loss: 0.6148 - binary_accuracy: 0. - ETA: 6:47 - loss: 0.6146 - - ETA: 6:45 - loss: 0.6 - ETA: 6:41  - ETA: 6:31 - loss: 0.6121 - binary_accuracy: 0.6 - ETA: 6:30 - loss: 0. - ETA: 6:26 - los - ETA: 6:02 - loss: 0.608 - ETA: 5:58 -  - ETA: 5:54  - ETA: 4:32 - loss: 0.6004 -  - ETA: 4:30 - loss: 0.6001 - binary_accuracy:  - ETA: 4:29 - loss: 0.6001 - binary_accuracy: - ETA: 4:28 - loss: 0.6001 - binary_accu - ETA: 10s - loss: 0.5883 - bina - ETA: 9s - loss: 0.5883 - binary_accuracy: 0.7 - ETA: 9s - loss: 0.5883 - binary 
# Epoch 2/4
# 5047/5047 [==============================] - 547s 108ms/step - loss: 0.5730 - binary_accuracy: 0.7293 9:22 - loss: 0.5795 - binary_accur - ETA: 8:40 - loss: 0.5779 - binary_accuracy - ETA: 8:38 - loss: 0.5780 - binary_accuracy: 0. - ETA: 8:37 - loss: 0.5778 - binary_accuracy - ETA: 8:36 - loss: 0.5779 - b - ETA:  - ETA: 5:54 - loss: 0.57 - ETA: 5:31 - lo - ETA: 5:02 - loss: 0.57 - ETA: 4:58 - loss: 0.5736 - binary_accuracy: 0.7 - ETA: 4:58 - loss: 0.5737 - - ETA: 4:48 - loss: 0.5738 - binary_accurac - ETA: 4:48 - l - ETA:  - ETA: 4:32 - loss: - ETA: 4:28 - loss: 0.5738 - binary_accuracy: 0.7 - ETA: 4:28 - loss: 0.5738  -  - ETA: 3:46 - loss: 0.5737 - binary_accuracy: 0.726 - ET - ETA: 3:41 - loss: 0.5737 - binary_ac - ETA: 3:39 -  - ETA: 3:34 - loss: 0.5737 - binary_ac - E - ETA: 2:50 - loss: 0.5736 - binary_accuracy: 0. - ETA: 2:24 - loss: 0.5735 - binary_accuracy: 0. - ETA: 2:23 - -
# Epoch 3/4
# 5047/5047 [==============================] - 550s 109ms/step - loss: 0.5698 - binary_accuracy: 0.7398
# Epoch 4/4
# 5047/5047 [==============================] - 546s 108ms/step - loss: 0.5698 - binary_accuracy: 0.7398 10:00 - loss:  - ETA: 4:31 - lo - ETA: 4:27 - loss: 0.5700 - binary_accu - - ETA: 4:07 - loss: 0.5699 - binary_accura - ETA: 3:59 - ETA: 3:55 - loss - ETA:  - ETA: 3:39 - loss: 0.5701 - binary_accuracy - ETA: 3:38 - loss: 0.5701 - binary_ac - ETA: 3: - ETA: 3:31 - loss: 0.5701 - binary_acc - ETA: 3:30 - loss: 0.5702 - bin - ETA: 3:27 - loss: 0 - ETA:  - ETA: 2:53 - lo - E - ETA: 2:37 - l - ETA: 2:32 - loss:  - ETA: 2:28 - loss: 0.5703 - binary_accuracy: 0. - ETA: 2:28 - loss: 0.5703 - binary_accura - ETA: 2:27 - loss: 0.57 - ETA: 2:23 - loss: 0.5702 - b - ETA: 2:21 -  - E - ETA: 54s - loss: 0.5698 - binary_a - ETA: 4s - loss: 0.5


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [13]:
embedding_model = keras.Model(inputs=x_inp[0::2], outputs=x_out[0])

from stellargraph.mapper import GraphSAGENodeGenerator

node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)

embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings.tolist()))

print('Sample:', embeddings_wv['crooked.com'][:10])

Sample: [-0.023203831166028976, -0.017092309892177582, 0.009941032156348228, 0.06367132067680359, -0.01949622668325901, 0.07100989669561386, 0.06012813746929169, -0.016986969858407974, -0.03124253638088703, 0.013894619420170784]


# Export embeddings as feature

In [14]:
export_model_as_feature(embeddings_wv, f'graph_sage_audience_overlap_level_{level}_epochs_{epochs}')

'/Users/panayot/Documents/News-Media-Peers/data/acl2020/features/graph_sage_audience_overlap_level_2_epochs_4.json'

In [15]:
run_experiment(features=f'graph_sage_audience_overlap_level_{level}_epochs_{epochs}')

+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_2_epochs_4 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+


01-30 13:00:32 train        INFO     Start training...
01-30 13:00:32 train        INFO     Fold: 0
01-30 13:01:00 train        INFO     Fold: 1
01-30 13:01:14 train        INFO     Fold: 2
01-30 13:01:38 train        INFO     Fold: 3
01-30 13:01:53 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+--------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |      Macro-F1      |      Accuracy     |  Flip error-rate   |        MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+--------------------+--------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_2_epochs_4 | 30.949565747612322 | 53.66705471478463 | 17.229336437718278 | 0.6356228172293364 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+---

In [16]:
run_experiment(features=f'graph_sage_audience_overlap_level_{level}_epochs_{epochs}', task='bias')

+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_2_epochs_4 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+


01-30 13:02:36 train        INFO     Start training...
01-30 13:02:36 train        INFO     Fold: 0
01-30 13:03:01 train        INFO     Fold: 1
01-30 13:03:32 train        INFO     Fold: 2
01-30 13:03:48 train        INFO     Fold: 3
01-30 13:04:09 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+--------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |      Macro-F1      |      Accuracy     |  Flip error-rate   |        MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+--------------------+--------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | graph_sage_audience_overlap_level_2_epochs_4 | 33.739040145866774 | 42.49126891734576 | 14.202561117578579 | 0.7171129220023282 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+--------------------+-------------------+---