In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, '../../../')

from notebooks.utils import _ALEXA_DATA_PATH, load_node_features, load_level_data, create_audience_overlap_nodes, export_model_as_feature
from train import run_experiment

2022-02-20 11:48:38.162559: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load audience overlap edges for level 1

In [2]:
level = 1
audience_overlap_sites = load_level_data(os.path.join(_ALEXA_DATA_PATH, 'corpus_2020_audience_overlap_sites_scrapping_result.json'), level=level)
audience_overlap_sites_NODES = create_audience_overlap_nodes(audience_overlap_sites)

print(audience_overlap_sites_NODES[:5])

02-20 11:48:39 notebooks.utils INFO     Loaded 3489 nodes with records level <= 1 and child size:16981


[('crooked.com', 'votesaveamerica.com'), ('crooked.com', 'art19.com'), ('crooked.com', 'promocodeportal.com'), ('crooked.com', 'mediamatters.org'), ('crooked.com', 'actblue.com')]


In [3]:
edge_df = pd.DataFrame(audience_overlap_sites_NODES, columns=['source', 'target'])

edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


### Find all unique nodes in edges

In [4]:
nodes_in_edges = list(set(edge_df.source.unique().tolist() + edge_df.target.unique().tolist()))
print('Number of unique nodes in edges:', len(nodes_in_edges), 'Sample:', nodes_in_edges[:5])

Number of unique nodes in edges: 10161 Sample: ['idsa.in', 'fuckyourpoliticalcorrectness.com', 'catholictradition.org', 'publicityhound.com', 'sandratrappen.com']


### 1. Load all node features

In [5]:
node_features_df = load_node_features()
node_features_df = node_features_df.set_index('site')
node_features_df.head()

Unnamed: 0_level_0,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whistleblowersandrelators.com,,,,,
geokov.com,2238341.0,1.0,,60.0,0.9
trainingandfacilitation.ca,,,,,
plumsolutions.com.au,1023533.0,1.0,138.0,60.0,0.813
dbdailyupdate.com,145283.0,1.7,179.0,64.0,0.756


# Subset node_features

In [6]:
node_features_df = node_features_df.loc[nodes_in_edges]

In [7]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, idsa.in to gbtimes.com
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   7465 non-null   float64
 1   daily_pageviews_per_visitor  7466 non-null   float64
 2   daily_time_on_site           5566 non-null   float64
 3   total_sites_linking_in       9861 non-null   float64
 4   bounce_rate                  5179 non-null   float64
dtypes: float64(5)
memory usage: 476.3+ KB


### 2. Fill all missing alexa_rank and total_sites_linking_in with 0 

In [8]:
node_features_df.alexa_rank = node_features_df.alexa_rank.fillna(0)
node_features_df.total_sites_linking_in = node_features_df.total_sites_linking_in.fillna(0)
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10161 entries, idsa.in to gbtimes.com
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   alexa_rank                   10161 non-null  float64
 1   daily_pageviews_per_visitor  7466 non-null   float64
 2   daily_time_on_site           5566 non-null   float64
 3   total_sites_linking_in       10161 non-null  float64
 4   bounce_rate                  5179 non-null   float64
dtypes: float64(5)
memory usage: 476.3+ KB


### 3. Normalizing features

In [9]:
import math

node_features_df['normalized_alexa_rank'] = node_features_df['alexa_rank'].apply(lambda x: 1/x if x else 0)
node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_in'].apply(lambda x: math.log2(x) if x else 0)

# Create Graph

In [10]:
import stellargraph as sg

G = sg.StellarGraph(nodes=node_features_df.loc[nodes_in_edges, ['normalized_alexa_rank', 'normalized_total_sites_linked_in']], edges=edge_df)

print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 10161, Edges: 17010

 Node types:
  default: [10161]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [17010]
        Weights: all 1 (default)
        Features: none


# Unsupervised Attrib2Vec

In [11]:
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

# 1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.

nodes = list(G.nodes())
number_of_walks = 1
length = 5

# 2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.
unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)

# 3. Create a node pair generator:
batch_size = 50
epochs = 4
num_samples = [10, 5]

generator = Attri2VecLinkGenerator(G, batch_size)
train_gen = generator.flow(unsupervised_samples)

layer_sizes = [128]
attri2vec = Attri2Vec(layer_sizes=layer_sizes, generator=generator, bias=False, normalize=None)

# Build the model and expose input and output sockets of attri2vec, for node pair inputs:
x_inp, x_out = attri2vec.in_out_tensors()

prediction = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method="ip")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [12]:
history = model.fit(train_gen, epochs=32, verbose=2, use_multiprocessing=False, workers=1, shuffle=True)

Epoch 1/32
1626/1626 - 15s - loss: 0.6946 - binary_accuracy: 0.5256 - 15s/epoch - 9ms/step
Epoch 2/32
1626/1626 - 15s - loss: 0.6715 - binary_accuracy: 0.5395 - 15s/epoch - 9ms/step
Epoch 3/32
1626/1626 - 14s - loss: 0.6645 - binary_accuracy: 0.5394 - 14s/epoch - 9ms/step
Epoch 4/32
1626/1626 - 15s - loss: 0.6618 - binary_accuracy: 0.5364 - 15s/epoch - 9ms/step
Epoch 5/32
1626/1626 - 15s - loss: 0.6590 - binary_accuracy: 0.5404 - 15s/epoch - 9ms/step
Epoch 6/32
1626/1626 - 15s - loss: 0.6567 - binary_accuracy: 0.5432 - 15s/epoch - 9ms/step
Epoch 7/32
1626/1626 - 14s - loss: 0.6521 - binary_accuracy: 0.5440 - 14s/epoch - 8ms/step
Epoch 8/32
1626/1626 - 14s - loss: 0.6510 - binary_accuracy: 0.5484 - 14s/epoch - 9ms/step
Epoch 9/32
1626/1626 - 14s - loss: 0.6494 - binary_accuracy: 0.5506 - 14s/epoch - 9ms/step
Epoch 10/32
1626/1626 - 14s - loss: 0.6504 - binary_accuracy: 0.5516 - 14s/epoch - 9ms/step
Epoch 11/32
1626/1626 - 15s - loss: 0.6475 - binary_accuracy: 0.5524 - 15s/epoch - 9ms/st

In [13]:
x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

node_gen = Attri2VecNodeGenerator(G, batch_size).flow(node_features_df.index.tolist())
node_embeddings = embedding_model.predict(node_gen, workers=1, verbose=1)

embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings.tolist()))

print('Sample:', embeddings_wv['crooked.com'][:10], len(embeddings_wv['crooked.com']))

Sample: [0.024228185415267944, 0.0031242668628692627, 3.06463130073098e-06, 0.011180847883224487, 0.019945353269577026, 8.418160746259673e-07, 1.2754874489928625e-07, 0.013223797082901001, 7.395060947601451e-06, 0.0068540871143341064] 128


# Export embeddings as feature

In [14]:
export_model_as_feature(embeddings_wv, f'attrib2vec_audience_overlap_level_{level}_epochs_{epochs}')

'/Users/panayot/Documents/News-Media-Peers/data/acl2020/features/attrib2vec_audience_overlap_level_1_epochs_4.json'

In [15]:
run_experiment(features=f'attrib2vec_audience_overlap_level_{level}_epochs_{epochs}')

+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | attrib2vec_audience_overlap_level_1_epochs_4 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+


02-20 12:01:22 train        INFO     Start training...
02-20 12:01:22 train        INFO     Fold: 0
02-20 12:01:31 train        INFO     Fold: 1
02-20 12:01:36 train        INFO     Fold: 2
02-20 12:01:40 train        INFO     Fold: 3
02-20 12:01:44 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+--------------------+-------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |      Macro-F1     |      Accuracy      |  Flip error-rate   |        MAE        |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+--------------------+-------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | attrib2vec_audience_overlap_level_1_epochs_4 | 33.69612663514097 | 54.249126891734576 | 15.250291036088475 | 0.610011641443539 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+-------

In [16]:
run_experiment(features=f'attrib2vec_audience_overlap_level_{level}_epochs_{epochs}', task='bias')

+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | attrib2vec_audience_overlap_level_1_epochs_4 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+


02-20 12:01:50 train        INFO     Start training...
02-20 12:01:50 train        INFO     Fold: 0
02-20 12:01:55 train        INFO     Fold: 1
02-20 12:02:00 train        INFO     Fold: 2
02-20 12:02:04 train        INFO     Fold: 3
02-20 12:02:08 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+--------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                   features                   |      Macro-F1     |      Accuracy      |  Flip error-rate   |        MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+--------------------+--------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | attrib2vec_audience_overlap_level_1_epochs_4 | 35.22845851931143 | 45.401629802095464 | 12.339930151338766 | 0.6693830034924331 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------+-------------------+--------------------+---