In [1]:
import os
import sys

import pandas as pd
import stellargraph as sg

sys.path.insert(0, '../../../')

from notebooks.utils import (
    _ALEXA_DATA_PATH, load_level_data,
    create_overlap_nodes, export_model_as_feature, create_node2vec_model
)
from train import run_experiment

2022-01-14 11:28:16.563492: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load audience overlap edges for level 1

In [2]:
# Depth passed to the loader; the loader's log line reports records with "level <= 1".
level = 1

# Scraped audience-overlap results stored under the Alexa data directory.
overlap_json_path = os.path.join(
    _ALEXA_DATA_PATH,
    'corpus_2020_audience_overlap_sites_scrapping_result.json',
)
audience_overlap_sites = load_level_data(overlap_json_path, level=level)

# Turn the loaded records into pairs of sites (used below as source/target edges).
audience_overlap_sites_NODES = create_overlap_nodes(audience_overlap_sites)

# Quick sanity check: show the first five pairs.
print(audience_overlap_sites_NODES[:5])

01-14 11:28:18 notebooks.utils INFO     Loaded 3489 nodes with records level <= 1 and child size:16981


[('crooked.com', 'votesaveamerica.com'), ('crooked.com', 'art19.com'), ('crooked.com', 'promocodeportal.com'), ('crooked.com', 'mediamatters.org'), ('crooked.com', 'actblue.com')]


In [3]:
# One row per overlap pair: first element is the source site, second the target.
edge_columns = ['source', 'target']
edge_df = pd.DataFrame(data=audience_overlap_sites_NODES, columns=edge_columns)

# Display the first rows of the edge list.
edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


# Create Graph

In [4]:
# Build the graph from the source/target edge list. `sg` (stellargraph) is
# imported with the other dependencies in the notebook's import cell, so all
# requirements are visible in one place.
G = sg.StellarGraph(edges=edge_df)

# Summarise node/edge counts and types as a sanity check.
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 10161, Edges: 17010

 Node types:
  default: [10161]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [17010]
        Weights: all 1 (default)
        Features: none


# Create Node2Vec models

In [5]:
# Embedding sizes to train; one unweighted model is produced per size.
embedding_dimensions = [64, 128, 256, 512, 1024]

# `prefix` is the leading part of each saved model's file name (see the
# "Successful save of model" messages in the output).
models = create_node2vec_model(
    G,
    dimensions=embedding_dimensions,
    is_weighted=False,
    prefix='corpus_2020_audience_overlap_lvl_one',
)

Start creating random walks
Number of random walks: 101610


01-14 11:38:30 gensim.models.word2vec INFO     collecting all words and their counts
01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #10000, processed 1000000 words, keeping 10149 word types
01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 10154 word types


64 corpus_2020_audience_overlap_lvl_one_unweighted_64D.model


01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 10161 word types
01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 10161 word types
01-14 11:38:30 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 10161 word types
01-14 11:38:31 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 10161 word types
01-14 11:38:31 gensim.models.word2vec INFO     PROGRESS: at sentence #70000, processed 7000000 words, keeping 10161 word types
01-14 11:38:31 gensim.models.word2vec INFO     PROGRESS: at sentence #80000, processed 8000000 words, keeping 10161 word types
01-14 11:38:31 gensim.models.word2vec INFO     PROGRESS: at sentence #90000, processed 9000000 words, keeping 10161 word types
01-14 11:38:31 gensim.models.word2vec INFO     PROGRESS: at sentence #100000, processed 10000000 words, keeping

Successful save of model: corpus_2020_audience_overlap_lvl_one_unweighted_64D.model!
128 corpus_2020_audience_overlap_lvl_one_unweighted_128D.model


01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 10154 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #70000, processed 7000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #80000, processed 8000000 words, keeping 10161 word types
01-14 11:39:51 gensim.models.word2vec INFO     PROGRESS: at sentence #90000, processed 9000000 words, keeping 1

Successful save of model: corpus_2020_audience_overlap_lvl_one_unweighted_128D.model!
256 corpus_2020_audience_overlap_lvl_one_unweighted_256D.model


01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 10154 word types
01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 10161 word types
01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 10161 word types
01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 10161 word types
01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 10161 word types
01-14 11:41:23 gensim.models.word2vec INFO     PROGRESS: at sentence #70000, processed 7000000 words, keeping 10161 word types
01-14 11:41:24 gensim.models.word2vec INFO     PROGRESS: at sentence #80000, processed 8000000 words, keeping 10161 word types
01-14 11:41:24 gensim.models.word2vec INFO     PROGRESS: at sentence #90000, processed 9000000 words, keeping 1

Successful save of model: corpus_2020_audience_overlap_lvl_one_unweighted_256D.model!
512 corpus_2020_audience_overlap_lvl_one_unweighted_512D.model


01-14 11:44:06 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #70000, processed 7000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #80000, processed 8000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #90000, processed 9000000 words, keeping 10161 word types
01-14 11:44:07 gensim.models.word2vec INFO     PROGRESS: at sentence #100000, processed 10000000 words, keeping

Successful save of model: corpus_2020_audience_overlap_lvl_one_unweighted_512D.model!
1024 corpus_2020_audience_overlap_lvl_one_unweighted_1024D.model


01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #70000, processed 7000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #80000, processed 8000000 words, keeping 10161 word types
01-14 11:48:30 gensim.models.word2vec INFO     PROGRESS: at sentence #90000, processed 9000000 words, keeping 10161 word types
01-14 11:48:31 gensim.models.word2vec INFO     PROGRESS: at sentence #100000, processed 10000000 words, keeping

Successful save of model: corpus_2020_audience_overlap_lvl_one_unweighted_1024D.model!


# Export embeddings as features and run the fact/bias experiments

In [6]:
# Tasks to evaluate for every model, each paired with the character used for
# the separator line printed after that task's results.
task_separators = (('fact', '-'), ('bias', '='))

for model_name, model in models.items():
    print(f'Processing model: {model_name}')

    # Look up the embedding of every graph node and store it as a plain list.
    vector_of = model.wv.get_vector
    embeddings_wv = {site: vector_of(site).tolist() for site in G.nodes()}

    # Persist the embeddings as a feature set named after the model.
    export_model_as_feature(embeddings_wv, model_name, data_year='2020')

    # Evaluate the exported feature on each task in turn.
    for task, rule_char in task_separators:
        run_experiment(features=model_name, dataset='acl2020', task=task)
        print('\n', rule_char * 50, '\n')

Processing model: corpus_2020_audience_overlap_lvl_one_unweighted_64D.model
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+
| task | classification_mode | type_training | normalize_features |                          features                         |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_64D.model |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+


01-14 11:55:16 train        INFO     Start training...
01-14 11:55:16 train        INFO     Fold: 0
01-14 11:55:21 train        INFO     Fold: 1
01-14 11:55:23 train        INFO     Fold: 2
01-14 11:55:25 train        INFO     Fold: 3
01-14 11:55:27 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| task | classification_mode | type_training | normalize_features |                          features                         |      Macro-F1     |      Accuracy     |  Flip error-rate  |         MAE         |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_64D.model | 54.94154975237065 | 64.26076833527358 | 8.149010477299184 | 0.43888242142025613 |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-------------

01-14 11:55:29 train        INFO     Start training...
01-14 11:55:29 train        INFO     Fold: 0
01-14 11:55:33 train        INFO     Fold: 1
01-14 11:55:35 train        INFO     Fold: 2
01-14 11:55:38 train        INFO     Fold: 3
01-14 11:55:40 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-----------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                         |      Macro-F1     |     Accuracy    |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-----------------+-------------------+--------------------+
| bias |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_64D.model | 69.05883036510843 | 70.081490104773 | 7.683352735739232 | 0.3760186263096624 |
+------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-----------------+-------

01-14 11:55:45 train        INFO     Start training...
01-14 11:55:45 train        INFO     Fold: 0
01-14 11:55:48 train        INFO     Fold: 1
01-14 11:55:51 train        INFO     Fold: 2
01-14 11:55:53 train        INFO     Fold: 3
01-14 11:55:55 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_128D.model | 55.90694998794245 | 64.14435389988358 | 8.614668218859139 | 0.4447031431897555 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+------------

01-14 11:55:58 train        INFO     Start training...
01-14 11:55:58 train        INFO     Fold: 0
01-14 11:56:02 train        INFO     Fold: 1
01-14 11:56:05 train        INFO     Fold: 2
01-14 11:56:08 train        INFO     Fold: 3
01-14 11:56:10 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |      Accuracy     | Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| bias |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_128D.model | 68.57006360857841 | 69.49941792782305 | 9.31315483119907 | 0.3981373690337602 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+----------------

01-14 11:56:19 train        INFO     Start training...
01-14 11:56:19 train        INFO     Fold: 0
01-14 11:56:25 train        INFO     Fold: 1
01-14 11:56:30 train        INFO     Fold: 2
01-14 11:56:34 train        INFO     Fold: 3
01-14 11:56:39 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |      Accuracy     | Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_256D.model | 55.41786090787247 | 64.49359720605355 | 9.19674039580908 | 0.4470314318975553 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+----------------

01-14 11:56:46 train        INFO     Start training...
01-14 11:56:46 train        INFO     Fold: 0
01-14 11:56:53 train        INFO     Fold: 1
01-14 11:56:58 train        INFO     Fold: 2
01-14 11:57:03 train        INFO     Fold: 3
01-14 11:57:08 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |      Accuracy     |  Flip error-rate  |         MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| bias |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_256D.model | 70.47128237259815 | 71.36204889406287 | 8.614668218859139 | 0.37252619324796277 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+--------

01-14 11:57:24 train        INFO     Start training...
01-14 11:57:24 train        INFO     Fold: 0
01-14 11:57:33 train        INFO     Fold: 1
01-14 11:57:40 train        INFO     Fold: 2
01-14 11:57:47 train        INFO     Fold: 3
01-14 11:57:54 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_512D.model | 53.60886198138948 | 62.74738067520372 | 8.498253783469151 | 0.4575087310826543 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+------------

01-14 11:58:03 train        INFO     Start training...
01-14 11:58:03 train        INFO     Fold: 0
01-14 11:58:12 train        INFO     Fold: 1
01-14 11:58:18 train        INFO     Fold: 2
01-14 11:58:25 train        INFO     Fold: 3
01-14 11:58:31 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+------------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                          features                          |      Macro-F1     |     Accuracy     |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+------------------+-------------------+--------------------+
| bias |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_512D.model | 71.14307602895335 | 71.9441210710128 | 7.566938300349244 | 0.3562281722933644 |
+------+---------------------+---------------+--------------------+------------------------------------------------------------+-------------------+----------------

01-14 11:58:54 train        INFO     Start training...
01-14 11:58:54 train        INFO     Fold: 0
01-14 11:59:14 train        INFO     Fold: 1
01-14 11:59:32 train        INFO     Fold: 2
01-14 11:59:53 train        INFO     Fold: 3
01-14 12:00:12 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-------------------+-------------------+-------------------+
| task | classification_mode | type_training | normalize_features |                           features                          |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE        |
+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-------------------+-------------------+-------------------+
| fact |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_1024D.model | 57.67208611044041 | 65.65774155995344 | 7.683352735739232 | 0.420256111757858 |
+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-----------

01-14 12:00:33 train        INFO     Start training...
01-14 12:00:33 train        INFO     Fold: 0
01-14 12:00:53 train        INFO     Fold: 1
01-14 12:01:11 train        INFO     Fold: 2
01-14 12:01:29 train        INFO     Fold: 3
01-14 12:01:50 train        INFO     Fold: 4


+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| task | classification_mode | type_training | normalize_features |                           features                          |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| bias |  single classifier  |    combine    |       False        | corpus_2020_audience_overlap_lvl_one_unweighted_1024D.model | 72.36545847488097 | 73.22467986030267 | 7.916181606519208 | 0.3469150174621653 |
+------+---------------------+---------------+--------------------+-------------------------------------------------------------+-------------------+-------