In [3]:
import sys

import pandas as pd
import stellargraph as sg

# Make the repository root importable before pulling in project-local modules.
sys.path.insert(0, '../../../')

from notebooks.utils import (
    get_referral_sites_edges, export_model_as_feature, create_node2vec_model
)
from train import run_experiment

# Load referral sites edges for level 1

In [4]:
# Depth of the referral-sites crawl to load; the helper's log output reports
# processing level 0 and then level 1 for this setting.
level = 1

# NOTE(review): despite the "_NODES" suffix, this holds (source, target) pairs
# and is consumed as an edge list when the DataFrame is built further down.
referral_sites_NODES = get_referral_sites_edges(level=level, data_year=2020)

# Quick sanity check: show the first five pairs.
print(referral_sites_NODES[0:5])

02-26 23:33:46 notebooks.utils INFO     Processing level 0
02-26 23:33:46 notebooks.utils INFO     Node length: 3258
02-26 23:33:46 notebooks.utils INFO     Distinct node length: 3258
02-26 23:33:46 notebooks.utils INFO     Processing level 1
02-26 23:33:47 notebooks.utils INFO     Node length: 9178
02-26 23:33:47 notebooks.utils INFO     Distinct node length: 9178


[('digitaljournal.com', 'transparencymarketresearch.com'), ('gallup.com', 'strengthsquest.com'), ('other98.com', 'cancelkavanaugh.com'), ('crimethinc.com', 'sproutdistro.com'), ('puppetstringnews.com', 'bigleaguepolitics.com')]


In [5]:
# Turn the list of (source, target) pairs into a two-column edge table.
# The frame is later handed to the StellarGraph constructor as `edges`
# without any column overrides, so these column names matter.
edge_columns = ['source', 'target']
edge_df = pd.DataFrame(data=referral_sites_NODES, columns=edge_columns)

# Preview the first rows.
edge_df.head()

Unnamed: 0,source,target
0,digitaljournal.com,transparencymarketresearch.com
1,gallup.com,strengthsquest.com
2,other98.com,cancelkavanaugh.com
3,crimethinc.com,sproutdistro.com
4,puppetstringnews.com,bigleaguepolitics.com


# Create Graph

In [6]:
# Build the referral-sites graph from the edge table. `sg` (stellargraph) is
# imported in the notebook's import cell so all dependencies are declared in
# one place. With no node frame supplied, the node set is taken from the
# edge endpoints; the summary printed below reports an undirected multigraph.
G = sg.StellarGraph(edges=edge_df)

# Print the textual summary (node/edge counts, types, weights).
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 6927, Edges: 12436

 Node types:
  default: [6927]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [12436]
        Weights: all 1 (default)
        Features: none


# Create Node2Vec models

In [7]:
# Train one unweighted Node2Vec model per embedding size. `models` is iterated
# with `.items()` in the export step, yielding (model name, model) pairs; the
# captured output shows names of the form `<prefix>_unweighted_<dim>D.model`.
# NOTE(review): no random seed is passed here, so walks/embeddings may differ
# between runs -- confirm whether create_node2vec_model accepts or sets one.
embedding_dimensions = [64, 128, 256, 512, 1024]
models = create_node2vec_model(
    G,
    dimensions=embedding_dimensions,
    is_weighted=False,
    prefix='corpus_2020_referral_sites_lvl_one',
)

Start creating random walks
Number of random walks: 69270


02-26 23:43:27 gensim.models.word2vec INFO     collecting all words and their counts
02-26 23:43:27 gensim.models.word2vec INFO     PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
02-26 23:43:27 gensim.models.word2vec INFO     PROGRESS: at sentence #10000, processed 1000000 words, keeping 6926 word types
02-26 23:43:28 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 6927 word types


64 corpus_2020_referral_sites_lvl_one_unweighted_64D.model


02-26 23:43:28 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 6927 word types
02-26 23:43:28 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 6927 word types
02-26 23:43:28 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 6927 word types
02-26 23:43:28 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 6927 word types
02-26 23:43:28 gensim.models.word2vec INFO     collected 6927 word types from a corpus of 6927000 raw words and 69270 sentences
02-26 23:43:28 gensim.models.word2vec INFO     Creating a fresh vocabulary
02-26 23:43:28 gensim.utils INFO     Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 6927 unique words (100.0%% of original 6927, drops 0)', 'datetime': '2022-02-26T23:43:28.485086', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 06:23:56) \n[Clang 10.0.0 ]', 'pla

Successful save of model: corpus_2020_referral_sites_lvl_one_unweighted_64D.model!
128 corpus_2020_referral_sites_lvl_one_unweighted_128D.model


02-26 23:44:30 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 6927 word types
02-26 23:44:30 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 6927 word types
02-26 23:44:30 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 6927 word types
02-26 23:44:30 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 6927 word types
02-26 23:44:30 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 6927 word types
02-26 23:44:30 gensim.models.word2vec INFO     collected 6927 word types from a corpus of 6927000 raw words and 69270 sentences
02-26 23:44:30 gensim.models.word2vec INFO     Creating a fresh vocabulary
02-26 23:44:30 gensim.utils INFO     Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 6927 unique words (100.0%% of original 6927, drops 0)', 'datetime'

Successful save of model: corpus_2020_referral_sites_lvl_one_unweighted_128D.model!
256 corpus_2020_referral_sites_lvl_one_unweighted_256D.model


02-26 23:45:39 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 6927 word types
02-26 23:45:40 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 6927 word types
02-26 23:45:40 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 6927 word types
02-26 23:45:40 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 6927 word types
02-26 23:45:40 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 6927 word types
02-26 23:45:40 gensim.models.word2vec INFO     collected 6927 word types from a corpus of 6927000 raw words and 69270 sentences
02-26 23:45:40 gensim.models.word2vec INFO     Creating a fresh vocabulary
02-26 23:45:40 gensim.utils INFO     Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 6927 unique words (100.0%% of original 6927, drops 0)', 'datetime'

Successful save of model: corpus_2020_referral_sites_lvl_one_unweighted_256D.model!
512 corpus_2020_referral_sites_lvl_one_unweighted_512D.model


02-26 23:47:19 gensim.models.word2vec INFO     PROGRESS: at sentence #20000, processed 2000000 words, keeping 6927 word types
02-26 23:47:19 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 6927 word types
02-26 23:47:19 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 6927 word types
02-26 23:47:20 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 6927 word types
02-26 23:47:20 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 6927 word types
02-26 23:47:20 gensim.models.word2vec INFO     collected 6927 word types from a corpus of 6927000 raw words and 69270 sentences
02-26 23:47:20 gensim.models.word2vec INFO     Creating a fresh vocabulary
02-26 23:47:20 gensim.utils INFO     Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 6927 unique words (100.0%% of original 6927, drops 0)', 'datetime'

Successful save of model: corpus_2020_referral_sites_lvl_one_unweighted_512D.model!
1024 corpus_2020_referral_sites_lvl_one_unweighted_1024D.model


02-26 23:50:40 gensim.models.word2vec INFO     PROGRESS: at sentence #30000, processed 3000000 words, keeping 6927 word types
02-26 23:50:40 gensim.models.word2vec INFO     PROGRESS: at sentence #40000, processed 4000000 words, keeping 6927 word types
02-26 23:50:40 gensim.models.word2vec INFO     PROGRESS: at sentence #50000, processed 5000000 words, keeping 6927 word types
02-26 23:50:40 gensim.models.word2vec INFO     PROGRESS: at sentence #60000, processed 6000000 words, keeping 6927 word types
02-26 23:50:40 gensim.models.word2vec INFO     collected 6927 word types from a corpus of 6927000 raw words and 69270 sentences
02-26 23:50:40 gensim.models.word2vec INFO     Creating a fresh vocabulary
02-26 23:50:40 gensim.utils INFO     Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 6927 unique words (100.0%% of original 6927, drops 0)', 'datetime': '2022-02-26T23:50:40.587518', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 06:23:56) \n[Clang 10.0.0 ]', 'pla

Successful save of model: corpus_2020_referral_sites_lvl_one_unweighted_1024D.model!


# Export embeddings as features and run the fact/bias experiments

In [8]:
# For every trained embedding size: export the node vectors as a feature set,
# then evaluate that feature set on the 'acl2020' dataset for both tasks.
for model_name, model in models.items():
    print(f'Processing model: {model_name}')

    # One plain-list vector per graph node, keyed by the node itself.
    word_vectors = model.wv
    embeddings_wv = {node: word_vectors.get_vector(node).tolist() for node in G.nodes()}
    export_model_as_feature(embeddings_wv, model_name, data_year='2020')

    # Run 'fact' first, then 'bias'; each run is followed by its own separator
    # line ('-' after fact, '=' after bias) so the output log stays readable.
    for task, rule in (('fact', '-'), ('bias', '=')):
        run_experiment(features=model_name, dataset='acl2020', task=task)
        print('\n', rule * 50, '\n')

Processing model: corpus_2020_referral_sites_lvl_one_unweighted_64D.model
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                        |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_64D.model |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+


02-26 23:56:32 train        INFO     Start training...
02-26 23:56:33 train        INFO     Fold: 0
02-26 23:56:39 train        INFO     Fold: 1
02-26 23:56:44 train        INFO     Fold: 2
02-26 23:56:47 train        INFO     Fold: 3
02-26 23:56:51 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                        |      Macro-F1     |      Accuracy     | Flip error-rate  |        MAE         |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+-------------------+-------------------+------------------+--------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_64D.model | 54.65453796654556 | 63.67869615832363 | 9.19674039580908 | 0.4551804423748545 |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+-

02-26 23:56:55 train        INFO     Start training...
02-26 23:56:55 train        INFO     Fold: 0
02-26 23:56:59 train        INFO     Fold: 1
02-26 23:57:04 train        INFO     Fold: 2
02-26 23:57:08 train        INFO     Fold: 3
02-26 23:57:12 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                        |      Macro-F1     |      Accuracy     |  Flip error-rate  |         MAE         |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_64D.model | 69.81474803328392 | 70.66356228172293 | 8.381839348079161 | 0.37718277066356226 |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------

02-26 23:57:19 train        INFO     Start training...
02-26 23:57:19 train        INFO     Fold: 0
02-26 23:57:24 train        INFO     Fold: 1
02-26 23:57:31 train        INFO     Fold: 2
02-26 23:57:35 train        INFO     Fold: 3
02-26 23:57:39 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+--------------------+-------------------+------------------+---------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |      Macro-F1      |      Accuracy     | Flip error-rate  |         MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+--------------------+-------------------+------------------+---------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_128D.model | 54.069554448321725 | 63.09662398137369 | 9.31315483119907 | 0.46216530849825377 |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------

02-26 23:57:44 train        INFO     Start training...
02-26 23:57:44 train        INFO     Fold: 0
02-26 23:57:50 train        INFO     Fold: 1
02-26 23:57:55 train        INFO     Fold: 2
02-26 23:57:59 train        INFO     Fold: 3
02-26 23:58:03 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_128D.model | 68.82413477512324 | 69.49941792782305 | 8.963911525029102 | 0.3946449359720605 |
+------+---------+---------------------+---------------+--------------------+---------------------------------------------------

02-26 23:58:11 train        INFO     Start training...
02-26 23:58:11 train        INFO     Fold: 0
02-26 23:58:16 train        INFO     Fold: 1
02-26 23:58:21 train        INFO     Fold: 2
02-26 23:58:26 train        INFO     Fold: 3
02-26 23:58:31 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+--------------------+--------------------+------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |      Macro-F1      |      Accuracy      | Flip error-rate  |        MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+--------------------+--------------------+------------------+--------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_256D.model | 53.849599770123326 | 62.398137369033755 | 9.31315483119907 | 0.4691501746216531 |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------

02-26 23:58:39 train        INFO     Start training...
02-26 23:58:39 train        INFO     Fold: 0
02-26 23:58:46 train        INFO     Fold: 1
02-26 23:58:53 train        INFO     Fold: 2
02-26 23:58:57 train        INFO     Fold: 3
02-26 23:59:01 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+------------------+-----------------+-------------------+---------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |     Macro-F1     |     Accuracy    |  Flip error-rate  |         MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+------------------+-----------------+-------------------+---------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_256D.model | 69.3601885803015 | 69.965075669383 | 8.847497089639116 | 0.38882421420256114 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+

02-26 23:59:11 train        INFO     Start training...
02-26 23:59:11 train        INFO     Fold: 0
02-26 23:59:18 train        INFO     Fold: 1
02-26 23:59:26 train        INFO     Fold: 2
02-26 23:59:32 train        INFO     Fold: 3
02-26 23:59:41 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |      Macro-F1     |      Accuracy     |  Flip error-rate  |         MAE         |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+-------------------+---------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_512D.model | 54.29470249973304 | 62.16530849825378 | 8.731082654249127 | 0.46565774155995343 |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------

02-26 23:59:52 train        INFO     Start training...
02-26 23:59:52 train        INFO     Fold: 0
02-27 00:00:04 train        INFO     Fold: 1
02-27 00:00:14 train        INFO     Fold: 2
02-27 00:00:22 train        INFO     Fold: 3
02-27 00:00:31 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+------------------+-------------------+
| task | dataset | classification_mode | type_training | normalize_features |                         features                         |      Macro-F1     |      Accuracy     | Flip error-rate  |        MAE        |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+-------------------+-------------------+------------------+-------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_512D.model | 69.54116410044651 | 70.31431897555296 | 9.31315483119907 | 0.389988358556461 |
+------+---------+---------------------+---------------+--------------------+----------------------------------------------------------+

02-27 00:01:00 train        INFO     Start training...
02-27 00:01:00 train        INFO     Fold: 0
02-27 00:01:23 train        INFO     Fold: 1
02-27 00:01:44 train        INFO     Fold: 2
02-27 00:02:05 train        INFO     Fold: 3
02-27 00:02:25 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+-----------------------------------------------------------+--------------------+--------------------+-------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                          features                         |      Macro-F1      |      Accuracy      |  Flip error-rate  |        MAE         |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------------------+--------------------+--------------------+-------------------+--------------------+
| fact | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_1024D.model | 55.258812474972565 | 63.445867287543656 | 8.963911525029102 | 0.4551804423748545 |
+------+---------+---------------------+---------------+--------------------+---------------------------------------

02-27 00:02:47 train        INFO     Start training...
02-27 00:02:47 train        INFO     Fold: 0
02-27 00:03:09 train        INFO     Fold: 1
02-27 00:03:31 train        INFO     Fold: 2
02-27 00:03:51 train        INFO     Fold: 3
02-27 00:04:12 train        INFO     Fold: 4


+------+---------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| task | dataset | classification_mode | type_training | normalize_features |                          features                         |      Macro-F1     |      Accuracy     |  Flip error-rate  |        MAE         |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------------------+-------------------+-------------------+-------------------+--------------------+
| bias | acl2020 |  single classifier  |    combine    |       False        | corpus_2020_referral_sites_lvl_one_unweighted_1024D.model | 70.39208021305208 | 71.24563445867288 | 9.545983701979045 | 0.3830034924330617 |
+------+---------+---------------------+---------------+--------------------+-----------------------------------------------