In [1]:
import sys
sys.path.insert(0, '../../')

In [2]:
from notebooks.utils import load_corpus, load_splits, load_node_features
import pandas as pd

2022-01-14 01:44:58.330551: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load corpus, features and splits

In [3]:
df_corpus = pd.DataFrame(load_corpus(data_year='2018'))
df_features = load_node_features()
splits = load_splits(data_year='2018')

df_corpus.head()

Unnamed: 0,source_url,source_url_normalized,ref,fact,bias
0,http://www.villagevoice.com/,villagevoice.com,http://mediabiasfactcheck.com/the-village-voice/,high,left
1,https://insideclimatenews.org/,insideclimatenews.org,https://mediabiasfactcheck.com/insideclimate-n...,high,left-center
2,http://www.fury.news/,fury.news,http://mediabiasfactcheck.com/fury-news/,low,extreme-right
3,http://now8news.com/,now8news.com,http://mediabiasfactcheck.com/now8news/,low,center
4,http://constitution.com/,constitution.com,http://mediabiasfactcheck.com/the-constitution/,low,extreme-right


In [4]:
df_features.head()

Unnamed: 0,site,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate
0,whistleblowersandrelators.com,,,,,
1,geokov.com,2238341.0,1.0,,60.0,0.9
2,trainingandfacilitation.ca,,,,,
3,plumsolutions.com.au,1023533.0,1.0,138.0,60.0,0.813
4,dbdailyupdate.com,145283.0,1.7,179.0,64.0,0.756


In [5]:
splits.keys()

dict_keys(['0', '1', '2', '3', '4'])

In [6]:
total_df = pd.merge(df_features, df_corpus[['source_url_normalized', 'fact', 'bias']], left_on='site', right_on='source_url_normalized', how='right')
total_df.head()

Unnamed: 0,site,alexa_rank,daily_pageviews_per_visitor,daily_time_on_site,total_sites_linking_in,bounce_rate,source_url_normalized,fact,bias
0,villagevoice.com,83035.0,1.5,125.0,13134.0,0.659,villagevoice.com,high,left
1,insideclimatenews.org,131049.0,1.3,121.0,1017.0,0.797,insideclimatenews.org,high,left-center
2,fury.news,2659140.0,1.0,,35.0,,fury.news,low,extreme-right
3,now8news.com,2881397.0,2.0,,90.0,,now8news.com,low,center
4,constitution.com,2260170.0,2.0,93.0,200.0,,constitution.com,low,extreme-right


# Fill the missing values with mean

In [7]:
column_means = total_df.mean()
print(column_means)

alexa_rank                     918036.260486
daily_pageviews_per_visitor         1.759603
daily_time_on_site                159.245259
total_sites_linking_in           4632.524171
bounce_rate                         0.685276
dtype: float64


  column_means = total_df.mean()


In [8]:
total_df = total_df.fillna(column_means)
total_df = total_df.drop(['site', 'alexa_rank', 'daily_pageviews_per_visitor'], axis=1)
total_df = total_df[['source_url_normalized', 'daily_time_on_site', 'total_sites_linking_in', 'bounce_rate', 'bias', 'fact']]

total_df.head()

Unnamed: 0,source_url_normalized,daily_time_on_site,total_sites_linking_in,bounce_rate,bias,fact
0,villagevoice.com,125.0,13134.0,0.659,left,high
1,insideclimatenews.org,121.0,1017.0,0.797,left-center,high
2,fury.news,159.245259,35.0,0.685276,extreme-right,low
3,now8news.com,159.245259,90.0,0.685276,center,low
4,constitution.com,93.0,200.0,0.685276,extreme-right,low


In [9]:
from typing import Dict, List
import numpy as np
from sklearn.semi_supervised import LabelPropagation
from train import calculate_metrics

def train_model(splits: Dict[str, Dict[str, List[str]]], features: Dict[str, Dict[str, List[float]]], task):
    all_urls = []
    actual = []
    predicted = []

    if task=="fact":
        other_task = "bias"

    if task=="bias":
        other_task = "fact"

    i = 0
    num_folds = len(splits)
    for f in range(num_folds):
        # get the training and testing media for the current fold
        urls = {
            "train": splits[str(f)]["train"],
            "test": splits[str(f)]["test"],
        }

        all_urls.extend(splits[str(f)]["test"])


        # concatenate the different features/labels for the training sources
        X_train = features[features["source_url_normalized"].isin(urls["train"])]
        X_train = X_train.drop(['source_url_normalized', task, other_task], axis = 1)
        #print(X_train.head())
        y_train = np.asarray(features[features["source_url_normalized"].isin(urls["train"])][task])
        #print(y_train)

        X_test = features[features["source_url_normalized"].isin(urls["test"])]
        X_test = X_test.drop(['source_url_normalized', task, other_task], axis = 1)
        y_test = np.asarray(features[features["source_url_normalized"].isin(urls["test"])][task])

        clf = LabelPropagation()

        # train the classifier using the training data
        clf.fit(X_train, y_train)
        print(clf.score(X_test, y_test))

        # generate predictions
        pred = clf.predict(X_test)

        # generate probabilites
        prob = clf.predict_proba(X_test)
        #print(y_test)
        # cumulate the actual and predicted labels, and the probabilities over the different folds.  then, move the index
        actual[i: i + y_test.shape[0]] = y_test
        predicted[i: i + y_test.shape[0]] = pred
        i += y_test.shape[0]


    # calculate the performance metrics on the whole set of predictions (5 folds all together)
    f1, accuracy, flip_err, mae = calculate_metrics(actual, predicted)
    print('f1:', f1, 'accuracy:', accuracy, 'flip_err:', flip_err, 'mae:', mae)

In [10]:
# TODO normalize labels
from train import label2int

total_df['fact'] = total_df['fact'].map(label2int['fact'])
total_df['bias'] = total_df['bias'].map(label2int['bias'])

total_df.head()

Unnamed: 0,source_url_normalized,daily_time_on_site,total_sites_linking_in,bounce_rate,bias,fact
0,villagevoice.com,125.0,13134.0,0.659,0,2
1,insideclimatenews.org,121.0,1017.0,0.797,1,2
2,fury.news,159.245259,35.0,0.685276,2,0
3,now8news.com,159.245259,90.0,0.685276,1,0
4,constitution.com,93.0,200.0,0.685276,2,0


# Train label propagation on splits - fact

In [11]:
train_model(splits, total_df, "fact")

0.21962616822429906
0.22065727699530516
0.20754716981132076
0.20853080568720378
0.26066350710900477
f1: 17.23714445180206 accuracy: 22.33741753063148 flip_err: 49.01036757775684 mae: 1.2667295004712535


# Train label propagation on splits - bias

In [12]:
train_model(splits, total_df, "bias")

0.22897196261682243
0.215962441314554
0.21226415094339623
0.24644549763033174
0.25118483412322273
f1: 23.110276845067023 accuracy: 23.091423185673893 flip_err: 23.185673892554195 mae: 1.000942507068803


In [13]:
labels = total_df[['fact', 'bias']]

In [14]:
features = total_df.drop(['fact', 'bias', 'source_url_normalized'], axis=1)

In [15]:
features.head()

Unnamed: 0,daily_time_on_site,total_sites_linking_in,bounce_rate
0,125.0,13134.0,0.659
1,121.0,1017.0,0.797
2,159.245259,35.0,0.685276
3,159.245259,90.0,0.685276
4,93.0,200.0,0.685276


# Label propagation on fact

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

features_train, features_test, labels_train, labels_test = train_test_split(features, labels['fact'], test_size=0.20)

# initialize
clf = LabelPropagation()

# train the classifier using the training data
clf.fit(features_train, labels_train)

# compute accuracy using test data
acc_test = clf.score(features_test, labels_test)

print ("Test Accuracy:", acc_test)
# Test Accuracy: 0.98

Test Accuracy: 0.19718309859154928


# Label propagation on bias

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

features_train, features_test, labels_train, labels_test = train_test_split(features, labels['bias'], test_size=0.20)

# initialize
clf = LabelPropagation()

# train the classifier using the training data
clf.fit(features_train, labels_train)

# compute accuracy using test data
acc_test = clf.score(features_test, labels_test)

print ("Test Accuracy:", acc_test)
# Test Accuracy: 0.98

Test Accuracy: 0.23943661971830985
