# NELA Source Aggregation

Created: 2019.10.1  
Notebook sequence: 2

This notebook determines which indicator label sets have values for which sources, and counts how many articles are available for each labelled source.

---

In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import util
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [18]:
labels_df = util.nela_load_labels()
labels_df.columns

Index(['Source', 'NewsGuard, Does not repeatedly publish false content',
       'NewsGuard, Gathers and presents information responsibly',
       'NewsGuard, Regularly corrects or clarifies errors',
       'NewsGuard, Handles the difference between news and opinion responsibly',
       'NewsGuard, Avoids deceptive headlines',
       'NewsGuard, Website discloses ownership and financing',
       'NewsGuard, Clearly labels advertising',
       'NewsGuard, Reveals who's in charge, including any possible conflicts of interest',
       'NewsGuard, Provides information about content creators',
       'NewsGuard, score', 'NewsGuard, overall_class',
       'Pew Research Center, known_by_40%', 'Pew Research Center, total',
       'Pew Research Center, consistently_liberal',
       'Pew Research Center, mostly_liberal', 'Pew Research Center, mixed',
       'Pew Research Center, mostly conservative',
       'Pew Research Center, consistently conservative', 'Wikipedia, is_fake',
       'Open Sourc

In [19]:
def add_counts_to_df(df):
    """Return a copy of ``df`` with a ``counts`` column of per-source article counts.

    For every value in ``df["Source"]`` this calls
    ``util.nela_count_articles_from_source`` and stores the top-left cell
    (``.iloc[0, 0]``) of the frame it returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Source`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus ``counts``
        (an existing ``counts`` column is overwritten in the result).
        ``df`` itself is left untouched, so a slice of another frame can be
        passed in without being mutated and without raising
        ``SettingWithCopyWarning``.
    """
    counts = [
        util.nela_count_articles_from_source(source).iloc[0, 0]
        for source in tqdm(df["Source"], "Counting articles for each source")
    ]
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(counts=counts)

# Reliable news

Based on notebook 1, the following label sets will be used (each listed as unreliable | reliable):


    Open Sources, unreliable | Open Sources, reliable
    Media Bias / Fact Check, factual_reporting 1-3 | 4-5 (PROBLEM: justify split)
    NewsGuard, overall_class 0.0 | 1.0


## Open Sources

In [20]:
os_unreliable_lbl = "Open Sources, unreliable"
os_reliable_lbl = "Open Sources, reliable"

In [21]:
labels_df[os_unreliable_lbl].value_counts()

2.0    9
1.0    4
3.0    2
Name: Open Sources, unreliable, dtype: int64

In [22]:
# Keep only the sources that have a non-null Open Sources "unreliable" value.
# .loc selects rows and columns in one step (no chained indexing), and .copy()
# yields an independent frame that can be modified without
# SettingWithCopyWarning.
os_unreliable = labels_df.loc[
    labels_df[os_unreliable_lbl].notnull(), [os_unreliable_lbl, "Source"]
].copy()
os_unreliable

Unnamed: 0,"Open Sources, unreliable",Source
4,2.0,Addicting Info
13,2.0,Breitbart
19,1.0,CNS News
65,2.0,Intellihub
70,2.0,LewRockwell
80,2.0,NODISINFO
102,2.0,Politicus USA
118,1.0,Shareblue
143,1.0,The Duran
145,3.0,The Gateway Pundit


In [23]:
os_unreliable = add_counts_to_df(os_unreliable)
os_unreliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=15, style=ProgressSty…




Unnamed: 0,"Open Sources, unreliable",Source,counts
4,2.0,Addicting Info,429
13,2.0,Breitbart,1877
19,1.0,CNS News,5263
65,2.0,Intellihub,334
70,2.0,LewRockwell,1278
80,2.0,NODISINFO,29
102,2.0,Politicus USA,4018
118,1.0,Shareblue,2134
143,1.0,The Duran,959
145,3.0,The Gateway Pundit,5667


In [37]:
os_unreliable.counts.sum()

28797

In [24]:
labels_df[os_reliable_lbl].value_counts()

2.0    3
Name: Open Sources, reliable, dtype: int64

In [25]:
# Keep only the sources that have a non-null Open Sources "reliable" value.
# .loc selects rows and columns in one step (no chained indexing), and .copy()
# yields an independent frame that can be modified without
# SettingWithCopyWarning.
os_reliable = labels_df.loc[
    labels_df[os_reliable_lbl].notnull(), [os_reliable_lbl, "Source"]
].copy()
os_reliable

Unnamed: 0,"Open Sources, reliable",Source
6,2.0,Alternet
53,2.0,Fusion
190,2.0,oann


In [26]:
os_reliable = add_counts_to_df(os_reliable)
os_reliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=3, style=ProgressStyl…




Unnamed: 0,"Open Sources, reliable",Source,counts
6,2.0,Alternet,4816
53,2.0,Fusion,141
190,2.0,oann,14267


In [38]:
os_reliable.counts.sum()

19224

## Media Bias / Fact Check

In [27]:
mbfc_lbl = "Media Bias / Fact Check, factual_reporting"

In [28]:
labels_df[mbfc_lbl].value_counts()

4.0    54
3.0    48
2.0     7
5.0     2
1.0     1
Name: Media Bias / Fact Check, factual_reporting, dtype: int64

In [29]:
# Preview the sources whose MBFC factual_reporting value is 3 or lower
# (the "unreliable" side of the split). Rows with a missing value compare as
# False and are dropped. Single-step .loc plus .copy() replaces chained
# indexing and gives an independent frame.
mbfc_unreliable = labels_df.loc[
    labels_df[mbfc_lbl] <= 3, [mbfc_lbl, "Source"]
].copy()
mbfc_unreliable

Unnamed: 0,"Media Bias / Fact Check, factual_reporting",Source
0,3.0,21stCenturyWire
3,2.0,Activist Post
4,3.0,Addicting Info
6,3.0,Alternet
10,3.0,Bearing Arms
12,3.0,Birmingham Mail
15,3.0,Buzzfeed
18,3.0,CNN
25,3.0,Counter Current News
27,3.0,Crooks and Liars


In [30]:
# Sources whose MBFC factual_reporting value is above 3 (the "reliable" side
# of the split). Select with a single .loc call and take an explicit copy so
# the frame handed to add_counts_to_df is independent of labels_df.
mbfc_reliable_sources = labels_df.loc[
    labels_df[mbfc_lbl] > 3, [mbfc_lbl, "Source"]
].copy()
mbfc_reliable = add_counts_to_df(mbfc_reliable_sources)
mbfc_reliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=56, style=ProgressSty…




Unnamed: 0,"Media Bias / Fact Check, factual_reporting",Source,counts
1,4.0,ABC News,2808
5,4.0,Al Jazeera,4522
8,4.0,BBC,16416
14,4.0,Business Insider,445
16,4.0,CBS News,5397
17,4.0,CNBC,2426
21,4.0,Chicago Sun-Times,2113
29,4.0,Daily Beast,6634
35,4.0,Democracy 21,24
38,4.0,Evening Standard,17638


In [39]:
mbfc_reliable.counts.sum()

227997

In [31]:
# Sources whose MBFC factual_reporting value is 3 or lower (the "unreliable"
# side of the split). Select with a single .loc call and take an explicit copy
# so the frame handed to add_counts_to_df is independent of labels_df.
mbfc_unreliable_sources = labels_df.loc[
    labels_df[mbfc_lbl] <= 3, [mbfc_lbl, "Source"]
].copy()
mbfc_unreliable = add_counts_to_df(mbfc_unreliable_sources)
mbfc_unreliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=56, style=ProgressSty…




Unnamed: 0,"Media Bias / Fact Check, factual_reporting",Source,counts
0,3.0,21stCenturyWire,322
3,2.0,Activist Post,1797
4,3.0,Addicting Info,429
6,3.0,Alternet,4816
10,3.0,Bearing Arms,1193
12,3.0,Birmingham Mail,9243
15,3.0,Buzzfeed,1661
18,3.0,CNN,8202
25,3.0,Counter Current News,23
27,3.0,Crooks and Liars,2465


In [40]:
mbfc_unreliable.counts.sum()

287689

## NewsGuard

In [32]:
ng_lbl = "NewsGuard, overall_class"

In [33]:
labels_df[ng_lbl].value_counts()

1.0    60
0.0    25
Name: NewsGuard, overall_class, dtype: int64

In [34]:
# Sources with NewsGuard overall_class == 1.0 (used here as the reliable
# group). Select with a single .loc call and take an explicit copy so the
# frame handed to add_counts_to_df is independent of labels_df.
ng_reliable_sources = labels_df.loc[
    labels_df[ng_lbl] == 1.0, [ng_lbl, "Source"]
].copy()
ng_reliable = add_counts_to_df(ng_reliable_sources)
ng_reliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=60, style=ProgressSty…




Unnamed: 0,"NewsGuard, overall_class",Source,counts
1,1.0,ABC News,2808
6,1.0,Alternet,4816
8,1.0,BBC,16416
10,1.0,Bearing Arms,1193
14,1.0,Business Insider,445
15,1.0,Buzzfeed,1661
16,1.0,CBS News,5397
17,1.0,CNBC,2426
18,1.0,CNN,8202
19,1.0,CNS News,5263


In [41]:
ng_reliable.counts.sum()

247605

In [35]:
# Sources with NewsGuard overall_class == 0.0 (used here as the unreliable
# group). Select with a single .loc call and take an explicit copy so the
# frame handed to add_counts_to_df is independent of labels_df.
ng_unreliable_sources = labels_df.loc[
    labels_df[ng_lbl] == 0.0, [ng_lbl, "Source"]
].copy()
ng_unreliable = add_counts_to_df(ng_unreliable_sources)
ng_unreliable

HBox(children=(IntProgress(value=0, description='Counting articles for each source', max=25, style=ProgressSty…




Unnamed: 0,"NewsGuard, overall_class",Source,counts
5,0.0,Al Jazeera,4522
11,0.0,Bipartisan Report,4060
13,0.0,Breitbart,1877
30,0.0,Daily Kos,994
31,0.0,Daily Mail,3596
37,0.0,Drudge Report,18885
52,0.0,FrontPage Magazine,892
62,0.0,Infowars,2518
63,0.0,Instapundit,15584
72,0.0,Live Action,1054


In [42]:
ng_unreliable.counts.sum()

137375