# NELA Article Collection

Created: 2019.10.8  
Notebook sequence: 5

Collects the code that builds the set of articles used for each indicator set.

---

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import util
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [4]:
def _spread_unbalanced(available, count):
    """Split `count` across sources as evenly as their article counts allow.

    Args:
        available: dict mapping source name -> number of articles it has.
        count: total number of articles wanted.

    Returns:
        dict mapping source name -> number of articles to draw. The total is
        `count`, capped at the number of articles available overall.
    """
    quotas = {}
    remaining = int(min(count, sum(available.values())))
    # Smallest sources first: whatever they cannot supply rolls over to the larger ones.
    by_size = sorted(available, key=available.get)
    for position, name in enumerate(by_size):
        sources_left = len(by_size) - position
        fair_share = -(-remaining // sources_left)  # ceiling division
        quotas[name] = min(available[name], fair_share)
        remaining -= quotas[name]
    return quotas


def random_balanced_sample(source_name_array, count, reject_minimum=100, force_balance=True, random_seed=13, verbose=True):
    """Draw a random sample of articles spread evenly over the given sources.

    Each source is loaded with `util.nela_load_articles_from_source`; sources
    with fewer than `reject_minimum` articles are dropped. The per-source
    samples are combined with `util.stack_dfs`.

    Args:
        source_name_array: iterable of source names to load.
        count: total number of articles requested (integer).
        reject_minimum: sources with fewer articles than this are rejected.
        force_balance: only matters when `count` exceeds the largest possible
            balanced sample (smallest accepted source size * number of accepted
            sources). If True, every source contributes as many articles as the
            smallest source has, so fewer than `count` rows are returned. If
            False, sources contribute as evenly as their sizes allow, up to
            `count` rows in total (unbalanced).
        random_seed: `random_state` passed to every per-source `.sample` call.
        verbose: print progress information and show progress bars.

    Returns:
        Tuple `(sample_df, returned_counts, rejected)`:
        sample_df is the stacked sample, or None when no source was accepted;
        returned_counts maps each accepted source to the rows sampled from it;
        rejected lists the names of the sources that were dropped.
    """
    frames = {}
    rejected = []

    for name in tqdm(source_name_array, "Querying sources", disable=(not verbose)):
        if name in frames:
            # A repeated name would otherwise put the same articles in the pool twice.
            continue

        local_df = util.nela_load_articles_from_source(name)
        local_count = local_df.shape[0]
        if verbose: print(name, local_count, "articles")

        if local_count < reject_minimum:
            rejected.append(name)
            if verbose: print(name, "rejected")
            continue

        frames[name] = local_df

    if not frames:
        # Nothing to sample from; also avoids dividing by zero below.
        if verbose: print("No sources met the minimum article count")
        return None, {}, rejected

    counts = {name: frame.shape[0] for name, frame in frames.items()}
    minimum_count = min(counts.values())
    max_possible_balanced = minimum_count * len(counts)
    if verbose: print(max_possible_balanced, "maximum possible balanced sample size")

    # Decide how many rows each source contributes.
    if count > max_possible_balanced:
        if verbose: print("Grabbing maximum")
        if force_balance:
            if verbose: print("Force balance requested, will not return", count, "as requested")
            quotas = dict.fromkeys(counts, minimum_count)
        else:
            print("WARNING: unbalanced output")
            quotas = _spread_unbalanced(counts, count)
    else:
        total_per = int(count // len(counts))
        remainder = count % len(counts)
        if verbose: print("Grabbing", total_per, "per source")
        # The first `remainder` sources take one extra row so the total equals `count`.
        quotas = {
            name: total_per + (1 if position < remainder else 0)
            for position, name in enumerate(counts)
        }

    sample_df = None
    returned_counts = {}
    for name in tqdm(counts.keys(), "Sampling", disable=(not verbose)):
        source_sample_df = frames[name].sample(quotas[name], random_state=random_seed)
        returned_counts[name] = source_sample_df.shape[0]
        sample_df = util.stack_dfs(sample_df, source_sample_df)

    return sample_df, returned_counts, rejected
    

In [5]:
# Source-name lists for the reliability indicator sets (see notebook 2).
# Prefixes match the section headings used further down in this notebook:
# os_ = "OS", mbfc_ = "MBFC", ng_ = "NewsGuard".
# NOTE(review): the indicator sets disagree with each other, e.g. 'Alternet' is in
# os_reliable and ng_reliable but also in mbfc_unreliable, so a list is only
# meaningful within its own indicator set.
os_unreliable = ['Addicting Info', 'Breitbart', 'CNS News', 'Intellihub', 'LewRockwell', 'NODISINFO', 'Politicus USA', 'Shareblue', 'The Duran', 'The Gateway Pundit', 'The Political Insider', 'The Washington Examiner', 'TheAntiMedia', 'True Activist', 'Veterans Today']
os_reliable = ['Alternet', 'Fusion', 'oann']

mbfc_unreliable = ['21stCenturyWire', 'Activist Post', 'Addicting Info', 'Alternet', 'Bearing Arms', 'Birmingham Mail', 'Buzzfeed', 'CNN', 'Counter Current News', 'Crooks and Liars', 'Daily Kos', 'Daily Signal', 'Drudge Report', 'Feministing Blog', 'Fox News', 'Fusion', 'GlobalResearch', 'HumansAreFree', 'Infowars', 'Intellihub', 'Investors Business Daily', 'Live Action', 'MSNBC', 'NODISINFO', 'National Review', 'Natural News', 'New York Daily News', 'New York Post', 'Newsweek', 'Palmer Report', 'Pravada Report', 'Prison Planet', 'Raw Story', 'RedState', 'RightWingWatch', 'Russia-Insider', 'Shareblue', 'Sputnik', 'Telesur TV', 'The Daily Caller', 'The Daily Express', 'The Daily Mirror', 'The Daily Record', 'The Daily Star', 'The Political Insider', 'The Right Scoop', 'The Sun', 'TheAntiMedia', 'TheBlaze', 'ThinkProgress', 'True Pundit', 'Veterans Today', 'Waking Times', 'Western Journal', 'Yahoo News', 'sott.net']
mbfc_reliable = ['ABC News', 'Al Jazeera', 'BBC', 'Business Insider', 'CBS News', 'CNBC', 'Chicago Sun-Times', 'Daily Beast', 'Democracy 21', 'Evening Standard', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Forward Progessives', 'France24', 'Hot Air', 'Interpreter Mag', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'New Yorker', 'PBS', 'Pink News UK', 'Politico', 'Real Clear Politics', 'Reuters', 'Salon', 'Shadow Proof', 'SkyNewsPolitics', 'SkyNewsUS', 'Slate', 'Spiegel', 'Talking Points Memo', 'Tass', 'The American Conservative', 'The Atlantic', 'The Denver Post', 'The Fiscal Times', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The Irish Times', 'The Moscow Times', 'The New York Times', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'USA Today', 'Vox', 'Washington Monthly', 'Washington Post', 'Wings Over Scotland', 'iPolitics']

ng_unreliable = ['Al Jazeera', 'Bipartisan Report', 'Breitbart', 'Daily Kos', 'Daily Mail', 'Drudge Report', 'FrontPage Magazine', 'Infowars', 'Instapundit', 'Live Action', 'Natural News', 'Palmer Report', 'Pamela Geller Report', 'RT', 'Shareblue', 'Sputnik', 'The Conservative Tree House', 'The Duran', 'The Gateway Pundit', 'The Political Insider', 'The Right Scoop', 'TheAntiMedia', 'TheBlaze', 'True Pundit', 'Western Journal']
ng_reliable = ['ABC News', 'Alternet', 'BBC', 'Bearing Arms', 'Business Insider', 'Buzzfeed', 'CBS News', 'CNBC', 'CNN', 'CNS News', 'Chicago Sun-Times', 'Crooks and Liars', 'Daily Beast', 'Daily Signal', 'FT Westminster Blog', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Fox News', 'Investors Business Daily', 'MSNBC', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'News Busters', 'Newsweek', 'Observer', 'PBS', 'Politico', 'Politicus USA', 'Raw Story', 'Real Clear Politics', 'Reuters', 'Salon', 'Slate', 'Talking Points Memo', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Denver Post', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The New York Times', 'The Verge', 'The Washington Examiner', 'ThinkProgress', 'USA Today', 'Vox', 'WSJ Washington Wire', 'Washington Monthly', 'Washington Post', 'Yahoo News']

In [5]:
# Source-name lists for the bias indicator sets.
# mbfc_ matches the "MBFC" heading used in this notebook.
# NOTE(review): as_ is presumably AllSides - TODO confirm against notebook 2.
# The two classes are heavily imbalanced in number of sources (mbfc_unbiased has only 3).
mbfc_biased = ['ABC News', 'Addicting Info', 'Al Jazeera', 'Alternet', 'BBC', 'Bearing Arms', 'Birmingham Mail', 'Business Insider', 'Buzzfeed', 'CBS News', 'CNBC', 'CNN', 'Chicago Sun-Times', 'Crooks and Liars', 'Daily Beast', 'Daily Kos', 'Daily Signal', 'Democracy 21', 'Drudge Report', 'Evening Standard', 'Feministing Blog', 'FiveThirtyEight', 'Fortune', 'Forward Progessives', 'Fox News', 'France24', 'Fusion', 'Hot Air', 'Interpreter Mag', 'Investors Business Daily', 'MSNBC', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'Newsweek', 'PBS', 'Palmer Report', 'Pink News UK', 'Pravada Report', 'Raw Story', 'Real Clear Politics', 'RedState', 'RightWingWatch', 'Russia-Insider', 'Salon', 'Shadow Proof', 'Shareblue', 'SkyNewsPolitics', 'SkyNewsUS', 'Slate', 'Spiegel', 'Sputnik', 'Talking Points Memo', 'Tass', 'Telesur TV', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Daily Express', 'The Daily Mirror', 'The Daily Record', 'The Denver Post', 'The Fiscal Times', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The Irish Times', 'The Moscow Times', 'The New York Times', 'The Political Insider', 'The Right Scoop', 'The Sun', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'TheBlaze', 'ThinkProgress', 'USA Today', 'Vox', 'Washington Monthly', 'Washington Post', 'Western Journal', 'Wings Over Scotland', 'Yahoo News', 'iPolitics']
mbfc_unbiased = ['Foreign Policy', 'Politico', 'Reuters']

as_biased = ['ABC News', 'Alternet', 'Breitbart', 'Buzzfeed', 'CBS News', 'CNN', 'CNS News', 'Chicago Sun-Times', 'Daily Beast', 'Daily Kos', 'Daily Mail', 'Daily Signal', 'Drudge Report', 'Fox News', 'FrontPage Magazine', 'Hot Air', 'Infowars', 'Intellectual Conservative', 'Investors Business Daily', 'Live Action', 'MSNBC', 'Media Matters for America', 'MotherJones', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'Newsweek', 'Politics UK', 'Politicus USA', 'Raw Story', 'RedState', 'RightWingWatch', 'Salon', 'Slate', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Fiscal Times', 'The Gateway Pundit', 'The Guardian', 'The Huffington Post', 'The Intercept', 'The Michelle Malkin Blog', 'The New York Times', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'ThinkProgress', 'Vox', 'Washington Monthly', 'Washington Post', 'Western Journal', 'Yahoo News']
as_unbiased = ['Al Jazeera', 'BBC', 'Business Insider', 'CNBC', 'FiveThirtyEight', 'NPR', 'PBS', 'Real Clear Politics', 'Reuters', 'The Hill', 'USA Today']

In [6]:
# Trial run of the sampler on the OS "unreliable" sources (default reject_minimum).
df, counts, rejected = random_balanced_sample(
    source_name_array=os_unreliable,
    count=4500,
)

HBox(children=(IntProgress(value=0, description='Querying sources', max=15, style=ProgressStyle(description_wi…

Addicting Info 429 articles
Breitbart 1877 articles
CNS News 5263 articles
Intellihub 334 articles
LewRockwell 1278 articles
NODISINFO 29 articles
NODISINFO rejected
Politicus USA 4018 articles
Shareblue 2134 articles
The Duran 959 articles
The Gateway Pundit 5667 articles
The Political Insider 2680 articles
The Washington Examiner 469 articles
TheAntiMedia 666 articles
True Activist 370 articles
Veterans Today 2624 articles

4676 maximum possible balanced sample size
Grabbing 321 per source


HBox(children=(IntProgress(value=0, description='Sampling', max=14, style=ProgressStyle(description_width='ini…




In [7]:
# Per-source sample sizes returned by the trial call to random_balanced_sample.
counts

{'Addicting Info': 322,
 'Breitbart': 322,
 'CNS News': 322,
 'Intellihub': 322,
 'LewRockwell': 322,
 'Politicus USA': 322,
 'Shareblue': 321,
 'The Duran': 321,
 'The Gateway Pundit': 321,
 'The Political Insider': 321,
 'The Washington Examiner': 321,
 'TheAntiMedia': 321,
 'True Activist': 321,
 'Veterans Today': 321}

# OS Reliability

In [8]:
# Show both OS source lists, one per line.
print(os_reliable, os_unreliable, sep="\n")

['Alternet', 'Fusion', 'oann']
['Addicting Info', 'Breitbart', 'CNS News', 'Intellihub', 'LewRockwell', 'NODISINFO', 'Politicus USA', 'Shareblue', 'The Duran', 'The Gateway Pundit', 'The Political Insider', 'The Washington Examiner', 'TheAntiMedia', 'True Activist', 'Veterans Today']


In [16]:
# Balanced sample of the OS "unreliable" sources; sources under 300 articles are dropped.
df_os_unreliable, os_unreliable_counts, os_unreliable_rejected = random_balanced_sample(
    source_name_array=os_unreliable,
    count=5000,
    reject_minimum=300,
)

HBox(children=(IntProgress(value=0, description='Querying sources', max=15, style=ProgressStyle(description_wi…

Addicting Info 429 articles
Breitbart 1877 articles
CNS News 5263 articles
Intellihub 334 articles
LewRockwell 1278 articles
NODISINFO 29 articles
NODISINFO rejected
Politicus USA 4018 articles
Shareblue 2134 articles
The Duran 959 articles
The Gateway Pundit 5667 articles
The Political Insider 2680 articles
The Washington Examiner 469 articles
TheAntiMedia 666 articles
True Activist 370 articles
Veterans Today 2624 articles

4676 maximum possible balanced sample size
Grabbing maximum
Force balance requested, will not return 5000 as requested


HBox(children=(IntProgress(value=0, description='Sampling', max=14, style=ProgressStyle(description_width='ini…




In [17]:
# Request exactly as many reliable articles as the unreliable sample produced,
# instead of hard-coding the number copied from that cell's printed output.
df_os_reliable, os_reliable_counts, os_reliable_rejected = random_balanced_sample(
    os_reliable,
    count=df_os_unreliable.shape[0],
    reject_minimum=300,
)

HBox(children=(IntProgress(value=0, description='Querying sources', max=3, style=ProgressStyle(description_wid…

Alternet 4816 articles
Fusion 141 articles
Fusion rejected
oann 14267 articles

9632 maximum possible balanced sample size
Grabbing 2338 per source


HBox(children=(IntProgress(value=0, description='Sampling', max=2, style=ProgressStyle(description_width='init…




In [19]:
# Attach the binary target before stacking: 0 = unreliable source, 1 = reliable source.
# (Both frames were previously labelled 0, which made the target column constant.)
df_os_unreliable["reliable"] = 0
df_os_reliable["reliable"] = 1

df_os_reliability = util.stack_dfs(df_os_unreliable, df_os_reliable)

# MBFC Reliability

In [21]:
# Show both MBFC reliability source lists, one per line.
print(mbfc_reliable, mbfc_unreliable, sep="\n")

['ABC News', 'Al Jazeera', 'BBC', 'Business Insider', 'CBS News', 'CNBC', 'Chicago Sun-Times', 'Daily Beast', 'Democracy 21', 'Evening Standard', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Forward Progessives', 'France24', 'Hot Air', 'Interpreter Mag', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'New Yorker', 'PBS', 'Pink News UK', 'Politico', 'Real Clear Politics', 'Reuters', 'Salon', 'Shadow Proof', 'SkyNewsPolitics', 'SkyNewsUS', 'Slate', 'Spiegel', 'Talking Points Memo', 'Tass', 'The American Conservative', 'The Atlantic', 'The Denver Post', 'The Fiscal Times', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The Irish Times', 'The Moscow Times', 'The New York Times', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'USA Today', 'Vox', 'Washington Monthly', 'Washington Post', 'Wings Over Scotland', 'iPolitics']
['21stCenturyWire', 'Activist Post', 'Addicting Info', 'Alternet', 'Bearing Arms', 'Birmingham Ma

In [22]:
# Balanced sample of the MBFC "unreliable" sources; sources under 300 articles are dropped.
df_mbfc_unreliable, mbfc_unreliable_counts, mbfc_unreliable_rejected = random_balanced_sample(
    source_name_array=mbfc_unreliable,
    count=5000,
    reject_minimum=300,
)

HBox(children=(IntProgress(value=0, description='Querying sources', max=56, style=ProgressStyle(description_wi…

21stCenturyWire 322 articles
Activist Post 1797 articles
Addicting Info 429 articles
Alternet 4816 articles
Bearing Arms 1193 articles
Birmingham Mail 9243 articles
Buzzfeed 1661 articles
CNN 8202 articles
Counter Current News 23 articles
Counter Current News rejected
Crooks and Liars 2465 articles
Daily Kos 994 articles
Daily Signal 310 articles
Drudge Report 18885 articles
Feministing Blog 23 articles
Feministing Blog rejected
Fox News 3106 articles
Fusion 141 articles
Fusion rejected
GlobalResearch 30 articles
GlobalResearch rejected
HumansAreFree 426 articles
Infowars 2518 articles
Intellihub 334 articles
Investors Business Daily 730 articles
Live Action 1054 articles
MSNBC 6604 articles
NODISINFO 29 articles
NODISINFO rejected
National Review 5129 articles
Natural News 4187 articles
New York Daily News 2042 articles
New York Post 25407 articles
Newsweek 9411 articles
Palmer Report 3539 articles
Pravada Report 601 articles
Prison Planet 2253 articles
Raw Story 3719 articles
RedStat

HBox(children=(IntProgress(value=0, description='Sampling', max=50, style=ProgressStyle(description_width='ini…




In [23]:
# Balanced sample of the MBFC "reliable" sources; sources under 300 articles are dropped.
df_mbfc_reliable, mbfc_reliable_counts, mbfc_reliable_rejected = random_balanced_sample(
    source_name_array=mbfc_reliable,
    count=5000,
    reject_minimum=300,
)

HBox(children=(IntProgress(value=0, description='Querying sources', max=56, style=ProgressStyle(description_wi…

ABC News 2808 articles
Al Jazeera 4522 articles
BBC 16416 articles
Business Insider 445 articles
CBS News 5397 articles
CNBC 2426 articles
Chicago Sun-Times 2113 articles
Daily Beast 6634 articles
Democracy 21 24 articles
Democracy 21 rejected
Evening Standard 17638 articles
FiveThirtyEight 556 articles
Foreign Policy 702 articles
Fortune 7630 articles
Forward Progessives 142 articles
Forward Progessives rejected
France24 1732 articles
Hot Air 4642 articles
Interpreter Mag 28 articles
Interpreter Mag rejected
Media Matters for America 2316 articles
Mercury News 4828 articles
MotherJones 1128 articles
NPR 5515 articles
New Yorker 265 articles
New Yorker rejected
PBS 1113 articles
Pink News UK 1645 articles
Politico 629 articles
Real Clear Politics 7247 articles
Reuters 3929 articles
Salon 1702 articles
Shadow Proof 260 articles
Shadow Proof rejected
SkyNewsPolitics 826 articles
SkyNewsUS 995 articles
Slate 514 articles
Spiegel 4171 articles
Talking Points Memo 5846 articles
Tass 6160 ar

HBox(children=(IntProgress(value=0, description='Sampling', max=50, style=ProgressStyle(description_width='ini…




In [24]:
# Attach the binary target before stacking: 0 = unreliable source, 1 = reliable source.
# (Both frames were previously labelled 0, which made the target column constant.)
df_mbfc_unreliable["reliable"] = 0
df_mbfc_reliable["reliable"] = 1

df_mbfc_reliability = util.stack_dfs(df_mbfc_unreliable, df_mbfc_reliable)

# NewsGuard Reliability

In [6]:
# Show both NewsGuard source lists, one per line.
print(ng_reliable, ng_unreliable, sep="\n")

['ABC News', 'Alternet', 'BBC', 'Bearing Arms', 'Business Insider', 'Buzzfeed', 'CBS News', 'CNBC', 'CNN', 'CNS News', 'Chicago Sun-Times', 'Crooks and Liars', 'Daily Beast', 'Daily Signal', 'FT Westminster Blog', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Fox News', 'Investors Business Daily', 'MSNBC', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'News Busters', 'Newsweek', 'Observer', 'PBS', 'Politico', 'Politicus USA', 'Raw Story', 'Real Clear Politics', 'Reuters', 'Salon', 'Slate', 'Talking Points Memo', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Denver Post', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The New York Times', 'The Verge', 'The Washington Examiner', 'ThinkProgress', 'USA Today', 'Vox', 'WSJ Washington Wire', 'Washington Monthly', 'Washington Post', 'Yahoo News']
['Al Jazeera', 'Bipartisan Report', '

In [8]:
# NOTE(review): random_balanced_sample returns a 3-tuple (sample, counts, rejected),
# so `df` holds the whole tuple here until a later cell replaces it with `df[0]`.
df = random_balanced_sample(ng_unreliable, count=5000, reject_minimum=300)

HBox(children=(IntProgress(value=0, description='Querying sources', max=25, style=ProgressStyle(description_wi…

Al Jazeera 4522 articles
Bipartisan Report 4060 articles
Breitbart 1877 articles
Daily Kos 994 articles
Daily Mail 3596 articles
Drudge Report 18885 articles
FrontPage Magazine 892 articles
Infowars 2518 articles
Instapundit 15584 articles
Live Action 1054 articles
Natural News 4187 articles
Palmer Report 3539 articles
Pamela Geller Report 410 articles
RT 4286 articles
Shareblue 2134 articles
Sputnik 30372 articles
The Conservative Tree House 2120 articles
The Duran 959 articles
The Gateway Pundit 5667 articles
The Political Insider 2680 articles
The Right Scoop 2697 articles
TheAntiMedia 666 articles
TheBlaze 5287 articles
True Pundit 13660 articles
Western Journal 4729 articles

10250 maximum possible balanced sample size
Grabbing 200 per source


HBox(children=(IntProgress(value=0, description='Sampling', max=25, style=ProgressStyle(description_width='ini…




In [11]:
# Keep only the sampled DataFrame from the (sample, counts, rejected) tuple.
# Guarded so that re-running this cell does not index the DataFrame itself with 0.
if isinstance(df, tuple):
    df = df[0]

In [15]:
# Rows of one source whose content is not null.
# NOTE(review): empty-string content is not null, so such rows still pass this filter.
df.loc[df["source"].eq("Pamela Geller Report") & df["content"].notna()]

Unnamed: 0,date,source,name,content
2400,2018-03-20,Pamela Geller Report,Christian Widow Bereaved Father Show Reality B...,Armed herdsmen have been attacking Christians ...
2401,2018-09-21,Pamela Geller Report,Juanita Broaddrick Sen Feinstein Had No Intere...,"Juanita Broaddrick, who accused Bill Clinton o..."
2402,2018-03-28,Pamela Geller Report,Westminster jihad attack Details of Islamic ma...,The jihadis cite quran chapter and verse in th...
2403,2018-10-12,Pamela Geller Report,WATCH BDS neo-Nazis disrupt Holocaust film scr...,BDS is the 21st-century version of Kristallnac...
2404,2018-08-14,Pamela Geller Report,UK MP needs police security for saying Britain...,Note that MP Sarah Champion didnt even refer t...
...,...,...,...,...
2595,2018-08-08,Pamela Geller Report,Netherlands New Muslim building owner evicts J...,
2596,2018-10-30,Pamela Geller Report,HORRIBLE Muslim student punches disabled senio...,The tragedy led to the victims 86-year old mot...
2597,2018-04-02,Pamela Geller Report,Canada Muslims seek to erect Muslim housing co...,We just dont want density in the neighbourhood...
2598,2018-09-13,Pamela Geller Report,Boston police captains son convert to Islam ge...,"A police captains son. In Boston, the site of ..."


In [25]:
# Content of the first article whose title starts with "China defends".
df.loc[df["name"].str.match("China defends"), "content"].iloc[0]

''

In [3]:
# Spot-check the raw loader output for a single source.
util.nela_load_articles_from_source("MSNBC")

Unnamed: 0,date,source,name,content
0,2018-03-21,MSNBC,Wednesdays Mini-Report 32118,* Austin: Exotic batteries ordered online hel...
1,2018-03-22,MSNBC,After more than two decades Congress clarifies...,"The day after the mass shooting in Parkland, F..."
2,2018-03-22,MSNBC,Biden Trump and the kind of rhetoric,Joe Biden has made clear on many occasions tha...
3,2018-03-22,MSNBC,FBI authorized perjury investigation into,"As a rule, we tend to think of U.S. attorneys ..."
4,2018-03-22,MSNBC,Thursdays Campaign Round-Up 32218,Todays installment of campaign-related news it...
...,...,...,...,...
6599,2018-11-30,MSNBC,Trump cabinet secretary targets House Dem,"Donald Trumps cabinet may be a rogues gallery,..."
6600,2018-11-30,MSNBC,Trump cancels Putin meeting after Cohen admits...,From Russia with Love: Trump sought business i...
6601,2018-11-30,MSNBC,Trump tweets he lightly looked at real estate ...,Breaking down the week of Trump blockbusters
6602,2018-11-30,MSNBC,What does NYT report on TM Landry school say a...,# What does NYT report on TM Landry school say...
