# NELA Article Collection

Created: 2019.10.8  
Notebook sequence: 5

Collects the code used to build the set of articles for each indicator set.

---

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import util
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [20]:
def _even_quotas(names, count):
    """Split ``count`` across ``names`` as evenly as possible.

    Every name gets ``count // len(names)``; the first ``count % len(names)``
    names (in the given order) get one extra so the quotas sum to ``count``.
    """
    per_source, extra = divmod(count, len(names))
    return {
        name: per_source + (1 if position < extra else 0)
        for position, name in enumerate(names)
    }


def _capped_quotas(available, count):
    """Share ``count`` across sources whose sizes differ, never exceeding a source's size.

    Sources are visited smallest first; each takes the smaller of its own size
    and an equal share of what is still needed, so small sources are used up
    and the shortfall is pushed onto the larger ones. The quotas sum to
    ``min(count, sum(available.values()))``.
    """
    quotas = {}
    still_needed = count
    sources_left = len(available)
    for name, size in sorted(available.items(), key=lambda item: item[1]):
        quotas[name] = min(size, still_needed // sources_left)
        still_needed -= quotas[name]
        sources_left -= 1
    return quotas


def random_balanced_sample(source_name_array, count, reject_minimum=100, force_balance=True, random_seed=13, verbose=True):
    """Draw a random sample of articles spread evenly across the given sources.

    Each source is loaded with ``util.nela_load_articles_from_source``. Sources
    with fewer than ``reject_minimum`` articles are dropped and reported in the
    ``rejected`` list. The remaining sources are sampled as follows:

    * ``count`` fits in a balanced sample (``count <= smallest_source * n_sources``):
      every source contributes ``count // n_sources`` articles and the first
      ``count % n_sources`` sources contribute one more.
    * ``count`` is too large and ``force_balance`` is true: every source
      contributes as many articles as the smallest source has, so fewer than
      ``count`` articles are returned.
    * ``count`` is too large and ``force_balance`` is false: a warning is
      printed and sources are sampled unevenly (small sources are exhausted
      first) until ``count`` articles, or every available article, is taken.
      NOTE(review): this branch was an unimplemented TODO; confirm this is the
      intended meaning of "unbalanced output".

    Args:
        source_name_array: iterable of source names to query.
        count: total number of articles requested.
        reject_minimum: sources with fewer articles than this are rejected.
        force_balance: keep per-source counts equal even if that means
            returning fewer than ``count`` articles.
        random_seed: ``random_state`` passed to every per-source ``sample`` call.
        verbose: print progress messages and show tqdm progress bars.

    Returns:
        ``(sample_df, returned_counts, rejected)``: the stacked sample (``None``
        when no source was sampled), a dict mapping each sampled source to the
        number of rows drawn from it, and the list of rejected source names.
    """
    building_df = None
    counts = {}
    returned_counts = {}
    rejected = []

    for name in tqdm(source_name_array, "Querying sources", disable=(not verbose)):
        local_df = util.nela_load_articles_from_source(name)
        local_count = local_df.shape[0]
        if verbose:
            print(name, local_count, "articles")

        if local_count < reject_minimum:
            rejected.append(name)
            if verbose:
                print(name, "rejected")
            continue

        counts[name] = local_count
        building_df = util.stack_dfs(building_df, local_df)

    # Nothing survived the minimum-size filter: there is nothing to sample and
    # the per-source arithmetic below would divide by zero.
    if not counts:
        if verbose:
            print("No sources with at least", reject_minimum, "articles")
        return None, returned_counts, rejected

    minimum_count = min(counts.values())
    max_possible_balanced = minimum_count * len(counts)
    if verbose:
        print(max_possible_balanced, "maximum possible balanced sample size")

    if count > max_possible_balanced:
        if verbose:
            print("Grabbing maximum")
        if force_balance:
            if verbose:
                print("Force balance requested, will not return", count, "as requested")
            quotas = dict.fromkeys(counts, minimum_count)
        else:
            print("WARNING: unbalanced output")
            quotas = _capped_quotas(counts, count)
    else:
        quotas = _even_quotas(list(counts), count)
        if verbose:
            print("Grabbing", count // len(counts), "per source")

    sample_df = None
    for name in tqdm(counts.keys(), "Sampling", disable=(not verbose)):
        # Rows are selected by their `source` column, so the loader is assumed
        # to label every row with the queried source name.
        source_sample_df = building_df[building_df.source == name].sample(quotas[name], random_state=random_seed)
        returned_counts[name] = source_sample_df.shape[0]
        sample_df = util.stack_dfs(sample_df, source_sample_df)

    return sample_df, returned_counts, rejected
    

In [12]:
# see notebook 2
# Source-name lists, one unreliable/reliable pair per labelling scheme (os_, mbfc_, ng_).
# NOTE(review): the prefixes presumably stand for OpenSources, Media Bias/Fact Check and
# NewsGuard -- confirm against notebook 2.
# These names are handed to random_balanced_sample, which passes each one to
# util.nela_load_articles_from_source, so they presumably have to match the dataset's
# own source names exactly; do not "correct" apparent misspellings without checking.
os_unreliable = ['Addicting Info', 'Breitbart', 'CNS News', 'Intellihub', 'LewRockwell', 'NODISINFO', 'Politicus USA', 'Shareblue', 'The Duran', 'The Gateway Pundit', 'The Political Insider', 'The Washington Examiner', 'TheAntiMedia', 'True Activist', 'Veterans Today']
os_reliable = ['Alternet', 'Fusion', 'oann']

# mbfc_ labelling scheme: unreliable vs. reliable sources.
mbfc_unreliable = ['21stCenturyWire', 'Activist Post', 'Addicting Info', 'Alternet', 'Bearing Arms', 'Birmingham Mail', 'Buzzfeed', 'CNN', 'Counter Current News', 'Crooks and Liars', 'Daily Kos', 'Daily Signal', 'Drudge Report', 'Feministing Blog', 'Fox News', 'Fusion', 'GlobalResearch', 'HumansAreFree', 'Infowars', 'Intellihub', 'Investors Business Daily', 'Live Action', 'MSNBC', 'NODISINFO', 'National Review', 'Natural News', 'New York Daily News', 'New York Post', 'Newsweek', 'Palmer Report', 'Pravada Report', 'Prison Planet', 'Raw Story', 'RedState', 'RightWingWatch', 'Russia-Insider', 'Shareblue', 'Sputnik', 'Telesur TV', 'The Daily Caller', 'The Daily Express', 'The Daily Mirror', 'The Daily Record', 'The Daily Star', 'The Political Insider', 'The Right Scoop', 'The Sun', 'TheAntiMedia', 'TheBlaze', 'ThinkProgress', 'True Pundit', 'Veterans Today', 'Waking Times', 'Western Journal', 'Yahoo News', 'sott.net']
mbfc_reliable = ['ABC News', 'Al Jazeera', 'BBC', 'Business Insider', 'CBS News', 'CNBC', 'Chicago Sun-Times', 'Daily Beast', 'Democracy 21', 'Evening Standard', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Forward Progessives', 'France24', 'Hot Air', 'Interpreter Mag', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'New Yorker', 'PBS', 'Pink News UK', 'Politico', 'Real Clear Politics', 'Reuters', 'Salon', 'Shadow Proof', 'SkyNewsPolitics', 'SkyNewsUS', 'Slate', 'Spiegel', 'Talking Points Memo', 'Tass', 'The American Conservative', 'The Atlantic', 'The Denver Post', 'The Fiscal Times', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The Irish Times', 'The Moscow Times', 'The New York Times', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'USA Today', 'Vox', 'Washington Monthly', 'Washington Post', 'Wings Over Scotland', 'iPolitics']

# ng_ labelling scheme: unreliable vs. reliable sources.
ng_unreliable = ['Al Jazeera', 'Bipartisan Report', 'Breitbart', 'Daily Kos', 'Daily Mail', 'Drudge Report', 'FrontPage Magazine', 'Infowars', 'Instapundit', 'Live Action', 'Natural News', 'Palmer Report', 'Pamela Geller Report', 'RT', 'Shareblue', 'Sputnik', 'The Conservative Tree House', 'The Duran', 'The Gateway Pundit', 'The Political Insider', 'The Right Scoop', 'TheAntiMedia', 'TheBlaze', 'True Pundit', 'Western Journal']
ng_reliable = ['ABC News', 'Alternet', 'BBC', 'Bearing Arms', 'Business Insider', 'Buzzfeed', 'CBS News', 'CNBC', 'CNN', 'CNS News', 'Chicago Sun-Times', 'Crooks and Liars', 'Daily Beast', 'Daily Signal', 'FT Westminster Blog', 'FiveThirtyEight', 'Foreign Policy', 'Fortune', 'Fox News', 'Investors Business Daily', 'MSNBC', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'News Busters', 'Newsweek', 'Observer', 'PBS', 'Politico', 'Politicus USA', 'Raw Story', 'Real Clear Politics', 'Reuters', 'Salon', 'Slate', 'Talking Points Memo', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Denver Post', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The New York Times', 'The Verge', 'The Washington Examiner', 'ThinkProgress', 'USA Today', 'Vox', 'WSJ Washington Wire', 'Washington Monthly', 'Washington Post', 'Yahoo News']

In [None]:
# Source-name lists grouped by bias label, one biased/unbiased pair per labelling scheme.
# NOTE(review): mbfc_ and as_ presumably stand for Media Bias/Fact Check and AllSides -- confirm.
# The unbiased lists are much shorter than the biased ones (3 vs. 92 for mbfc_, 11 vs. 54 for as_).
mbfc_biased = ['ABC News', 'Addicting Info', 'Al Jazeera', 'Alternet', 'BBC', 'Bearing Arms', 'Birmingham Mail', 'Business Insider', 'Buzzfeed', 'CBS News', 'CNBC', 'CNN', 'Chicago Sun-Times', 'Crooks and Liars', 'Daily Beast', 'Daily Kos', 'Daily Signal', 'Democracy 21', 'Drudge Report', 'Evening Standard', 'Feministing Blog', 'FiveThirtyEight', 'Fortune', 'Forward Progessives', 'Fox News', 'France24', 'Fusion', 'Hot Air', 'Interpreter Mag', 'Investors Business Daily', 'MSNBC', 'Media Matters for America', 'Mercury News', 'MotherJones', 'NPR', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'Newsweek', 'PBS', 'Palmer Report', 'Pink News UK', 'Pravada Report', 'Raw Story', 'Real Clear Politics', 'RedState', 'RightWingWatch', 'Russia-Insider', 'Salon', 'Shadow Proof', 'Shareblue', 'SkyNewsPolitics', 'SkyNewsUS', 'Slate', 'Spiegel', 'Sputnik', 'Talking Points Memo', 'Tass', 'Telesur TV', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Daily Express', 'The Daily Mirror', 'The Daily Record', 'The Denver Post', 'The Fiscal Times', 'The Guardian', 'The Hill', 'The Huffington Post', 'The Independent', 'The Intercept', 'The Irish Times', 'The Moscow Times', 'The New York Times', 'The Political Insider', 'The Right Scoop', 'The Sun', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'TheBlaze', 'ThinkProgress', 'USA Today', 'Vox', 'Washington Monthly', 'Washington Post', 'Western Journal', 'Wings Over Scotland', 'Yahoo News', 'iPolitics']
mbfc_unbiased = ['Foreign Policy', 'Politico', 'Reuters']

# as_ labelling scheme: biased vs. unbiased sources.
as_biased = ['ABC News', 'Alternet', 'Breitbart', 'Buzzfeed', 'CBS News', 'CNN', 'CNS News', 'Chicago Sun-Times', 'Daily Beast', 'Daily Kos', 'Daily Mail', 'Daily Signal', 'Drudge Report', 'Fox News', 'FrontPage Magazine', 'Hot Air', 'Infowars', 'Intellectual Conservative', 'Investors Business Daily', 'Live Action', 'MSNBC', 'Media Matters for America', 'MotherJones', 'National Review', 'New York Daily News', 'New York Post', 'New Yorker', 'Newsweek', 'Politics UK', 'Politicus USA', 'Raw Story', 'RedState', 'RightWingWatch', 'Salon', 'Slate', 'The American Conservative', 'The Atlantic', 'The Daily Caller', 'The Fiscal Times', 'The Gateway Pundit', 'The Guardian', 'The Huffington Post', 'The Intercept', 'The Michelle Malkin Blog', 'The New York Times', 'The Telegraph', 'The Verge', 'The Washington Examiner', 'ThinkProgress', 'Vox', 'Washington Monthly', 'Washington Post', 'Western Journal', 'Yahoo News']
as_unbiased = ['Al Jazeera', 'BBC', 'Business Insider', 'CNBC', 'FiveThirtyEight', 'NPR', 'PBS', 'Real Clear Politics', 'Reuters', 'The Hill', 'USA Today']

In [18]:
# Request 4500 articles spread across the os_unreliable sources; also keep the
# per-source counts and the names of any sources that were rejected.
df, counts, rejected = random_balanced_sample(source_name_array=os_unreliable, count=4500)

HBox(children=(IntProgress(value=0, description='Querying sources', max=15, style=ProgressStyle(description_wi…

Addicting Info 429 articles
Breitbart 1877 articles
CNS News 5263 articles
Intellihub 334 articles
LewRockwell 1278 articles
NODISINFO 29 articles
NODISINFO rejected
Politicus USA 4018 articles
Shareblue 2134 articles
The Duran 959 articles
The Gateway Pundit 5667 articles
The Political Insider 2680 articles
The Washington Examiner 469 articles
TheAntiMedia 666 articles
True Activist 370 articles
Veterans Today 2624 articles

4676 maximum possible balanced sample size
Grabbing 321 per source


HBox(children=(IntProgress(value=0, description='Sampling', max=14, style=ProgressStyle(description_width='ini…




In [19]:
# Number of articles drawn from each accepted source by the random_balanced_sample call.
counts

{'Addicting Info': 322,
 'Breitbart': 322,
 'CNS News': 322,
 'Intellihub': 322,
 'LewRockwell': 322,
 'Politicus USA': 322,
 'Shareblue': 321,
 'The Duran': 321,
 'The Gateway Pundit': 321,
 'The Political Insider': 321,
 'The Washington Examiner': 321,
 'TheAntiMedia': 321,
 'True Activist': 321,
 'Veterans Today': 321}