https://components.one/datasets/all-the-news-2-news-articles-dataset

## Read media bias ratings

In [1]:
import csv
import pandas as pd
import numpy as np
import re

In [2]:
media_bias = {}
with open('media-bias.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        pub = re.sub(r" \(.*\)", '', row[0])
        pub = re.sub(r"^The ", '', pub)
        media_bias[pub] = float(row[1])

## Read publication names

8.8M rows total

In [4]:
batch_size = 100000
total_rows = 9000000
publications = np.array([], dtype=object)

In [5]:
for i in range(0, total_rows, batch_size):
    print(i)
    news = pd.read_csv('all-the-news-2-1.csv', nrows=batch_size, skiprows=range(1, i))
    #print(publications)
    #print(news.publication.unique())
    publications = np.concatenate((publications, news.publication.unique()), axis=None)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


  exec(code_obj, self.user_global_ns, self.user_ns)


2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [45]:
filtered_pubs = [p for p in publications if type(p) == str]

In [48]:
unique_pubs = np.unique(filtered_pubs)

In [76]:
unique_pubs

array(['Axios', 'Business Insider', 'Buzzfeed News', 'CNBC', 'CNN',
       'Economist', 'Fox News', 'Gizmodo', 'Hyperallergic', 'Mashable',
       'New Republic', 'New Yorker', 'People', 'Politico', 'Refinery 29',
       'Reuters', 'TMZ', 'TechCrunch', 'The Hill', 'The New York Times',
       'The Verge', 'Vice', 'Vice News', 'Vox', 'Washington Post',
       'Wired'], dtype='<U18')

## Check overlap

In [6]:
unique_pubs = ['Axios', 'Business Insider', 'Buzzfeed News', 'CNBC', 'CNN',
       'Economist', 'Fox News', 'Gizmodo', 'Hyperallergic', 'Mashable',
       'New Republic', 'New Yorker', 'People', 'Politico', 'Refinery 29',
       'Reuters', 'TMZ', 'TechCrunch', 'The Hill', 'The New York Times',
       'The Verge', 'Vice', 'Vice News', 'Vox', 'Washington Post',
       'Wired']

In [13]:
len(media_bias.keys()), len(unique_pubs)

(589, 26)

In [9]:
unique_pubs_proc = [re.sub(r"^The ", '', p) for p in unique_pubs]
unique_pubs_proc = [re.sub(r"^Refinery 29", 'Refinery29', p) for p in unique_pubs]

In [15]:
overlap = set(unique_pubs_proc).intersection(set(media_bias.keys()))
len(overlap), overlap

(19,
 {'Axios',
  'CNBC',
  'CNN',
  'Economist',
  'Fox News',
  'Gizmodo',
  'Hill',
  'Mashable',
  'New Republic',
  'New York Times',
  'New Yorker',
  'Politico',
  'Reuters',
  'TechCrunch',
  'Verge',
  'Vice',
  'Vox',
  'Washington Post',
  'Wired'})

In [17]:
set(unique_pubs_proc) - overlap

{'Business Insider',
 'Buzzfeed News',
 'Hyperallergic',
 'People',
 'Refinery 29',
 'TMZ',
 'Vice News'}

- Buzzfeed News vs BuzzFeed news (case sensitivity)
- Refinery29 vs Refinery 29

In [31]:
[k for k in media_bias.keys() if re.findall('vice', k, re.IGNORECASE)]

['Vice']

## Map publication names

- CNN (online vs opinion)
- Fox News (online vs opinion)
- New York Times (vs The)

1. online and opinion have the same score
2. strip trailing "The"

In [11]:
def media_bias_for(publication):
    pub = re.sub(r"^The ", '', publication)
    try:
        return media_bias[pub]
    except KeyError:
        return None

In [12]:
sorted([(media_bias_for(p), p) for p in unique_pubs if media_bias_for(p) != None])

[(-1.0, 'CNN'),
 (-1.0, 'Mashable'),
 (-1.0, 'New Republic'),
 (-1.0, 'New Yorker'),
 (-1.0, 'The New York Times'),
 (-1.0, 'Vice'),
 (-1.0, 'Vox'),
 (-0.5, 'Economist'),
 (-0.5, 'Gizmodo'),
 (-0.5, 'Politico'),
 (-0.5, 'The Verge'),
 (-0.5, 'Washington Post'),
 (0.0, 'Axios'),
 (0.0, 'CNBC'),
 (0.0, 'Reuters'),
 (0.0, 'TechCrunch'),
 (0.0, 'The Hill'),
 (0.0, 'Wired'),
 (1.0, 'Fox News')]

## Collate data

What would I like this dataset to contain?

- An even number of articles for each bias level
- A publication at each bias level held back for the validation set
  - Maybe two or three

How would I like to build that?

- Make a per-publication train/valid split ahead of time
- Sample various offsets in the source file without replacement
- Write all fields, plus a numeric bias
- Start ignoring 