https://www.kaggle.com/datasets/snapcrack/all-the-news

## Read media bias ratings

In [39]:
import csv
import pandas as pd
import numpy as np
import re

In [40]:
media_bias = {}
with open('media-bias.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        pub = re.sub(r" \(.*\)", '', row[0])
        pub = re.sub(r"^The ", '', pub)
        media_bias[pub] = float(row[1])

In [41]:
def get_media_bias(publication):
    publication = re.sub(r"^The ", '', publication)
    publication = re.sub(r"^Breitbart", 'Breitbart News', publication)
    publication = re.sub(r"^Buzzfeed News", 'BuzzFeed News', publication)
    
    try:
        return media_bias[publication]
    except KeyError:
        return None

## Read publication names

In [42]:
publications = np.array([], dtype=object)

In [43]:
news = pd.read_csv(f'data/articles1.csv')
news.keys()

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [44]:
news1 = pd.read_csv(f'data/articles1.csv')
news2 = pd.read_csv(f'data/articles2.csv')
news3 = pd.read_csv(f'data/articles3.csv')
news4 = pd.read_csv(f'data/articles4.csv')
news = pd.concat([news1, news2, news3, news4])

In [45]:
news.shape, news.keys()

((242574, 12),
 Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
        'month', 'url', 'content', 'day', 'section'],
       dtype='object'))

In [46]:
news.groupby(by='publication').agg('count').sort_values('content')

Unnamed: 0_level_0,Unnamed: 0,id,title,author,date,year,month,url,content,day,section
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
publication,0,0,4,4,4,4,4,4,4,4,4
Fox News,4354,4354,4354,1117,4352,4352,4352,4351,4354,0,0
Buzzfeed News,4854,4854,4854,4853,4854,4854,4854,4854,4854,0,0
Vox,4947,4947,4947,4947,4947,4947,4947,4947,4947,0,0
Talking Points Memo,5214,5214,5213,1676,2615,2615,2615,5214,5214,0,0
National Review,6203,6203,6203,6203,6203,6203,6203,6203,6203,0,0
Business Insider,6757,6757,6757,4950,6757,6757,6757,0,6757,0,0
Atlantic,7179,7179,7179,6199,7179,7179,7179,0,7179,0,0
New York Times,7803,7803,7803,7767,7803,7803,7803,0,7803,0,0
Guardian,8681,8681,8681,7250,8641,8641,8641,8681,8681,0,0


In [47]:
publications = news.publication.unique()

In [48]:
publications

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic', 'Fox News', 'Talking Points Memo', 'Buzzfeed News',
       'National Review', 'New York Post', 'Guardian', 'NPR', 'Reuters',
       'Vox', 'Washington Post', 'Axios', 'publication', 'CNBC',
       'TechCrunch', 'The Hill', 'Wired'], dtype=object)

## Assess balance of biases

In [49]:
sorted([(get_media_bias(p), p) for p in publications if get_media_bias(p) != None])

[(-1.0, 'Buzzfeed News'),
 (-1.0, 'CNN'),
 (-1.0, 'New York Times'),
 (-1.0, 'Vox'),
 (-0.5, 'Atlantic'),
 (-0.5, 'Guardian'),
 (-0.5, 'NPR'),
 (-0.5, 'Washington Post'),
 (0.0, 'Axios'),
 (0.0, 'CNBC'),
 (0.0, 'Reuters'),
 (0.0, 'TechCrunch'),
 (0.0, 'The Hill'),
 (0.0, 'Wired'),
 (1.0, 'Breitbart'),
 (1.0, 'Fox News'),
 (1.0, 'National Review'),
 (1.0, 'New York Post')]

In [50]:
[k for k in media_bias.keys() if re.match('talk', k, re.IGNORECASE)]

[]

## Split train/valid by publication

In [51]:
validation_pubs = ['Vox', 'Washington Post', 'Wired', 'New York Post']

In [52]:
validation_news = news[news['publication'].isin(validation_pubs)]

In [80]:
train_news = news[~news['publication'].isin(validation_pubs)]

In [81]:
len(train_news), len(validation_news)

(189020, 53554)

## Write output

In [85]:
def write_news(news, filename):
    news['text'] = news['title'] + '. ' + news['content']
    news['rating'] = news['publication'].map(get_media_bias)
    news = news.dropna(subset=['text', 'rating'])
    news[['text', 'rating']].to_csv(filename, index=False)

In [86]:
write_news(train_news, 'data/train.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news['text'] = news['title'] + '. ' + news['content']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news['rating'] = news['publication'].map(get_media_bias)


In [87]:
write_news(validation_news, 'data/valid.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news['text'] = news['title'] + '. ' + news['content']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news['rating'] = news['publication'].map(get_media_bias)
