# This Notebook is intended to take the original all-the-news-2-1.csv dataset and pre-process it.

First I will prune the dataset to remove undesired publications, "nan" values in most entries.

Second I will combine it with another dataset which includes more publications

Third I will add publication media bias and factual reporting scores as fields, based on the scoring on https://mediabiasfactcheck.com/. 

Fourth (not in this notebook) I will break each article into statement groups and ID any social groups / identities contained within. 

In [7]:
# General-purpose Libraries
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import spacy
from time import time
%matplotlib inline

# Tools for processing data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, make_scorer, adjusted_rand_score, silhouette_score, homogeneity_score, normalized_mutual_info_score
# Classifiers, supervised and unsupervised
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

import warnings
warnings.filterwarnings("ignore")

In [8]:
spacy.prefer_gpu()

False

In [9]:
df = pd.read_csv('all-the-news-2-1.csv')

In [10]:
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication
0,0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,3,3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,4,4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ
5,5,5,2019-06-17 00:00:00,2019,6.0,17,,"ECB's Coeure: If we decide to cut rates, we'd ...","BERLIN, June 17 (Reuters) - ECB board member B...",https://www.reuters.com/article/ecb-policy-coe...,Financials,Reuters
6,6,6,2019-06-23 00:00:00,2019,6.0,23,,"Venezuela detains six military, police officia...",CARACAS (Reuters) - Venezuelan authorities hav...,https://www.reuters.com/article/us-venezuela-p...,World News,Reuters
7,7,7,2018-05-02 17:09:00,2018,5.0,2,Caroline Williams,You Can Trick Your Brain Into Being More Focused,If only every day could be like this. You can’...,https://www.vice.com/en_us/article/9kgp4v/how-...,Health,Vice
8,8,8,2016-05-18 13:00:06,2016,5.0,18,Mark Bergen,How to watch the Google I/O keynote live,"Google I/O, the company's big developer confer...",https://www.vox.com/2016/5/18/11697070/how-to-...,,Vox
9,9,9,2017-03-02 00:00:00,2017,3.0,2,Tim Hume,China is dismissing unfavorable media reports ...,China is dismissing unfavorable media reports ...,https://news.vice.com/en_us/article/xwvj7j/chi...,,Vice News


In [17]:
df.publication.unique()

array(['Vox', 'Business Insider', 'Reuters', 'TMZ', 'Vice', 'Vice News',
       'Hyperallergic', 'TechCrunch', 'Axios', 'Refinery 29', 'The Verge',
       'Mashable', 'People', 'Economist', 'CNN', 'Gizmodo', 'New Yorker',
       'CNBC', 'Wired', 'New Republic', 'Fox News', 'The Hill',
       'Politico', 'The New York Times', 'Buzzfeed News',
       'Washington Post', nan], dtype=object)

In [18]:
keep_pubs = ['Vox','Axios','Business Insider','Reuters','Vice News','Economist','CNN','New Yorker','CNBC','New Republic','Fox News','The Hill','Politico','The New York Times','Buzzfeed News','Washington Post']

In [19]:
df_drop_pubs = ['TMZ', 'Vice', 'Hyperallergic', 'TechCrunch',
       'Refinery 29', 'The Verge', 'Mashable', 'People', 'Gizmodo',
       'Wired']

In [20]:
df_select_publications = df[~df.publication.isin(df_drop_pubs)]

In [21]:
df_select_publications.publication.unique()

array(['Vox', 'Business Insider', 'Reuters', 'Vice News', 'Axios',
       'Economist', 'CNN', 'New Yorker', 'CNBC', 'New Republic',
       'Fox News', 'The Hill', 'Politico', 'The New York Times',
       'Buzzfeed News', 'Washington Post', nan], dtype=object)

In [22]:
df_select_publications.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication
0,0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,3,3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
5,5,5,2019-06-17 00:00:00,2019,6.0,17,,"ECB's Coeure: If we decide to cut rates, we'd ...","BERLIN, June 17 (Reuters) - ECB board member B...",https://www.reuters.com/article/ecb-policy-coe...,Financials,Reuters


In [23]:
df_select_publications = df_select_publications[df_select_publications['publication'].notna()]

In [24]:
df_select_publications.info(null_counts =True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2018000 entries, 0 to 2688839
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Unnamed: 0    2018000 non-null  int64  
 1   Unnamed: 0.1  2018000 non-null  object 
 2   date          2018000 non-null  object 
 3   year          2018000 non-null  object 
 4   month         2018000 non-null  float64
 5   day           2018000 non-null  object 
 6   author        1093276 non-null  object 
 7   title         2017964 non-null  object 
 8   article       1952209 non-null  object 
 9   url           2018000 non-null  object 
 10  section       1450194 non-null  object 
 11  publication   2018000 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 200.1+ MB


In [25]:
df_select_publications = df_select_publications[df_select_publications['title'].notna()]

In [26]:
df_select_publications = df_select_publications[df_select_publications['article'].notna()]

In [27]:
df_select_publications = df_select_publications.drop(['section'], axis=1)

In [28]:
df_select_publications.info(null_counts =True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1952193 entries, 0 to 2688839
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Unnamed: 0    1952193 non-null  int64  
 1   Unnamed: 0.1  1952193 non-null  object 
 2   date          1952193 non-null  object 
 3   year          1952193 non-null  object 
 4   month         1952193 non-null  float64
 5   day           1952193 non-null  object 
 6   author        1086916 non-null  object 
 7   title         1952193 non-null  object 
 8   article       1952193 non-null  object 
 9   url           1952193 non-null  object 
 10  publication   1952193 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 178.7+ MB


# Add in the second dataset

In [30]:
import sqlite3 as sql


In [39]:
con = sql.connect("all-the-news.db")
res = con.execute("SELECT name FROM sqlite_master WHERE type='table';")
for name in res:
    print(name[0])

longform


In [43]:
# df2 = pd.read_sql("all-the-news",con)
df2 = pd.read_sql_query("SELECT * FROM longform", con)
con.close()

In [44]:
df2.head()

Unnamed: 0,id,title,author,date,content,year,month,publication,category,digital,section,url
0,1,Agent Cooper in Twin Peaks is the audience: on...,\nTasha Robinson\n,2017-05-31,And never more so than in Showtime’s new...,2017,5,Verge,Longform,1.0,,
1,2,"AI, the humanity!",\nSam Byford\n,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017,5,Verge,Longform,1.0,,
2,3,The Viral Machine,\nKaitlyn Tiffany\n,2017-05-25,Super Deluxe built a weird internet empi...,2017,5,Verge,Longform,1.0,,
3,4,How Anker is beating Apple and Samsung at thei...,\nNick Statt\n,2017-05-22,Steven Yang quit his job at Google in th...,2017,5,Verge,Longform,1.0,,
4,5,Tour Black Panther’s reimagined homeland with ...,\nKwame Opam\n,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017,5,Verge,Longform,1.0,,


In [45]:
df2.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204135 entries, 0 to 204134
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           204135 non-null  int64  
 1   title        204135 non-null  object 
 2   author       172000 non-null  object 
 3   date         191532 non-null  object 
 4   content      191181 non-null  object 
 5   year         191532 non-null  object 
 6   month        191532 non-null  object 
 7   publication  196420 non-null  object 
 8   category     168713 non-null  object 
 9   digital      193115 non-null  float64
 10  section      74572 non-null   object 
 11  url          98796 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 18.7+ MB


In [49]:
# Now have to rename content to article, 
df2.rename(columns = {'content':'article'}, inplace=True)

In [64]:
# process dates into the same format by creating a day col in the new dataset, and extracting the day from the date column into it

# drop undated articles:
df2 = df2[df2['date'].notna()]

def split_date(row) -> str:
    try:
        day = row.date.split("-")[-1]
        return day
    except:
        print(row)    


df2['day'] = df2.apply(
    lambda row: split_date(row),
    axis=1
)

In [68]:
df2.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191532 entries, 0 to 204132
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           191532 non-null  int64  
 1   title        191532 non-null  object 
 2   author       169236 non-null  object 
 3   date         191532 non-null  object 
 4   article      185663 non-null  object 
 5   year         191532 non-null  object 
 6   month        191532 non-null  object 
 7   publication  191532 non-null  object 
 8   category     163825 non-null  object 
 9   digital      188227 non-null  float64
 10  section      71349 non-null   object 
 11  url          95570 non-null   object 
 12  day          191532 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 20.5+ MB


In [76]:
# drop digital, category, & section cols
# df2 = df2.drop(['digital','section','category'], axis=1)
df2.head()

Unnamed: 0,id,title,author,date,article,year,month,publication,url,day
0,1,Agent Cooper in Twin Peaks is the audience: on...,\nTasha Robinson\n,2017-05-31,And never more so than in Showtime’s new...,2017,5,Verge,,31
1,2,"AI, the humanity!",\nSam Byford\n,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017,5,Verge,,30
2,3,The Viral Machine,\nKaitlyn Tiffany\n,2017-05-25,Super Deluxe built a weird internet empi...,2017,5,Verge,,25
3,4,How Anker is beating Apple and Samsung at thei...,\nNick Statt\n,2017-05-22,Steven Yang quit his job at Google in th...,2017,5,Verge,,22
4,5,Tour Black Panther’s reimagined homeland with ...,\nKwame Opam\n,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017,5,Verge,,15


In [79]:
# process \n in author names
def process_name(row) -> str:
    try:
        return row.author.replace("\n","")
    except:
        # some articles don't have an author
        return row.author
            

df2['author'] = df2.apply(lambda row: process_name(row), axis=1)

In [80]:
df2.head()

Unnamed: 0,id,title,author,date,article,year,month,publication,url,day
0,1,Agent Cooper in Twin Peaks is the audience: on...,Tasha Robinson,2017-05-31,And never more so than in Showtime’s new...,2017,5,Verge,,31
1,2,"AI, the humanity!",Sam Byford,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017,5,Verge,,30
2,3,The Viral Machine,Kaitlyn Tiffany,2017-05-25,Super Deluxe built a weird internet empi...,2017,5,Verge,,25
3,4,How Anker is beating Apple and Samsung at thei...,Nick Statt,2017-05-22,Steven Yang quit his job at Google in th...,2017,5,Verge,,22
4,5,Tour Black Panther’s reimagined homeland with ...,Kwame Opam,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017,5,Verge,,15


In [85]:
# ok it's basically ready to merge into the larger dataset. 
# I only want to keep certain publications which were missing from the original dataset
df2.publication.unique()

array(['Verge', 'Los Angeles Times', 'New York Times', 'Breitbart', 'CNN',
       'Business Insider', 'Atlantic', 'Fox News', 'Talking Points Memo',
       'Buzzfeed News', 'National Review', 'New York Post', 'Guardian',
       'NPR', 'Reuters', 'Vox', 'Washington Post', 'New Inquiry'],
      dtype=object)

In [87]:
keep_pubs

['Vox',
 'Axios',
 'Business Insider',
 'Reuters',
 'Vice News',
 'Economist',
 'CNN',
 'New Yorker',
 'CNBC',
 'New Republic',
 'Fox News',
 'The Hill',
 'Politico',
 'The New York Times',
 'Buzzfeed News',
 'Washington Post']

In [101]:
# I'll add in Brietbart and New York Post, which are both to the right.

keep_df2 = ["Breitbart","New York Post"]
nyp_bb_df = df2[df2.publication.isin(keep_df2)]
nyp_bb_df.publication.unique()

array(['Breitbart', 'New York Post'], dtype=object)

In [103]:
nyp_bb_df.head()

Unnamed: 0,id,title,author,date,article,year,month,publication,url,day
18049,26539,CNN's Zeleny: 'Hard to Imagine' Obama Would Ha...,Ian Hanchett,2017-01-17,MOST POPULAR[On Tuesday’s broadcast of CNN’s “...,2017,1,Breitbart,,17
18050,26540,American Students on Spring Break Chant 'Build...,Katherine Rodriguez,2017-03-21,A group of American spring break revelers repo...,2017,3,Breitbart,,21
18051,26541,Surge in 'Honour Crimes' and Forced Marriages ...,Liam Deacon,2017-04-08,So-called “honour crimes” have risen by 40 per...,2017,4,Breitbart,,8
18052,26542,MILO Announces New Media Venture - Breitbart,Lucas Nolan,2017-04-28,Former Breitbart Senior Editor MILO has announ...,2017,4,Breitbart,,28
18053,26543,Jared Kushner at Center of Media Spotlight on ...,Penny Starr,2017-05-27,The focus of the continuous media reports of a...,2017,5,Breitbart,,27


In [112]:
nyp_bb_df = nyp_bb_df[nyp_bb_df['article'].notna()]

In [113]:
combined_df = df_select_publications.append(nyp_bb_df, ignore_index=True)

In [114]:
combined_df.loc[combined_df['publication'] == "New York Post"]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,publication,id
1975984,,,2016-12-31,2016,12,31,Associated Press,Gunman dressed as Santa Claus kills dozens at ...,ISTANBUL — An assailant believed to have been ...,https://web.archive.org/web/20170101013549/htt...,New York Post,122293.0
1975985,,,2016-12-31,2016,12,31,Lindsay Putnam,Disney could receive $50M for Carrie Fisher’s ...,Carrie Fisher’s death could be a windfall for ...,https://web.archive.org/web/20170101013549/htt...,New York Post,122294.0
1975986,,,2016-12-31,2016,12,31,Howie Kussoy,Alabama torments Washington with old-school fo...,ATLANTA — Washington wanted to prove the world...,https://web.archive.org/web/20170101013549/htt...,New York Post,122295.0
1975987,,,2016-12-31,2016,12,31,Post Staff Report,"NBC, Charter extend talks to avert midnight bl...",A midnight blackout of NBC on local cable scre...,https://web.archive.org/web/20170101013549/htt...,New York Post,122296.0
1975988,,,2016-12-31,2016,12,31,Post Staff Report,"Phil Jackson bolts, Greg Bird explodes and mor...",This is the best day of the year to be a sport...,https://web.archive.org/web/20170101013549/htt...,New York Post,122297.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1993066,,,2016-05-05,2016,5,05,Post Editorial Board,Fresh scandal for Mayor de Blasio — and bad ne...,If Mayor Bill de Blasio personally sought dona...,http://nypost.com/2016/05/05/fresh-scandal-for...,New York Post,145561.0
1993067,,,2016-04-08,2016,4,08,Jonah Goldberg,Bernie Sanders’ drive to keep people poor,"For years, supporters of free trade have been ...",http://nypost.com/2016/04/08/bernie-sanders-dr...,New York Post,145562.0
1993068,,,2016-08-16,2016,8,16,Claire Atkinson and Julia Marsh,Univision buys bankrupt Gawker for $135M,Univision won the bidding war for Gawker Media...,http://nypost.com/2016/08/16/univision-buys-ba...,New York Post,145563.0
1993069,,,2016-01-06,2016,1,06,Lia Eustachewich,Jury will decide fate of cop who fatally shot ...,A ​Brooklyn jury will decide the fate of an NY...,http://nypost.com/2016/01/06/jury-will-decide-...,New York Post,145564.0


In [125]:
combined_df.drop(['Unnamed: 0','Unnamed: 0.1', 'id'], axis=1)

Unnamed: 0,date,year,month,day,author,title,article,url,publication,bias,factual-reporting
0,2016-12-09 18:31:00,2016,12,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,Vox,left,mostly factual
1,2016-10-07 21:26:46,2016,10,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,Business Insider,left-center,high
2,2018-01-26 00:00:00,2018,1,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Reuters,least biased,very high
3,2019-06-27 00:00:00,2019,6,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,Reuters,least biased,very high
4,2019-06-17 00:00:00,2019,6,17,,"ECB's Coeure: If we decide to cut rates, we'd ...","BERLIN, June 17 (Reuters) - ECB board member B...",https://www.reuters.com/article/ecb-policy-coe...,Reuters,least biased,very high
...,...,...,...,...,...,...,...,...,...,...,...
1993066,2016-05-05,2016,5,05,Post Editorial Board,Fresh scandal for Mayor de Blasio — and bad ne...,If Mayor Bill de Blasio personally sought dona...,http://nypost.com/2016/05/05/fresh-scandal-for...,New York Post,right-center,mixed
1993067,2016-04-08,2016,4,08,Jonah Goldberg,Bernie Sanders’ drive to keep people poor,"For years, supporters of free trade have been ...",http://nypost.com/2016/04/08/bernie-sanders-dr...,New York Post,right-center,mixed
1993068,2016-08-16,2016,8,16,Claire Atkinson and Julia Marsh,Univision buys bankrupt Gawker for $135M,Univision won the bidding war for Gawker Media...,http://nypost.com/2016/08/16/univision-buys-ba...,New York Post,right-center,mixed
1993069,2016-01-06,2016,1,06,Lia Eustachewich,Jury will decide fate of cop who fatally shot ...,A ​Brooklyn jury will decide the fate of an NY...,http://nypost.com/2016/01/06/jury-will-decide-...,New York Post,right-center,mixed


In [116]:
# perfect, now I can save it
combined_df.to_csv('combined_news.csv')

# Next Steps

Now I have a dataset which only contains the publications I want, and has been slimmed down to only the columns I want to use.

Next: I must add the publication bias statements from https://mediabiasfactcheck.com/ as I am certain that will be useful.

In [117]:
# keep_pubs  # to review the publications desired
combined_df.publication.unique()

array(['Vox', 'Business Insider', 'Reuters', 'Vice News', 'Axios',
       'Economist', 'CNN', 'New Yorker', 'CNBC', 'New Republic',
       'Fox News', 'The Hill', 'Politico', 'The New York Times',
       'Buzzfeed News', 'Washington Post', 'Breitbart', 'New York Post'],
      dtype=object)

In [118]:
bias_scores = {
    "Vox":{
        'bias':'left',
        'factual-reporting':'mostly factual'
    },
    "Axios":{
        "bias":"left-center",
        'factual-reporting':"high"
    },
    "Business Insider":{
        'bias':'left-center',
        'factual-reporting':'high'
    },
    'Reuters':{
        'bias':'least biased',
        'factual-reporting':'very high'
    },
    'Vice News':{
        'bias':'left-center',
        'factual-reporting':'high'
    },
    'Economist':{
        'bias':'least biased',
        'factual-reporting':'high'
    },
    'CNN':{
        'bias':'left',
        'factual-reporting':'mixed'
    },
    'New Yorker':{
        'bias':'left',
        'factual-reporting':'high'
    },
    'CNBC':{
        'bias':'left-center',
        'factual-reporting':'mostly factual'
    },
    'New Republic':{
        'bias':'left',
        'factual-reporting':'high'
    },
    'Fox News':{
        'bias':'right',
        'factual-reporting':'mixed'
    },
    'The Hill':{
        'bias':'least biased',
        'factual-reporting':'mostly factual'
    },
    'Politico':{
        'bias':'least biased',
        'factual-reporting':'high'
    },
    'The New York Times':{
        'bias':'left-center',
        'factual-reporting':'high'
    },
    'Buzzfeed News':{
        'bias':'left-center',
        'factual-reporting':'mixed'
    },
    'Washington Post':{
        'bias':'left-center',
        'factual-reporting':'high'
    },
    'New York Post':{
        'bias':'right-center',
        'factual-reporting':'mixed'
    },
    'Breitbart':{
        'bias':'right',
        'factual-reporting':'mixed'
    },
    
}

In [121]:
combined_df['bias'] = combined_df.apply(
    lambda row: bias_scores[row.publication]["bias"],
    axis=1
)

In [122]:
combined_df['factual-reporting'] = combined_df.apply(
    lambda row: bias_scores[row.publication]["factual-reporting"],
    axis=1
)

In [123]:
combined_df.to_csv('combined_news.csv')

In [126]:
combined_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,publication,id,bias,factual-reporting
0,0.0,0,2016-12-09 18:31:00,2016,12,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,Vox,,left,mostly factual
1,1.0,1,2016-10-07 21:26:46,2016,10,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,Business Insider,,left-center,high
2,2.0,2,2018-01-26 00:00:00,2018,1,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Reuters,,least biased,very high
3,3.0,3,2019-06-27 00:00:00,2019,6,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,Reuters,,least biased,very high
4,5.0,5,2019-06-17 00:00:00,2019,6,17,,"ECB's Coeure: If we decide to cut rates, we'd ...","BERLIN, June 17 (Reuters) - ECB board member B...",https://www.reuters.com/article/ecb-policy-coe...,Reuters,,least biased,very high


In [127]:
combined_df.drop(['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1', 'id'], axis=1)

Unnamed: 0,date,year,month,day,author,title,article,url,publication,bias,factual-reporting
0,2016-12-09 18:31:00,2016,12,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,Vox,left,mostly factual
1,2016-10-07 21:26:46,2016,10,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,Business Insider,left-center,high
2,2018-01-26 00:00:00,2018,1,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Reuters,least biased,very high
3,2019-06-27 00:00:00,2019,6,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,Reuters,least biased,very high
4,2019-06-17 00:00:00,2019,6,17,,"ECB's Coeure: If we decide to cut rates, we'd ...","BERLIN, June 17 (Reuters) - ECB board member B...",https://www.reuters.com/article/ecb-policy-coe...,Reuters,least biased,very high
...,...,...,...,...,...,...,...,...,...,...,...
1993066,2016-05-05,2016,5,05,Post Editorial Board,Fresh scandal for Mayor de Blasio — and bad ne...,If Mayor Bill de Blasio personally sought dona...,http://nypost.com/2016/05/05/fresh-scandal-for...,New York Post,right-center,mixed
1993067,2016-04-08,2016,4,08,Jonah Goldberg,Bernie Sanders’ drive to keep people poor,"For years, supporters of free trade have been ...",http://nypost.com/2016/04/08/bernie-sanders-dr...,New York Post,right-center,mixed
1993068,2016-08-16,2016,8,16,Claire Atkinson and Julia Marsh,Univision buys bankrupt Gawker for $135M,Univision won the bidding war for Gawker Media...,http://nypost.com/2016/08/16/univision-buys-ba...,New York Post,right-center,mixed
1993069,2016-01-06,2016,1,06,Lia Eustachewich,Jury will decide fate of cop who fatally shot ...,A ​Brooklyn jury will decide the fate of an NY...,http://nypost.com/2016/01/06/jury-will-decide-...,New York Post,right-center,mixed


In [128]:
combined_df.to_csv('combined_news.csv')