# 0. Packages & Functions

## 0.1. Packages

In [1]:
import pandas as pd
import os

## 0.2. Functions

# 1. Wall Street Journal

## 1.1. URLs

In [2]:
URLs_WSJ_2016 = pd.read_parquet("Wall_Street_Journal_2016_URLS(1)")
URLs_WSJ_2017 = pd.read_parquet("Wall_Street_Journal_2017_URLS (2)")
URLs_WSJ_2018 = pd.read_parquet("Wall_Street_Journal_2018_URLS")
URLs_WSJ_2019 = pd.read_parquet("Wall_Street_Journal_2019_URLS")
URLs_WSJ_2020 = pd.read_parquet("Wall_Street_Journal_2020_URLS")

URLs_WSJ = pd.concat([URLs_WSJ_2016, URLs_WSJ_2017, URLs_WSJ_2018, URLs_WSJ_2019, URLs_WSJ_2020]).reset_index(drop = True)

In [3]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254877,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254878,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254879,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254880,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [4]:
#Remove the duplicate rows and the URLs with an error message in

#duplicates
URLs_WSJ = URLs_WSJ.drop_duplicates().reset_index(drop = True)

#error
mask_WSJ_URLs_error = ~ URLs_WSJ['Link'].str.contains('mod=error_page')
URLs_WSJ = URLs_WSJ[mask_WSJ_URLs_error].reset_index(drop = True)

In [5]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254576,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254577,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254578,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254579,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [6]:
#Collect URLs per year
URLs_WSJ["Date"] = pd.to_datetime(URLs_WSJ['Date'], format='%d/%m/%Y')
URLs_WSJ["Year"] = URLs_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_URLs_1 = URLs_WSJ.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_URLs_2 = URLs_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [17]:
#Check WSJ_year_URLs_1
WSJ_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74024
1,Wall_Street_Journal,2017,59731
2,Wall_Street_Journal,2018,48922
3,Wall_Street_Journal,2019,36290
4,Wall_Street_Journal,2020,35614


In [18]:
WSJ_year_URLs_1["Unique_URLs_Count"].sum()

254581

In [None]:
WSJ_year_URLs_1.to_csv('C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_URLs_1', index = False)

In [11]:
#Check WSJ_year_URLs_2
WSJ_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74023
1,Wall_Street_Journal,2017,59728
2,Wall_Street_Journal,2018,48917
3,Wall_Street_Journal,2019,36283
4,Wall_Street_Journal,2020,35613


## 1.2. Articles

In [19]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Wall Street Journal"

# Get a list of filenames in the directory
filenames = os.listdir(directory)

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
Articles_WSJ = pd.concat(dfs, ignore_index=True)

Articles_WSJ = Articles_WSJ.drop("index", axis = 1)

In [20]:
Articles_WSJ

Unnamed: 0,Title,Text,Link,Date,News Paper
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinki...,http://www.wsj.com/articles/5-fashion-resoluti...,01/01/2016,Wall_Street_Journal
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a p...",http://www.wsj.com/articles/suspect-in-new-yea...,01/01/2016,Wall_Street_Journal
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights m...,http://www.wsj.com/articles/defenders-of-confe...,01/01/2016,Wall_Street_Journal
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t all...,http://www.wsj.com/articles/saudi-women-seek-m...,01/01/2016,Wall_Street_Journal
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carr...",http://www.wsj.com/articles/saudi-beheadings-a...,01/01/2016,Wall_Street_Journal
...,...,...,...,...,...
264704,Smoother by the Dozen,NO ACCESS,https://www.wsj.com/articles/smoother-by-the-d...,29/12/2020,Wall_Street_Journal
264705,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,30/12/2020,Wall_Street_Journal
264706,A Place in the Sun,NO ACCESS,https://www.wsj.com/articles/a-place-in-the-su...,30/12/2020,Wall_Street_Journal
264707,"Shift Gears, Accelerate: CIOs Reordered IT Pri...",Remote work and the acceleration of certain d...,https://www.wsj.com/articles/shift-gears-accel...,30/12/2020,Wall_Street_Journal


In [21]:
#Drop all duplicates and URLs with error message (we only found out about the error after scraping the URLs)

#duplicates
Articles_WSJ = Articles_WSJ.drop_duplicates().reset_index(drop = True)

#error
mask_WSJ_Articles_error = ~ Articles_WSJ['Link'].str.contains('mod=error_page')
Articles_WSJ = Articles_WSJ[mask_WSJ_Articles_error].reset_index(drop = True)

In [22]:
Articles_WSJ

Unnamed: 0,Title,Text,Link,Date,News Paper
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinki...,http://www.wsj.com/articles/5-fashion-resoluti...,01/01/2016,Wall_Street_Journal
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a p...",http://www.wsj.com/articles/suspect-in-new-yea...,01/01/2016,Wall_Street_Journal
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights m...,http://www.wsj.com/articles/defenders-of-confe...,01/01/2016,Wall_Street_Journal
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t all...,http://www.wsj.com/articles/saudi-women-seek-m...,01/01/2016,Wall_Street_Journal
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carr...",http://www.wsj.com/articles/saudi-beheadings-a...,01/01/2016,Wall_Street_Journal
...,...,...,...,...,...
239988,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on A...",https://www.wsj.com/articles/eugene-ashes-sump...,23/12/2020,Wall_Street_Journal
239989,Pepper...and Salt,,https://www.wsj.com/articles/pepper-and-salt-1...,26/12/2020,Wall_Street_Journal
239990,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in...,https://www.wsj.com/articles/china-tells-ant-t...,27/12/2020,Wall_Street_Journal
239991,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,30/12/2020,Wall_Street_Journal


In [23]:
#Scraped articles per year (based on URLs)
Articles_WSJ["Date"] = pd.to_datetime(Articles_WSJ['Date'], format='%d/%m/%Y')
Articles_WSJ["Year"] = Articles_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_Articles_1 = Articles_WSJ.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_Articles_2 = Articles_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [24]:
#Check WSJ_year_Articles_1
WSJ_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,71819
1,Wall_Street_Journal,2017,49757
2,Wall_Street_Journal,2018,28891
3,Wall_Street_Journal,2019,36185
4,Wall_Street_Journal,2020,32595


In [25]:
#Check WSJ_year_Articles_2
WSJ_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,71819
1,Wall_Street_Journal,2017,49754
2,Wall_Street_Journal,2018,28891
3,Wall_Street_Journal,2019,36178
4,Wall_Street_Journal,2020,32594


In [26]:
test = Articles_WSJ[["Date", "Link"]].value_counts().reset_index()
test.columns = ["Date", "Link", "Count"]
link = test[test["Count"] > 1]

In [27]:
link

Unnamed: 0,Date,Link,Count
0,2016-06-03,http://www.wsj.com/articles/sumner-redstone-ha...,3
1,2016-06-07,http://www.wsj.com/articles/wells-fargo-hopes-...,3
2,2016-06-08,http://www.wsj.com/articles/boston-scientific-...,3
3,2016-06-07,http://www.wsj.com/articles/lendingclub-postpo...,3
4,2016-06-06,http://www.wsj.com/articles/eu-to-start-delica...,3
...,...,...,...
20638,2016-03-16,http://www.wsj.com/articles/new-york-city-anno...,2
20639,2016-03-16,http://www.wsj.com/articles/go-slow-britain-u-...,2
20640,2016-03-25,http://www.wsj.com/articles/senior-islamic-sta...,2
20641,2016-03-15,https://www.wsj.com/articles/greek-winter-sala...,2


In [None]:
#Articles_WSJ = Articles_WSJ.drop(Articles_WSJ[Articles_WSJ[["Link", "Date"]].duplicated() == True].index)

## 1.3. Unscraped Articles

In [28]:
#Select all the URLs that were scraped, but are not present in the article data set (unscraped)
WSJ_unscraped_articles = URLs_WSJ[~URLs_WSJ["Link"].isin(Articles_WSJ["Link"])].reset_index(drop = True)

In [29]:
WSJ_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-06-16,Wall_Street_Journal,http://www.wsj.com/articles/phil-mickelson-and...,2016
1,2016-06-17,Wall_Street_Journal,http://www.wsj.com/articles/dont-judge-lawsuit...,2016
2,2016-06-20,Wall_Street_Journal,http://www.wsj.com/articles/notable-quotable-v...,2016
3,2016-06-21,Wall_Street_Journal,http://www.wsj.com/articles/china-premier-vows...,2016
4,2016-06-21,Wall_Street_Journal,http://www.wsj.com/articles/chinas-one-way-dea...,2016
...,...,...,...,...
35328,2020-05-15,Wall_Street_Journal,https://www.wsj.com/articles/coronavirus-pande...,2020
35329,2020-05-29,Wall_Street_Journal,https://www.wsj.com/articles/weve-hardly-gotte...,2020
35330,2020-06-01,Wall_Street_Journal,https://www.wsj.com/articles/pep-talk-for-poli...,2020
35331,2020-06-03,Wall_Street_Journal,https://www.wsj.com/articles/he-knew-george-fl...,2020


In [30]:
#Check the distribution per year
WSJ_unscraped_articles.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()
 
# -> November and December 2017 are missing and January untill April 2018 are missing. This explains the high numbers in those 
    #years 

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,2204
1,Wall_Street_Journal,2017,9974
2,Wall_Street_Journal,2018,20031
3,Wall_Street_Journal,2019,105
4,Wall_Street_Journal,2020,3019


In [None]:
#Store the unscraped URLs so they can be rescraped! 
#WSJ_unscraped_articles.to_parquet("WSJ_missed_URLs")

## 1.4. Useful Articles

In [49]:
#First store the Articles_WSJ in a new dataframe, this way we can keep the two separate.
Articles_WSJ_Clean = Articles_WSJ

In [50]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinki...,http://www.wsj.com/articles/5-fashion-resoluti...,2016-01-01,Wall_Street_Journal,2016
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a p...",http://www.wsj.com/articles/suspect-in-new-yea...,2016-01-01,Wall_Street_Journal,2016
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights m...,http://www.wsj.com/articles/defenders-of-confe...,2016-01-01,Wall_Street_Journal,2016
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t all...,http://www.wsj.com/articles/saudi-women-seek-m...,2016-01-01,Wall_Street_Journal,2016
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carr...",http://www.wsj.com/articles/saudi-beheadings-a...,2016-01-01,Wall_Street_Journal,2016
...,...,...,...,...,...,...
239988,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on A...",https://www.wsj.com/articles/eugene-ashes-sump...,2020-12-23,Wall_Street_Journal,2020
239989,Pepper...and Salt,,https://www.wsj.com/articles/pepper-and-salt-1...,2020-12-26,Wall_Street_Journal,2020
239990,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in...,https://www.wsj.com/articles/china-tells-ant-t...,2020-12-27,Wall_Street_Journal,2020
239991,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,2020-12-30,Wall_Street_Journal,2020


In [51]:
#First we remove the articles we don't have access to, this is because they belong to the WSJ Pro subscription which we don't have.
#This is typically about VC and PE, so the added value seemed limited to us.
total_no_access = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]["Text"].count()
total_unique_no_access = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"].groupby(["Date", "Link"])["Link"].count().reset_index(drop = True).sum()
print("Total NO ACCESS rows:", total_no_access, "and total unique ones:", total_unique_no_access)


remove_NO_ACCESS = ~ Articles_WSJ_Clean["Text"].str.contains("NO ACCESS")
Articles_WSJ_Clean = Articles_WSJ_Clean[remove_NO_ACCESS].reset_index(drop = True)

Total NO ACCESS rows: 34985 and total unique ones: 34985


In [52]:
Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]

Unnamed: 0,Title,Text,Link,Date,News Paper,Year


In [53]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinki...,http://www.wsj.com/articles/5-fashion-resoluti...,2016-01-01,Wall_Street_Journal,2016
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a p...",http://www.wsj.com/articles/suspect-in-new-yea...,2016-01-01,Wall_Street_Journal,2016
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights m...,http://www.wsj.com/articles/defenders-of-confe...,2016-01-01,Wall_Street_Journal,2016
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t all...,http://www.wsj.com/articles/saudi-women-seek-m...,2016-01-01,Wall_Street_Journal,2016
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carr...",http://www.wsj.com/articles/saudi-beheadings-a...,2016-01-01,Wall_Street_Journal,2016
...,...,...,...,...,...,...
205001,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on A...",https://www.wsj.com/articles/eugene-ashes-sump...,2020-12-23,Wall_Street_Journal,2020
205002,Pepper...and Salt,,https://www.wsj.com/articles/pepper-and-salt-1...,2020-12-26,Wall_Street_Journal,2020
205003,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in...,https://www.wsj.com/articles/china-tells-ant-t...,2020-12-27,Wall_Street_Journal,2020
205004,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,2020-12-30,Wall_Street_Journal,2020


In [54]:
#Do we remove the articles that contained pictures or cartoons -> we could assume the distribution of climate change is 
#equal in these articles, but we are not able to detect wether it is climate change or not. So we would have to remove them? 
#Also, this is about a neglectable portion of the articles, so not sorry wether this could potentialy effect the results


#we use an intermediate step, text cleaning of the articles happens after we removed the invalid ones
remove_blank_text_df = Articles_WSJ_Clean 
remove_blank_text_df["Text"] = remove_blank_text_df['Text'].str.strip(" ")
Articles_WSJ_Clean = Articles_WSJ_Clean.drop(remove_blank_text_df[remove_blank_text_df["Text"] == ""].index)

Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinkin...,http://www.wsj.com/articles/5-fashion-resoluti...,2016-01-01,Wall_Street_Journal,2016
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a pl...",http://www.wsj.com/articles/suspect-in-new-yea...,2016-01-01,Wall_Street_Journal,2016
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights mo...,http://www.wsj.com/articles/defenders-of-confe...,2016-01-01,Wall_Street_Journal,2016
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t allo...,http://www.wsj.com/articles/saudi-women-seek-m...,2016-01-01,Wall_Street_Journal,2016
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carri...",http://www.wsj.com/articles/saudi-beheadings-a...,2016-01-01,Wall_Street_Journal,2016
...,...,...,...,...,...,...
205000,Goldman Unit Makes New Push Into Real Estate W...,A Goldman Sachs Group Inc. unit that buys stak...,https://www.wsj.com/articles/goldman-unit-make...,2020-12-22,Wall_Street_Journal,2020
205001,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on Am...",https://www.wsj.com/articles/eugene-ashes-sump...,2020-12-23,Wall_Street_Journal,2020
205003,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in ...,https://www.wsj.com/articles/china-tells-ant-t...,2020-12-27,Wall_Street_Journal,2020
205004,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the imp...,https://www.wsj.com/articles/everybody-wants-c...,2020-12-30,Wall_Street_Journal,2020


In [55]:
#Some articles did not contain text, but only a pdf that could be downloaded. We remove these for same the reasoning why we 
#removed pictures and cartoons

Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "Download PDF"].reset_index(drop = True)
Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "See Solution Download PDF"].reset_index(drop = True)
Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "Download PDF See Solution"].reset_index(drop = True)

In [56]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinkin...,http://www.wsj.com/articles/5-fashion-resoluti...,2016-01-01,Wall_Street_Journal,2016
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a pl...",http://www.wsj.com/articles/suspect-in-new-yea...,2016-01-01,Wall_Street_Journal,2016
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights mo...,http://www.wsj.com/articles/defenders-of-confe...,2016-01-01,Wall_Street_Journal,2016
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t allo...,http://www.wsj.com/articles/saudi-women-seek-m...,2016-01-01,Wall_Street_Journal,2016
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carri...",http://www.wsj.com/articles/saudi-beheadings-a...,2016-01-01,Wall_Street_Journal,2016
...,...,...,...,...,...,...
200134,Goldman Unit Makes New Push Into Real Estate W...,A Goldman Sachs Group Inc. unit that buys stak...,https://www.wsj.com/articles/goldman-unit-make...,2020-12-22,Wall_Street_Journal,2020
200135,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on Am...",https://www.wsj.com/articles/eugene-ashes-sump...,2020-12-23,Wall_Street_Journal,2020
200136,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in ...,https://www.wsj.com/articles/china-tells-ant-t...,2020-12-27,Wall_Street_Journal,2020
200137,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the imp...,https://www.wsj.com/articles/everybody-wants-c...,2020-12-30,Wall_Street_Journal,2020


In [57]:
#Finally, after taking a look into other articles, we noticed that some articles were also unsuccesfully scraped. -> certain
#articles only contained the phrase "write to" which is always mentioned at the end of the article. This could be due to errors
#in the HTML or code. subset these and check! Is only around 3k articles. 

Articles_WSJ_Clean = Articles_WSJ_Clean[~Articles_WSJ_Clean["Text"].str.startswith('Write to')].reset_index(drop = True)

Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,5 Fashion Resolutions for 2016,NEW YEAR’S RESOLUTIONS?I don’t need no stinkin...,http://www.wsj.com/articles/5-fashion-resoluti...,2016-01-01,Wall_Street_Journal,2016
1,Suspect in New Year’s Eve Terror Plot Is Panha...,"ROCHESTER, N.Y.—An ex-convict arrested in a pl...",http://www.wsj.com/articles/suspect-in-new-yea...,2016-01-01,Wall_Street_Journal,2016
2,Defenders of Confederate Symbols Mount a Count...,ATLANTA—Not since at least the civil-rights mo...,http://www.wsj.com/articles/defenders-of-confe...,2016-01-01,Wall_Street_Journal,2016
3,Saudi Women Seek More Good Jobs,RIYADH—Women in Saudi Arabia still aren’t allo...,http://www.wsj.com/articles/saudi-women-seek-m...,2016-01-01,Wall_Street_Journal,2016
4,"Saudi Beheadings at Highest Level in 20 Years,...","DUBAI, United Arab Emirates—Saudi Arabia carri...",http://www.wsj.com/articles/saudi-beheadings-a...,2016-01-01,Wall_Street_Journal,2016
...,...,...,...,...,...,...
197009,Goldman Unit Makes New Push Into Real Estate W...,A Goldman Sachs Group Inc. unit that buys stak...,https://www.wsj.com/articles/goldman-unit-make...,2020-12-22,Wall_Street_Journal,2020
197010,‘Sylvie’s Love’ Review: Lush Movie Life Reimag...,"Eugene Ashe’s “Sylvie’s Love,” streaming on Am...",https://www.wsj.com/articles/eugene-ashes-sump...,2020-12-23,Wall_Street_Journal,2020
197011,China Tells Ant Group to Refocus on Its Paymen...,Chinese financial regulators moved to rein in ...,https://www.wsj.com/articles/china-tells-ant-t...,2020-12-27,Wall_Street_Journal,2020
197012,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the imp...,https://www.wsj.com/articles/everybody-wants-c...,2020-12-30,Wall_Street_Journal,2020


In [58]:
#Useful URLs per year
Articles_WSJ_Clean["Date"] = pd.to_datetime(Articles_WSJ_Clean['Date'], format='%d/%m/%Y')
Articles_WSJ_Clean["Year"] = Articles_WSJ_Clean['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_useful_Articles_1 = Articles_WSJ_Clean.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_useful_Articles_2 = Articles_WSJ_Clean.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [59]:
#Check WSJ_year_useful_Articles_1
WSJ_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,61253
1,Wall_Street_Journal,2017,30824
2,Wall_Street_Journal,2018,22162
3,Wall_Street_Journal,2019,35122
4,Wall_Street_Journal,2020,31808


In [60]:
#Total usefull articles
WSJ_year_useful_Articles_1["Unique_URLs_Count"].sum()

181169

In [101]:
#WSJ_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_useful_Articles_1", index = False)

In [61]:
#Compare to URLs we got in the beginning
Comparison_WSJ_1 = pd.merge(WSJ_year_URLs_1, WSJ_year_useful_Articles_1, on = ["News Paper", 'Year'])
Comparison_WSJ_1.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_1["Difference"] = Comparison_WSJ_1["Unique_URLs_Count"] - Comparison_WSJ_1["Useful_Unique_URLs_Count"]

In [62]:
Comparison_WSJ_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Wall_Street_Journal,2016,74024,61253,12771
1,Wall_Street_Journal,2017,59731,30824,28907
2,Wall_Street_Journal,2018,48922,22162,26760
3,Wall_Street_Journal,2019,36290,35122,1168
4,Wall_Street_Journal,2020,35614,31808,3806


In [63]:
#Check WSJ_year_useful_Articles_2
WSJ_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,61253
1,Wall_Street_Journal,2017,30824
2,Wall_Street_Journal,2018,22162
3,Wall_Street_Journal,2019,35117
4,Wall_Street_Journal,2020,31807


In [64]:
#Total useful articles 
WSJ_year_useful_Articles_2["Unique_URLs_Count"].sum()

181163

In [65]:
#Compare to URLs we got in the beginning
Comparison_WSJ_2 = pd.merge(WSJ_year_URLs_2, WSJ_year_useful_Articles_2, on = ["News Paper", 'Year'])
Comparison_WSJ_2.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_2["Difference"] = Comparison_WSJ_2["Unique_URLs_Count"] - Comparison_WSJ_2["Useful_Unique_URLs_Count"]

In [66]:
Comparison_WSJ_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Wall_Street_Journal,2016,74023,61253,12770
1,Wall_Street_Journal,2017,59728,30824,28904
2,Wall_Street_Journal,2018,48917,22162,26755
3,Wall_Street_Journal,2019,36283,35117,1166
4,Wall_Street_Journal,2020,35613,31807,3806


# 2. Washington Post

## 2.1. URLs

In [2]:
URLs_WP_2016 = pd.read_parquet("Washington_Post_2016_URLS")
URLs_WP_2017 = pd.read_parquet("Washington_Post_2017_URLS")
URLs_WP_2018 = pd.read_parquet("Washington_Post_2018_URLS")
URLs_WP_2019 = pd.read_parquet("Washington_Post_2019_URLS")
URLs_WP_2020 = pd.read_parquet("Washington_Post_2020_URLS")

URLs_WP = pd.concat([URLs_WP_2016, URLs_WP_2017, URLs_WP_2018, URLs_WP_2019, URLs_WP_2020])

In [3]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
13748,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
13749,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
13750,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
13751,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [4]:
#Remove the duplicates
URLs_WP = URLs_WP.drop_duplicates().reset_index(drop = True)

In [5]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
153173,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
153174,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
153175,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
153176,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [6]:
#Collect URLs per year
URLs_WP["Date"] = pd.to_datetime(URLs_WP['Date'], format='%d/%m/%Y')
URLs_WP["Year"] = URLs_WP['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_URLs_1 = URLs_WP.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_URLs_2 = URLs_WP.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [7]:
#Check WP_year_URLs_1
WP_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


In [8]:
WP_year_URLs_1["Unique_URLs_Count"].sum()

153178

In [9]:
#WP_year_URLs_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_URLs_1", index = False)

In [10]:
#Check WP_year_URLs_2
WP_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


## 2.2. Articles

In [11]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Washington Post"

# Get a list of filenames in the directory
filenames = os.listdir(directory)

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
Articles_WP = pd.concat(dfs, ignore_index=True)
Articles_WP = Articles_WP.drop("Dat", axis = 1)

In [12]:
Articles_WP

Unnamed: 0,Title,Text,Date,News Paper,Link
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/public-sa...
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/homelessl...
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,01/01/2018,Washington_Post,https://www.washingtonpost.com/business/econom...
...,...,...,...,...,...
175628,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/new-year-...
175629,National Digest: Nashville woman warned police...,,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
175630,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
175631,2021’s call to Reconstruction,One of the singularly important intellectual d...,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [13]:
#Drop all duplicates
Articles_WP = Articles_WP.drop_duplicates().reset_index(drop = True)

In [14]:
#Scraped articles per year (based on URLs)
Articles_WP["Date"] = pd.to_datetime(Articles_WP['Date'], format='%d/%m/%Y')
Articles_WP["Year"] = Articles_WP['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_Articles_1 = Articles_WP.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_Articles_2 = Articles_WP.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [15]:
#Check WP_year_Articles_1
WP_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


In [16]:
#Check WP_year_Articles_2
WP_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


## 2.3. Unscraped Articles

In [17]:
#Select all the URLs that were scraped, but are not present in the article data set (unscraped)
WP_unscraped_articles = URLs_WP[~URLs_WP["Link"].isin(Articles_WP["Link"])].reset_index(drop = True)

In [18]:
WP_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-07-22,Washington_Post,https://www.washingtonpost.com/entertainment/t...,2016
1,2016-07-24,Washington_Post,https://www.washingtonpost.com/news/morning-mi...,2016
2,2016-07-26,Washington_Post,https://www.washingtonpost.com/politics/in-tig...,2016
3,2016-07-27,Washington_Post,https://www.washingtonpost.com/news/energy-env...,2016
4,2016-07-28,Washington_Post,https://www.washingtonpost.com/local/born-befo...,2016
...,...,...,...,...
1891,2020-12-05,Washington_Post,https://www.washingtonpost.com/world/europe/ge...,2020
1892,2020-12-11,Washington_Post,https://www.washingtonpost.com/local/obituarie...,2020
1893,2020-12-17,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
1894,2020-12-23,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020


In [19]:
#Check the distribution per year
WP_unscraped_articles.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()


Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,92
1,Washington_Post,2017,642
2,Washington_Post,2018,1064
3,Washington_Post,2019,42
4,Washington_Post,2020,56


In [20]:
#Store the unscraped URLs so they can be rescraped! 
#WP_unscraped_articles.to_parquet("WP_missed_URLs")

## 2.4. Useful Articles

In [21]:
Articles_WP_Clean = Articles_WP

In [22]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
173312,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
173313,National Digest: Nashville woman warned police...,,2020-12-31,Washington_Post,https://www.washingtonpost.com/national/nation...,2020
173314,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
173315,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [23]:
#Remove empty text
remove_blank_text_df = Articles_WP_Clean 
remove_blank_text_df["Text"] = remove_blank_text_df['Text'].str.strip(" ")
Articles_WP_Clean = Articles_WP_Clean.drop(remove_blank_text_df[remove_blank_text_df["Text"] == ""].index).reset_index(drop = True)

Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
139527,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
139528,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
139529,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
139530,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [24]:
Articles_WP_Clean = Articles_WP_Clean[Articles_WP_Clean['Text'].apply(lambda x: len(x.split(" ")) != 1)].reset_index(drop = True)

In [25]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
119967,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
119968,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
119969,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
119970,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [26]:
#Remove failed articles (occure very frequent and are checked if they are failed or not)
checklist = Articles_WP_Clean["Text"].value_counts().reset_index()
checklist.columns = ["Text", "Count"]
checklist[checklist["Count"] > 3]

Articles_WP_Clean = Articles_WP_Clean[~Articles_WP_Clean["Text"].isin(checklist[checklist["Count"] > 3]["Text"])].reset_index(drop = True)

In [27]:
#Useful URLs per year
Articles_WP_Clean["Date"] = pd.to_datetime(Articles_WP_Clean['Date'], format='%d/%m/%Y')
Articles_WP_Clean["Year"] = Articles_WP_Clean['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_useful_Articles_1 = Articles_WP_Clean.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_useful_Articles_2 = Articles_WP_Clean.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [28]:
WP_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,40513
1,Washington_Post,2017,31232
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [29]:
WP_year_useful_Articles_1["Unique_URLs_Count"].sum()

116980

In [34]:
#WP_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_useful_Articles_1", index = False)

In [30]:
#Compare to URLs we got in the beginning
Comparison_WP_1 = pd.merge(WP_year_URLs_1, WP_year_useful_Articles_1, on = ["News Paper", 'Year'])
Comparison_WP_1.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_1["Difference"] = Comparison_WP_1["Unique_URLs_Count"] - Comparison_WP_1["Useful_Unique_URLs_Count"]

In [31]:
Comparison_WP_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,40513,9243
1,Washington_Post,2017,50442,31232,19210
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462


In [32]:
WP_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,40513
1,Washington_Post,2017,31232
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [98]:
WP_year_useful_Articles_2["Unique_URLs_Count"].sum()

116980

In [99]:
#Compare to URLs we got in the beginning
Comparison_WP_2 = pd.merge(WP_year_URLs_2, WP_year_useful_Articles_2, on = ["News Paper", 'Year'])
Comparison_WP_2.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_2["Difference"] = Comparison_WP_2["Unique_URLs_Count"] - Comparison_WP_2["Useful_Unique_URLs_Count"]

In [100]:
Comparison_WP_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,40513,9243
1,Washington_Post,2017,50442,31232,19210
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462
