# 0. Packages & Functions

## 0.1. Packages

In [23]:
import pandas as pd
import os

## 0.2. Functions

# 1. Wall Street Journal

## 1.1. URLs

In [24]:
# Read the five yearly WSJ URL dumps. The file names (including the "(1)" and
# " (2)" parts) are reproduced verbatim because they are looked up on disk.
URLs_WSJ_2016, URLs_WSJ_2017, URLs_WSJ_2018, URLs_WSJ_2019, URLs_WSJ_2020 = (
    pd.read_parquet(parquet_name)
    for parquet_name in (
        "Wall_Street_Journal_2016_URLS(1)",
        "Wall_Street_Journal_2017_URLS (2)",
        "Wall_Street_Journal_2018_URLS",
        "Wall_Street_Journal_2019_URLS",
        "Wall_Street_Journal_2020_URLS",
    )
)

# Stack all years into one frame with a fresh 0..n-1 index.
URLs_WSJ = pd.concat(
    [URLs_WSJ_2016, URLs_WSJ_2017, URLs_WSJ_2018, URLs_WSJ_2019, URLs_WSJ_2020],
    ignore_index=True,
)

In [25]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254877,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254878,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254879,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254880,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [26]:
#Remove the duplicate rows and the URLs that carry the 'mod=error_page' marker

#duplicates: rows that are identical in every column
URLs_WSJ = URLs_WSJ.drop_duplicates().reset_index(drop = True)

#error: match the marker as a literal substring (regex=False) and treat a missing
#link as "no marker" (na=False), so the mask is always boolean and can be negated
mask_WSJ_URLs_error = ~ URLs_WSJ['Link'].str.contains('mod=error_page', regex=False, na=False)
URLs_WSJ = URLs_WSJ[mask_WSJ_URLs_error].reset_index(drop = True)

In [27]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254576,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254577,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254578,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254579,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [28]:
#Collect URLs per year
URLs_WSJ["Date"] = pd.to_datetime(URLs_WSJ['Date'], format='%d/%m/%Y')
URLs_WSJ["Year"] = URLs_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL:
#count the distinct links per day, then add the daily counts up per year.
#The count column is selected explicitly before summing; the first positional
#argument of GroupBy.sum is `numeric_only`, not a column name.
WSJ_year_URLs_1 = (
    URLs_WSJ.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name='Unique_URLs_Count')
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_URLs_2 = URLs_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [29]:
#Check WSJ_year_URLs_1
WSJ_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74024
1,Wall_Street_Journal,2017,59731
2,Wall_Street_Journal,2018,48922
3,Wall_Street_Journal,2019,36290
4,Wall_Street_Journal,2020,35614


In [30]:
WSJ_year_URLs_1["Unique_URLs_Count"].sum()

254581

In [31]:
# Export the per-year URL counts as CSV (without the index column).
# NOTE(review): the target is an absolute, machine-specific path without a ".csv" extension.
WSJ_year_URLs_1.to_csv('C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_URLs_1', index = False)

In [32]:
#Check WSJ_year_URLs_2
WSJ_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74023
1,Wall_Street_Journal,2017,59728
2,Wall_Street_Journal,2018,48917
3,Wall_Street_Journal,2019,36283
4,Wall_Street_Journal,2020,35613


## 1.2. Articles

In [33]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Wall Street Journal"

# Get a list of filenames in the directory.
# os.listdir returns the entries in arbitrary order, so sort them to make the
# row order of the combined frame reproducible between runs.
filenames = sorted(os.listdir(directory))

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame with a fresh index
Articles_WSJ = pd.concat(dfs, ignore_index=True)

# Drop the "index" column that is stored in the scraped files
Articles_WSJ = Articles_WSJ.drop(columns="index")

In [34]:
Articles_WSJ

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Don’t Judge Lawsuit Funders by Peter Thiel,Here’s a question so straightforward that it ...,http://www.wsj.com/articles/dont-judge-lawsuit...,17/06/2016,Wall_Street_Journal,2016.0
1,Notable & Quotable: Vin Scully on Socialism,Sportscaster Vin Scully commenting on air Fri...,http://www.wsj.com/articles/notable-quotable-v...,20/06/2016,Wall_Street_Journal,2016.0
2,China Premier Vows to Lower Leverage for Nonfi...,BEIJING—Chinese Premier Li Keqiang said the g...,http://www.wsj.com/articles/china-premier-vows...,21/06/2016,Wall_Street_Journal,2016.0
3,China’s One-Way Deals Grate on Germany,SHANGHAI—If there’s a single German company t...,http://www.wsj.com/articles/chinas-one-way-dea...,21/06/2016,Wall_Street_Journal,2016.0
4,Hyperloop One Explores Building High-Speed Tra...,The hyperloop is racing into a new potential ...,http://www.wsj.com/articles/hyperloop-one-expl...,22/06/2016,Wall_Street_Journal,2016.0
...,...,...,...,...,...,...
338983,Smoother by the Dozen,NO ACCESS,https://www.wsj.com/articles/smoother-by-the-d...,29/12/2020,Wall_Street_Journal,
338984,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,30/12/2020,Wall_Street_Journal,
338985,A Place in the Sun,NO ACCESS,https://www.wsj.com/articles/a-place-in-the-su...,30/12/2020,Wall_Street_Journal,
338986,"Shift Gears, Accelerate: CIOs Reordered IT Pri...",Remote work and the acceleration of certain d...,https://www.wsj.com/articles/shift-gears-accel...,30/12/2020,Wall_Street_Journal,


In [35]:
#Drop all duplicates and URLs with error message (we only found out about the error after scraping the URLs)

#duplicates: rows that are identical in every column
Articles_WSJ = Articles_WSJ.drop_duplicates().reset_index(drop = True)

#error: match the marker as a literal substring (regex=False) and treat a missing
#link as "no marker" (na=False), so the mask is always boolean and can be negated
mask_WSJ_Articles_error = ~ Articles_WSJ['Link'].str.contains('mod=error_page', regex=False, na=False)
Articles_WSJ = Articles_WSJ[mask_WSJ_Articles_error].reset_index(drop = True)

In [36]:
Articles_WSJ["Date"]

0         17/06/2016
1         20/06/2016
2         21/06/2016
3         21/06/2016
4         22/06/2016
             ...    
303768    23/12/2020
303769    26/12/2020
303770    27/12/2020
303771    30/12/2020
303772    30/12/2020
Name: Date, Length: 303773, dtype: object

In [37]:
#Scraped articles per year (based on URLs)
Articles_WSJ["Date"] = pd.to_datetime(Articles_WSJ['Date'], format='%d/%m/%Y')
Articles_WSJ["Year"] = Articles_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL:
#count the distinct links per day, then add the daily counts up per year.
#The count column is selected explicitly before summing; the first positional
#argument of GroupBy.sum is `numeric_only`, not a column name.
WSJ_year_Articles_1 = (
    Articles_WSJ.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name='Unique_URLs_Count')
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_Articles_2 = Articles_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [38]:
#Check WSJ_year_Articles_1
WSJ_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74015
1,Wall_Street_Journal,2017,59729
2,Wall_Street_Journal,2018,48908
3,Wall_Street_Journal,2019,36290
4,Wall_Street_Journal,2020,35614


In [39]:
#Check WSJ_year_Articles_2
WSJ_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74015
1,Wall_Street_Journal,2017,59726
2,Wall_Street_Journal,2018,48903
3,Wall_Street_Journal,2019,36283
4,Wall_Street_Journal,2020,35613


In [40]:
#Articles_WSJ = Articles_WSJ.drop(Articles_WSJ[Articles_WSJ[["Link", "Date"]].duplicated() == True].index)

## 1.3. Unscraped Articles

In [41]:
#Select all the URLs that were collected, but whose Link is not present in the article data set (unscraped)
WSJ_unscraped_articles = (
    URLs_WSJ
    .loc[lambda urls: ~urls["Link"].isin(Articles_WSJ["Link"])]
    .reset_index(drop=True)
)

In [42]:
WSJ_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-06-16,Wall_Street_Journal,http://www.wsj.com/articles/phil-mickelson-and...,2016
1,2016-12-17,Wall_Street_Journal,http://www.wsj.com/articles/australias-prime-m...,2016
2,2016-12-19,Wall_Street_Journal,http://www.wsj.com/articles/sherwin-williams-v...,2016
3,2016-12-20,Wall_Street_Journal,http://www.wsj.com/articles/jefferies-group-pr...,2016
4,2016-12-22,Wall_Street_Journal,http://www.wsj.com/articles/rogue-one-points-t...,2016
5,2016-12-23,Wall_Street_Journal,http://www.wsj.com/articles/let-it-be-an-arms-...,2016
6,2016-12-26,Wall_Street_Journal,http://www.wsj.com/articles/bojs-kuroda-sees-b...,2016
7,2016-12-28,Wall_Street_Journal,http://www.wsj.com/articles/time-to-expand-acc...,2016
8,2017-11-01,Wall_Street_Journal,https://www.wsj.com/articles/fed-likely-on-hol...,2017
9,2017-11-02,Wall_Street_Journal,https://www.wsj.com/articles/new-workplace-per...,2017


In [43]:
#Check the distribution per year: distinct links per day, added up per year.
#The count column is selected explicitly before summing; the first positional
#argument of GroupBy.sum is `numeric_only`, not a column name.
(
    WSJ_unscraped_articles.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name='Unique_URLs_Count')
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# -> November and December 2017 are missing, and January until April 2018 are missing.
#    This explains the high numbers in those years.

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,8
1,Wall_Street_Journal,2017,2
2,Wall_Street_Journal,2018,14


In [44]:
#Store the unscraped URLs so they can be rescraped! 
#WSJ_unscraped_articles.to_parquet("WSJ_missed_URLs")

## 1.4. Useful Articles

In [45]:
#First store the Articles_WSJ in a new dataframe, this way we can keep the two separate.
Articles_WSJ_Clean = Articles_WSJ.copy()

In [46]:
# Order the rows by the number of whitespace-separated words in Text (longest
# first), then keep only the first row of every (Date, Link) pair, i.e. the
# longest scraped version of each article.
Articles_WSJ_Clean = (
    Articles_WSJ_Clean
    .sort_values(
        by='Text',
        key=lambda text: text.str.split().str.len(),
        ascending=False,
    )
    .drop_duplicates(subset=['Date', 'Link'], keep='first')
)

In [47]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
46538,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
71471,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen deliv...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
71776,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen deliv...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
56106,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov....",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
271721,Dealing With the Coronavirus,"We are all living in limbo now, trying to adj...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
12720,404,,https://www.wsj.com/articles/advocates-push-to...,2017-12-10,Wall_Street_Journal,2017
203166,Pepper...and Salt,,https://www.wsj.com/articles/pepper-and-salt-1...,2017-10-06,Wall_Street_Journal,2017
12721,404,,https://www.wsj.com/articles/the-first-women-i...,2017-12-10,Wall_Street_Journal,2017
12722,404,,https://www.wsj.com/articles/indonesians-prote...,2017-12-10,Wall_Street_Journal,2017


In [48]:
# Rows whose Text is exactly the "NO ACCESS" placeholder.
no_access = Articles_WSJ_Clean.loc[Articles_WSJ_Clean["Text"].eq("NO ACCESS")]

In [49]:
# Number of distinct NO ACCESS links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
no_access_year = no_access.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')
no_access_year

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2016,9893
2017,13276
2018,6636
2019,707
2020,437


In [50]:
#First we remove the articles we don't have access to, this is because they belong to the WSJ Pro subscription which we don't have.
#This is typically about VC and PE, so the added value seemed limited to us.
no_access_mask = Articles_WSJ_Clean["Text"] == "NO ACCESS"

#Number of rows whose Text is exactly "NO ACCESS"
total_no_access = no_access_mask.sum()

#Number of distinct (Date, Link) pairs among those rows. Summing the group sizes
#would just reproduce the row count, so the pairs are de-duplicated instead.
total_unique_no_access = len(Articles_WSJ_Clean.loc[no_access_mask, ["Date", "Link"]].drop_duplicates())
print("Total NO ACCESS rows:", total_no_access, "and total unique ones:", total_unique_no_access)


#NOTE(review): the removal matches "NO ACCESS" anywhere in the text, which is broader
#than the exact-equality test used for the counts above - confirm this is intended.
remove_NO_ACCESS = ~ Articles_WSJ_Clean["Text"].str.contains("NO ACCESS")
Articles_WSJ_Clean = Articles_WSJ_Clean[remove_NO_ACCESS].reset_index(drop = True)

Total NO ACCESS rows: 30953 and total unique ones: 30953


In [51]:
Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]

Unnamed: 0,Title,Text,Link,Date,News Paper,Year


In [52]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
1,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen deliv...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
2,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen deliv...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
3,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov....",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
4,Dealing With the Coronavirus,"We are all living in limbo now, trying to adj...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
223596,404,,https://www.wsj.com/articles/advocates-push-to...,2017-12-10,Wall_Street_Journal,2017
223597,Pepper...and Salt,,https://www.wsj.com/articles/pepper-and-salt-1...,2017-10-06,Wall_Street_Journal,2017
223598,404,,https://www.wsj.com/articles/the-first-women-i...,2017-12-10,Wall_Street_Journal,2017
223599,404,,https://www.wsj.com/articles/indonesians-prote...,2017-12-10,Wall_Street_Journal,2017


In [53]:
#Should we remove the articles that only contained pictures or cartoons? We could assume that climate change coverage is
#distributed the same way in these articles, but without text we cannot detect whether they are about climate change,
#so we have to remove them. This is also a negligible portion of the articles, so we are not worried that this could
#potentially affect the results.


#We use an intermediate step; text cleaning of the articles happens after we removed the invalid ones.
#NOTE: this is an alias, not a copy, so the strip below also updates Articles_WSJ_Clean["Text"] in place.
remove_blank_text_df = Articles_WSJ_Clean 
#Only leading/trailing space characters are stripped (other whitespace such as newlines is kept).
remove_blank_text_df["Text"] = remove_blank_text_df['Text'].str.strip(" ")
#Rows whose text is empty after stripping
total_blank = remove_blank_text_df[remove_blank_text_df["Text"] == ""]

In [54]:
# Number of distinct blank-text links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
total_blank.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2016,1166
2017,5980
2018,5733
2019,356
2020,547


In [55]:
# Look up the index labels of the rows with an empty Text and drop exactly those
# labels (the remaining rows keep their index labels).
blank_text_index = remove_blank_text_df.loc[remove_blank_text_df["Text"] == ""].index
Articles_WSJ_Clean = Articles_WSJ_Clean.drop(index=blank_text_index)

Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
1,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
2,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
3,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov. ...",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
4,Dealing With the Coronavirus,"We are all living in limbo now, trying to adju...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
209813,"Rep.-Elect Jimmy Gomez, Your Seat Is Getting Cold",Washington,https://www.wsj.com/articles/rep-elect-jimmy-g...,2017-06-29,Wall_Street_Journal,2017
209814,"‘Curlew River,’ ‘Dido and Aeneas’ and ‘Otello’...",Brooklyn,https://www.wsj.com/articles/curlew-river-dido...,2017-03-20,Wall_Street_Journal,2017
209815,Turkey’s Coup and Europe’s Rule of Law,Athens,http://www.wsj.com/articles/turkeys-coup-and-e...,2017-01-06,Wall_Street_Journal,2017
209816,"A Terrorist’s Big Payday, Courtesy of Trudeau",Ottawa,https://www.wsj.com/articles/a-terrorists-big-...,2017-07-16,Wall_Street_Journal,2017


In [56]:
# Rows whose Text is exactly one of the PDF-download placeholder strings.
PDF = Articles_WSJ_Clean.loc[
    Articles_WSJ_Clean["Text"].isin(
        ["Download PDF", "See Solution Download PDF", "Download PDF See Solution"]
    )
].reset_index(drop=True)

In [57]:
# Number of distinct PDF-only links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
PDF.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2020,62


In [58]:
#Some articles did not contain text, but only a PDF that could be downloaded. We remove these for the same reason we
#removed pictures and cartoons.

#Keep every row whose Text is not one of the three placeholder strings, then renumber the index.
Articles_WSJ_Clean = Articles_WSJ_Clean.loc[
    ~Articles_WSJ_Clean["Text"].isin(
        ["Download PDF", "See Solution Download PDF", "Download PDF See Solution"]
    )
].reset_index(drop=True)

In [59]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
1,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
2,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
3,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov. ...",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
4,Dealing With the Coronavirus,"We are all living in limbo now, trying to adju...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
209751,"Rep.-Elect Jimmy Gomez, Your Seat Is Getting Cold",Washington,https://www.wsj.com/articles/rep-elect-jimmy-g...,2017-06-29,Wall_Street_Journal,2017
209752,"‘Curlew River,’ ‘Dido and Aeneas’ and ‘Otello’...",Brooklyn,https://www.wsj.com/articles/curlew-river-dido...,2017-03-20,Wall_Street_Journal,2017
209753,Turkey’s Coup and Europe’s Rule of Law,Athens,http://www.wsj.com/articles/turkeys-coup-and-e...,2017-01-06,Wall_Street_Journal,2017
209754,"A Terrorist’s Big Payday, Courtesy of Trudeau",Ottawa,https://www.wsj.com/articles/a-terrorists-big-...,2017-07-16,Wall_Street_Journal,2017


In [60]:
# Rows whose Text begins with 'Write to'.
Failed_write_to = (
    Articles_WSJ_Clean
    .loc[lambda articles: articles["Text"].str.startswith('Write to')]
    .reset_index(drop=True)
)

In [61]:
# Number of distinct 'Write to'-only links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
Failed_write_to.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2017,3125
2018,1


In [62]:
#Finally, after taking a look at other articles, we noticed that some articles were also scraped unsuccessfully:
#certain articles only contained the phrase "Write to", which is always mentioned at the end of an article. This could
#be due to errors in the HTML or in our code. It concerns only around 3k articles, so we drop them here.

Articles_WSJ_Clean = (
    Articles_WSJ_Clean
    .loc[lambda articles: ~articles["Text"].str.startswith('Write to')]
    .reset_index(drop=True)
)

Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
1,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
2,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
3,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov. ...",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
4,Dealing With the Coronavirus,"We are all living in limbo now, trying to adju...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
206625,"Rep.-Elect Jimmy Gomez, Your Seat Is Getting Cold",Washington,https://www.wsj.com/articles/rep-elect-jimmy-g...,2017-06-29,Wall_Street_Journal,2017
206626,"‘Curlew River,’ ‘Dido and Aeneas’ and ‘Otello’...",Brooklyn,https://www.wsj.com/articles/curlew-river-dido...,2017-03-20,Wall_Street_Journal,2017
206627,Turkey’s Coup and Europe’s Rule of Law,Athens,http://www.wsj.com/articles/turkeys-coup-and-e...,2017-01-06,Wall_Street_Journal,2017
206628,"A Terrorist’s Big Payday, Courtesy of Trudeau",Ottawa,https://www.wsj.com/articles/a-terrorists-big-...,2017-07-16,Wall_Street_Journal,2017


In [63]:
# Rows whose Text begins with 'Company'.
company = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"].str.startswith('Company')]

# Number of distinct such links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
company.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2016,361
2017,222
2018,241
2019,1
2020,2


In [64]:
#Remove other failed articles

#Other articles from WSJ Pro, for which we only got some information -> do not include them
Articles_WSJ_Clean = (
    Articles_WSJ_Clean
    .loc[lambda articles: ~articles["Text"].str.startswith('Company')]
    .reset_index(drop=True)
)

#Remove articles that have less than 25 words = trash (currently disabled)
#Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean['Text'].str.split().apply(len) > 25].sort_values(by='Text', key=lambda x: x.str.len(), ascending=False).reset_index(drop = True)

In [65]:
#Useful URLs per year
Articles_WSJ_Clean["Date"] = pd.to_datetime(Articles_WSJ_Clean['Date'], format='%d/%m/%Y')
Articles_WSJ_Clean["Year"] = Articles_WSJ_Clean['Date'].dt.year

#Unique URLs -> unique combination of date and URL:
#count the distinct links per day, then add the daily counts up per year.
#The count column is selected explicitly before summing; the first positional
#argument of GroupBy.sum is `numeric_only`, not a column name.
WSJ_year_useful_Articles_1 = (
    Articles_WSJ_Clean.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name='Unique_URLs_Count')
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_useful_Articles_2 = Articles_WSJ_Clean.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [66]:
#Check WSJ_year_useful_Articles_1
WSJ_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,62593
1,Wall_Street_Journal,2017,37123
2,Wall_Street_Journal,2018,36297
3,Wall_Street_Journal,2019,35224
4,Wall_Street_Journal,2020,34566


In [67]:
#Total usefull articles
WSJ_year_useful_Articles_1["Unique_URLs_Count"].sum()

205803

In [68]:
#WSJ_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_useful_Articles_1", index = False)

In [69]:
#Compare the useful articles to the URLs we got in the beginning:
#give the useful counts their own column name first, join on paper and year,
#then compute how many URLs did not end up as a useful article.
Comparison_WSJ_1 = WSJ_year_URLs_1.merge(
    WSJ_year_useful_Articles_1.rename(columns={"Unique_URLs_Count": "Useful_Unique_URLs_Count"}),
    on=["News Paper", "Year"],
)
Comparison_WSJ_1["Difference"] = (
    Comparison_WSJ_1["Unique_URLs_Count"] - Comparison_WSJ_1["Useful_Unique_URLs_Count"]
)

In [70]:
Comparison_WSJ_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Wall_Street_Journal,2016,74024,62593,11431
1,Wall_Street_Journal,2017,59731,37123,22608
2,Wall_Street_Journal,2018,48922,36297,12625
3,Wall_Street_Journal,2019,36290,35224,1066
4,Wall_Street_Journal,2020,35614,34566,1048


In [71]:
#Check WSJ_year_useful_Articles_2
WSJ_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,62593
1,Wall_Street_Journal,2017,37123
2,Wall_Street_Journal,2018,36293
3,Wall_Street_Journal,2019,35219
4,Wall_Street_Journal,2020,34565


In [72]:
#Total useful articles 
WSJ_year_useful_Articles_2["Unique_URLs_Count"].sum()

205793

In [73]:
#Compare the useful articles to the URLs we got in the beginning:
#give the useful counts their own column name first, join on paper and year,
#then compute how many URLs did not end up as a useful article.
Comparison_WSJ_2 = WSJ_year_URLs_2.merge(
    WSJ_year_useful_Articles_2.rename(columns={"Unique_URLs_Count": "Useful_Unique_URLs_Count"}),
    on=["News Paper", "Year"],
)
Comparison_WSJ_2["Difference"] = (
    Comparison_WSJ_2["Unique_URLs_Count"] - Comparison_WSJ_2["Useful_Unique_URLs_Count"]
)

In [74]:
Comparison_WSJ_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Wall_Street_Journal,2016,74023,62593,11430
1,Wall_Street_Journal,2017,59728,37123,22605
2,Wall_Street_Journal,2018,48917,36293,12624
3,Wall_Street_Journal,2019,36283,35219,1064
4,Wall_Street_Journal,2020,35613,34565,1048


In [80]:
rescrape = pd.concat([no_access, total_blank])

## 1.5. Remove Duplicates

Remove the duplicates of articles that were successfully scraped more than once.

In [104]:
Articles_WSJ_Clean = Articles_WSJ_Clean.reset_index(drop = True)

In [105]:
Articles_WSJ_Clean

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Transcript: Jerome Powell Fields Questions at ...,Federal Reserve Chairman Jerome Powell testifi...,https://www.wsj.com/articles/transcript-jerome...,2018-02-28,Wall_Street_Journal,2018
1,"Transcript of Yellen’s Feb. 10, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-11,Wall_Street_Journal,2016
2,"Transcript of Yellen’s Feb. 11, 2016, Appearan...",Federal Reserve Chairwoman Janet Yellen delive...,http://www.wsj.com/articles/transcript-of-yell...,2016-02-12,Wall_Street_Journal,2016
3,Transcript: Fed Chief Nominee Jerome Powell’s ...,"The Senate Banking Committee on Tuesday, Nov. ...",https://www.wsj.com/articles/transcript-fed-ch...,2017-11-28,Wall_Street_Journal,2017
4,Dealing With the Coronavirus,"We are all living in limbo now, trying to adju...",https://www.wsj.com/articles/dealing-with-the-...,2020-05-22,Wall_Street_Journal,2020
...,...,...,...,...,...,...
205798,"Rep.-Elect Jimmy Gomez, Your Seat Is Getting Cold",Washington,https://www.wsj.com/articles/rep-elect-jimmy-g...,2017-06-29,Wall_Street_Journal,2017
205799,"‘Curlew River,’ ‘Dido and Aeneas’ and ‘Otello’...",Brooklyn,https://www.wsj.com/articles/curlew-river-dido...,2017-03-20,Wall_Street_Journal,2017
205800,Turkey’s Coup and Europe’s Rule of Law,Athens,http://www.wsj.com/articles/turkeys-coup-and-e...,2017-01-06,Wall_Street_Journal,2017
205801,"A Terrorist’s Big Payday, Courtesy of Trudeau",Ottawa,https://www.wsj.com/articles/a-terrorists-big-...,2017-07-16,Wall_Street_Journal,2017


In [81]:
# Articles with fewer than 100 whitespace-separated words, longest text (in characters) first.
# The word count uses the vectorised .str.len(), which yields NaN for a missing
# text (such rows are simply not selected) instead of raising like apply(len).
short_articles = (
    Articles_WSJ_Clean[Articles_WSJ_Clean['Text'].str.split().str.len() < 100]
    .sort_values(by='Text', key=lambda x: x.str.len(), ascending=False)
)

In [82]:
# Number of distinct short-article links per year.
# Counting distinct links directly avoids a blanket groupby("Year").sum(), which
# would also try to aggregate the string Link column.
short_articles.groupby("Year")["Link"].nunique().to_frame('Unique_URLs_Count')

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2016,1313
2017,14998
2018,2998
2019,208
2020,233


In [116]:
# Short articles from 2017. The mask is built from short_articles itself, so it
# does not depend on Articles_WSJ_Clean still having matching index labels.
short_articles[short_articles["Year"] == 2017]

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
186434,Private Colleges Court Community-College Students,Facing dire financial challenges stemming from...,http://www.wsj.com/articles/private-colleges-c...,2017-01-24,Wall_Street_Journal,2017
186755,"Rebels Hail U.S. Strike, Syria Says 16 Killed",BEIRUT—Syrian opposition supporters hailed the...,https://www.wsj.com/articles/syrian-media-say-...,2017-04-07,Wall_Street_Journal,2017
187467,Has the Movement to Raise the Minimum Wage Rea...,$9.00-$9.99 $7.25 (federal minimum) $7.50-$7.9...,https://www.wsj.com/articles/has-the-movement-...,2017-04-06,Wall_Street_Journal,2017
186405,U.S. Seeks to Stay Neutral in Iraq Conflict,Iraqi Prime Minister Haider al-Abadi ordered f...,https://www.wsj.com/articles/u-s-seeks-to-stay...,2017-10-16,Wall_Street_Journal,2017
186111,Supreme Court Nominee Neil Gorsuch Calls Trump...,WASHINGTON—Supreme Court nominee Neil Gorsuch ...,https://www.wsj.com/articles/supreme-court-nom...,2017-02-09,Wall_Street_Journal,2017
...,...,...,...,...,...,...
205797,German Pacifism Comes Under Fire,Berlin,https://www.wsj.com/articles/german-pacifism-c...,2017-06-26,Wall_Street_Journal,2017
205800,Turkey’s Coup and Europe’s Rule of Law,Athens,http://www.wsj.com/articles/turkeys-coup-and-e...,2017-01-06,Wall_Street_Journal,2017
205801,"A Terrorist’s Big Payday, Courtesy of Trudeau",Ottawa,https://www.wsj.com/articles/a-terrorists-big-...,2017-07-16,Wall_Street_Journal,2017
205790,Iran Won in Lebanon. What About Iraq?,Beirut,https://www.wsj.com/articles/iran-won-in-leban...,2017-06-26,Wall_Street_Journal,2017


In [83]:
rescrape = pd.concat([rescrape, short_articles])

In [86]:
rescrape = rescrape[["Link", "Date"]]

In [None]:
# Store the (Link, Date) pairs that need to be rescraped. Without a target path
# to_csv() only returns the CSV text and writes nothing to disk.
# NOTE(review): the file name is a placeholder relative to the working directory - adjust as needed.
rescrape.to_csv("WSJ_rescrape_URLs.csv", index=False)

# 2. Washington Post

## 2.1. URLs

In [117]:
# Read the five yearly Washington Post URL dumps.
URLs_WP_2016, URLs_WP_2017, URLs_WP_2018, URLs_WP_2019, URLs_WP_2020 = (
    pd.read_parquet(parquet_name)
    for parquet_name in (
        "Washington_Post_2016_URLS",
        "Washington_Post_2017_URLS",
        "Washington_Post_2018_URLS",
        "Washington_Post_2019_URLS",
        "Washington_Post_2020_URLS",
    )
)

# Stack all years into one frame with a fresh 0..n-1 index.
URLs_WP = pd.concat(
    [URLs_WP_2016, URLs_WP_2017, URLs_WP_2018, URLs_WP_2019, URLs_WP_2020],
    ignore_index=True,
)

In [118]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
153173,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
153174,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
153175,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
153176,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [119]:
#Remove the duplicates
URLs_WP = URLs_WP.drop_duplicates().reset_index(drop = True)

In [120]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
153173,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
153174,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
153175,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
153176,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [121]:
#Collect URLs per year
# Parse the day/month/year strings so the year can be extracted.
URLs_WP["Date"] = pd.to_datetime(URLs_WP["Date"], format="%d/%m/%Y")
URLs_WP["Year"] = URLs_WP["Date"].dt.year

#Unique URLs -> unique combination of date and URL
# Count distinct links per day first, then add the daily counts up per year.
# The count column is selected explicitly before .sum(): the first positional
# argument of GroupBy.sum is `numeric_only`, not a column name.
WP_year_URLs_1 = (
    URLs_WP.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_URLs_2 = (
    URLs_WP.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [122]:
#Check WP_year_URLs_1
WP_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


In [123]:
WP_year_URLs_1["Unique_URLs_Count"].sum()

153178

In [124]:
#WP_year_URLs_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_URLs_1", index = False)

In [125]:
#Check WP_year_URLs_2
WP_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


## 2.2. Articles

In [126]:
# Set the directory path
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Washington Post"

# Get a list of filenames in the directory.
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the concatenated frame reproducible between runs.
filenames = sorted(os.listdir(directory))

# Read each Parquet file in the directory into its own DataFrame
dfs = [pd.read_parquet(os.path.join(directory, filename)) for filename in filenames]

# Concatenate all DataFrames into a single DataFrame
Articles_WP = pd.concat(dfs, ignore_index=True)
# Drop the "Dat" column (presumably a stray column written by some scraper runs -- TODO confirm)
Articles_WP = Articles_WP.drop(columns="Dat")

In [127]:
Articles_WP

Unnamed: 0,Title,Text,Date,News Paper,Link
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/public-sa...
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/homelessl...
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,01/01/2018,Washington_Post,https://www.washingtonpost.com/business/econom...
...,...,...,...,...,...
203351,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/new-year-...
203352,National Digest: Nashville woman warned police...,,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
203353,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
203354,2021’s call to Reconstruction,One of the singularly important intellectual d...,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [128]:
# Drop rows that are identical in every column and renumber the index from 0
Articles_WP = Articles_WP.drop_duplicates(ignore_index=True)

In [129]:
#Scraped articles per year (based on URLs)
# Parse the day/month/year strings so the year can be extracted.
Articles_WP["Date"] = pd.to_datetime(Articles_WP["Date"], format="%d/%m/%Y")
Articles_WP["Year"] = Articles_WP["Date"].dt.year

#Unique URLs -> unique combination of date and URL
# Count distinct links per day first, then add the daily counts up per year.
# The count column is selected explicitly before .sum(): the first positional
# argument of GroupBy.sum is `numeric_only`, not a column name.
WP_year_Articles_1 = (
    Articles_WP.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_Articles_2 = (
    Articles_WP.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [130]:
#Check WP_year_Articles_1
WP_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


In [131]:
#Check WP_year_Articles_2
WP_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


## 2.3. Unscraped Articles

In [132]:
# Select the collected URLs whose Link does not occur in the scraped-article data set (unscraped)
link_was_scraped = URLs_WP["Link"].isin(Articles_WP["Link"])
WP_unscraped_articles = URLs_WP[~link_was_scraped].reset_index(drop=True)

In [133]:
WP_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-07-22,Washington_Post,https://www.washingtonpost.com/entertainment/t...,2016
1,2016-07-24,Washington_Post,https://www.washingtonpost.com/news/morning-mi...,2016
2,2016-07-26,Washington_Post,https://www.washingtonpost.com/politics/in-tig...,2016
3,2016-07-27,Washington_Post,https://www.washingtonpost.com/news/energy-env...,2016
4,2016-07-28,Washington_Post,https://www.washingtonpost.com/local/born-befo...,2016
...,...,...,...,...
1891,2020-12-05,Washington_Post,https://www.washingtonpost.com/world/europe/ge...,2020
1892,2020-12-11,Washington_Post,https://www.washingtonpost.com/local/obituarie...,2020
1893,2020-12-17,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
1894,2020-12-23,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020


In [134]:
#Check the distribution per year
# Distinct unscraped links per day, summed per year. The count column is
# selected explicitly before .sum(): the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name.
(
    WP_unscraped_articles.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)


Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,92
1,Washington_Post,2017,642
2,Washington_Post,2018,1064
3,Washington_Post,2019,42
4,Washington_Post,2020,56


In [135]:
#Store the unscraped URLs so they can be rescraped! 
#WP_unscraped_articles.to_parquet("WP_missed_URLs")

## 2.4. Useful Articles

In [136]:
# Start the cleaning from an independent copy: a plain assignment would only
# create a second name for the same object, so in-place edits made during
# cleaning would also change Articles_WP.
Articles_WP_Clean = Articles_WP.copy()

In [137]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
199176,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
199177,National Digest: Nashville woman warned police...,,2020-12-31,Washington_Post,https://www.washingtonpost.com/national/nation...,2020
199178,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
199179,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [138]:
#Remove empty text
# Strip leading/trailing spaces so that texts consisting only of spaces become "".
stripped_text = Articles_WP_Clean["Text"].str.strip(" ")

# Store the stripped text and keep only rows whose text is not empty.
# .assign returns a new frame, so no other name bound to the same DataFrame
# object is modified as a side effect.
Articles_WP_Clean = (
    Articles_WP_Clean
    .assign(Text=stripped_text)
    .loc[stripped_text != ""]
    .reset_index(drop=True)
)

Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
165390,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
165391,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
165392,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
165393,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [139]:
# Drop articles whose text is a single space-separated token, i.e. keep only
# texts that split on " " into more than one piece.
has_several_tokens = Articles_WP_Clean["Text"].apply(lambda text: len(text.split(" ")) > 1)
Articles_WP_Clean = Articles_WP_Clean[has_several_tokens].reset_index(drop=True)

In [140]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
145811,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
145812,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
145813,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
145814,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [141]:
#Remove failed articles (texts that occur very frequently; these were checked by hand to see whether the scrape failed)
# Build a table with one row per distinct text and how often it occurs.
checklist = Articles_WP_Clean["Text"].value_counts().reset_index()
checklist.columns = ["Text", "Count"]

# Texts that occur more than 3 times are treated as failed scrapes and removed.
repeated_texts = checklist.loc[checklist["Count"] > 3, "Text"]
is_repeated = Articles_WP_Clean["Text"].isin(repeated_texts)
Articles_WP_Clean = Articles_WP_Clean[~is_repeated].reset_index(drop=True)

In [142]:
#Useful URLs per year
# Date was already converted to datetime earlier in the notebook (before the
# cleaning steps); the conversion is repeated here so the column types this
# cell relies on are explicit.
Articles_WP_Clean["Date"] = pd.to_datetime(Articles_WP_Clean["Date"], format="%d/%m/%Y")
Articles_WP_Clean["Year"] = Articles_WP_Clean["Date"].dt.year

#Unique URLs -> unique combination of date and URL
# Count distinct links per day first, then add the daily counts up per year.
# The count column is selected explicitly before .sum(): the first positional
# argument of GroupBy.sum is `numeric_only`, not a column name.
WP_year_useful_Articles_1 = (
    Articles_WP_Clean.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_useful_Articles_2 = (
    Articles_WP_Clean.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [143]:
WP_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,47679
1,Washington_Post,2017,47546
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [144]:
WP_year_useful_Articles_1["Unique_URLs_Count"].sum()

140460

In [145]:
#WP_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_useful_Articles_1", index = False)

In [146]:
#Compare to URLs we got in the beginning
# Give the useful-article counts their own column name before merging, so the
# merged frame has one column for collected URLs and one for useful articles.
useful_counts_1 = WP_year_useful_Articles_1.rename(
    columns={"Unique_URLs_Count": "Useful_Unique_URLs_Count"}
)
Comparison_WP_1 = WP_year_URLs_1.merge(useful_counts_1, on=["News Paper", "Year"])

# URLs collected per year that did not end up as a useful article
Comparison_WP_1["Difference"] = (
    Comparison_WP_1["Unique_URLs_Count"] - Comparison_WP_1["Useful_Unique_URLs_Count"]
)

In [147]:
Comparison_WP_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,47679,2077
1,Washington_Post,2017,50442,47546,2896
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462


In [148]:
WP_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,47679
1,Washington_Post,2017,47546
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [149]:
WP_year_useful_Articles_2["Unique_URLs_Count"].sum()

140460

In [150]:
#Compare to URLs we got in the beginning
# Give the useful-article counts their own column name before merging, so the
# merged frame has one column for collected URLs and one for useful articles.
useful_counts_2 = WP_year_useful_Articles_2.rename(
    columns={"Unique_URLs_Count": "Useful_Unique_URLs_Count"}
)
Comparison_WP_2 = WP_year_URLs_2.merge(useful_counts_2, on=["News Paper", "Year"])

# URLs collected per year that did not end up as a useful article
Comparison_WP_2["Difference"] = (
    Comparison_WP_2["Unique_URLs_Count"] - Comparison_WP_2["Useful_Unique_URLs_Count"]
)

In [151]:
Comparison_WP_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,47679,2077
1,Washington_Post,2017,50442,47546,2896
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462


## 2.5. Remove Duplicates

In [None]:
#Keep the articles with the longest text
# Rows are ordered by the number of whitespace-separated words in Text, most
# words first, so the first row kept for each (Date, Link) pair is the one
# with the longest text.
# NOTE(review): the result is only displayed, not assigned -- Articles_WP_Clean
# itself still contains the duplicates after this cell. Assign the result if
# the deduplication is meant to persist.
(
    Articles_WP_Clean
    .sort_values(by="Text", key=lambda text: text.str.split().str.len(), ascending=False)
    .drop_duplicates(subset=["Date", "Link"], keep="first")
)

In [82]:
# Inspect short articles: rows whose Text has fewer than 100 whitespace-separated words, ordered by character length of Text (longest first)
Articles_WP_Clean[Articles_WP_Clean['Text'].str.split().apply(len) < 100].sort_values(by='Text', key=lambda x: x.str.len(), ascending=False)