# 0. Packages & Functions

## 0.1. Packages

In [1]:
import pandas as pd
import os

## 0.2. Functions

# 1. Wall Street Journal

## 1.1. URLs

In [5]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Wall Street Journal/URLs"

# Get a list of filenames in the directory
filenames = os.listdir(directory)

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
URLs_WSJ = pd.concat(dfs, ignore_index=True)

In [3]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254877,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254878,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254879,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254880,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [4]:
#Remove the duplicate rows and the URLs with an error message in

#duplicates
URLs_WSJ = URLs_WSJ.drop_duplicates().reset_index(drop = True)

#error
mask_WSJ_URLs_error = ~ URLs_WSJ['Link'].str.contains('mod=error_page')
URLs_WSJ = URLs_WSJ[mask_WSJ_URLs_error].reset_index(drop = True)

In [5]:
URLs_WSJ

Unnamed: 0,Date,News Paper,Link
0,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/5-fashion-resoluti...
1,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/suspect-in-new-yea...
2,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/defenders-of-confe...
3,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-women-seek-m...
4,01/01/2016,Wall_Street_Journal,http://www.wsj.com/articles/saudi-beheadings-a...
...,...,...,...
254576,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/navalny-faces-fra...
254577,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/chinese-markets-s...
254578,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/astrazeneca-and-o...
254579,31/12/2020,Wall_Street_Journal,https://www.wsj.com/articles/covid-19-vaccine-...


In [6]:
#Collect URLs per year
URLs_WSJ["Date"] = pd.to_datetime(URLs_WSJ['Date'], format='%d/%m/%Y')
URLs_WSJ["Year"] = URLs_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_URLs_1 = URLs_WSJ.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_URLs_2 = URLs_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [7]:
#Check WSJ_year_URLs_1
WSJ_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74024
1,Wall_Street_Journal,2017,59731
2,Wall_Street_Journal,2018,48922
3,Wall_Street_Journal,2019,36290
4,Wall_Street_Journal,2020,35614


In [8]:
WSJ_year_URLs_1["Unique_URLs_Count"].sum()

254581

In [9]:
WSJ_year_URLs_1.to_csv('C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_URLs_1', index = False)

In [10]:
#Check WSJ_year_URLs_2
WSJ_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,74023
1,Wall_Street_Journal,2017,59728
2,Wall_Street_Journal,2018,48917
3,Wall_Street_Journal,2019,36283
4,Wall_Street_Journal,2020,35613


## 1.2. Articles

In [11]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Wall Street Journal"

# Get a list of filenames in the directory
filenames = os.listdir(directory)

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
Articles_WSJ = pd.concat(dfs, ignore_index=True)

Articles_WSJ = Articles_WSJ.drop("index", axis = 1)

In [12]:
Articles_WSJ

Unnamed: 0,Title,Text,Link,Date,News Paper,Year
0,Phil Mickelson and the SEC’s Legal Bogey,The government’s latest big-ticket insider tr...,http://www.wsj.com/articles/phil-mickelson-and...,2016-06-16 00:00:00,Wall_Street_Journal,2016.0
1,Hamburg Haven,,http://www.wsj.com/articles/hamburg-haven-1483...,2016-12-29 00:00:00,Wall_Street_Journal,2016.0
2,Scenes From Edinburgh,,http://www.wsj.com/articles/scenes-from-edinbu...,2016-12-29 00:00:00,Wall_Street_Journal,2016.0
3,"After a Disaster, Families Rebuild",,http://www.wsj.com/articles/after-a-disaster-f...,2016-12-29 00:00:00,Wall_Street_Journal,2016.0
4,Photos of the Day: December 30,,http://www.wsj.com/articles/photos-of-the-day-...,2016-12-30 00:00:00,Wall_Street_Journal,2016.0
...,...,...,...,...,...,...
344236,Smoother by the Dozen,NO ACCESS,https://www.wsj.com/articles/smoother-by-the-d...,29/12/2020,Wall_Street_Journal,
344237,Everybody Wants Credit for the Covid Vaccine,Graham T. Allison correctly highlights the im...,https://www.wsj.com/articles/everybody-wants-c...,30/12/2020,Wall_Street_Journal,
344238,A Place in the Sun,NO ACCESS,https://www.wsj.com/articles/a-place-in-the-su...,30/12/2020,Wall_Street_Journal,
344239,"Shift Gears, Accelerate: CIOs Reordered IT Pri...",Remote work and the acceleration of certain d...,https://www.wsj.com/articles/shift-gears-accel...,30/12/2020,Wall_Street_Journal,


In [13]:
#Drop all duplicates and URLs with error message (we only found out about the error after scraping the URLs)

#duplicates
Articles_WSJ = Articles_WSJ.drop_duplicates().reset_index(drop = True)

#error
mask_WSJ_Articles_error = ~ Articles_WSJ['Link'].str.contains('mod=error_page')
Articles_WSJ = Articles_WSJ[mask_WSJ_Articles_error].reset_index(drop = True)

In [14]:
Articles_WSJ["Date"]

0         2016-06-16 00:00:00
1         2016-12-29 00:00:00
2         2016-12-29 00:00:00
3         2016-12-29 00:00:00
4         2016-12-30 00:00:00
                 ...         
309021             23/12/2020
309022             26/12/2020
309023             27/12/2020
309024             30/12/2020
309025             30/12/2020
Name: Date, Length: 309026, dtype: object

In [15]:
#Scraped articles per year (based on URLs)
Articles_WSJ["Date"] = pd.to_datetime(Articles_WSJ['Date'], format='%d/%m/%Y')
Articles_WSJ["Year"] = Articles_WSJ['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_Articles_1 = Articles_WSJ.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_Articles_2 = Articles_WSJ.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

ValueError: time data '2016-06-16 00:00:00' does not match format '%d/%m/%Y' (match)

In [18]:
#Check WSJ_year_Articles_1
WSJ_year_Articles_1

NameError: name 'WSJ_year_Articles_1' is not defined

In [None]:
#Check WSJ_year_Articles_2
WSJ_year_Articles_2

In [None]:
#Articles_WSJ = Articles_WSJ.drop(Articles_WSJ[Articles_WSJ[["Link", "Date"]].duplicated() == True].index)

## 1.3. Unscraped Articles

In [19]:
#Select all the URLs that were scraped, but are not present in the article data set (unscraped)
WSJ_unscraped_articles = URLs_WSJ[~URLs_WSJ["Link"].isin(Articles_WSJ["Link"])].reset_index(drop = True)

In [20]:
WSJ_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-12-17,Wall_Street_Journal,http://www.wsj.com/articles/australias-prime-m...,2016
1,2016-12-19,Wall_Street_Journal,http://www.wsj.com/articles/sherwin-williams-v...,2016
2,2016-12-20,Wall_Street_Journal,http://www.wsj.com/articles/jefferies-group-pr...,2016
3,2016-12-22,Wall_Street_Journal,http://www.wsj.com/articles/rogue-one-points-t...,2016
4,2016-12-23,Wall_Street_Journal,http://www.wsj.com/articles/let-it-be-an-arms-...,2016
5,2016-12-26,Wall_Street_Journal,http://www.wsj.com/articles/bojs-kuroda-sees-b...,2016
6,2016-12-28,Wall_Street_Journal,http://www.wsj.com/articles/time-to-expand-acc...,2016
7,2017-11-01,Wall_Street_Journal,https://www.wsj.com/articles/fed-likely-on-hol...,2017
8,2017-11-02,Wall_Street_Journal,https://www.wsj.com/articles/new-workplace-per...,2017
9,2018-02-02,Wall_Street_Journal,https://www.wsj.com/articles/dont-overdiscuss-...,2018


In [21]:
#Check the distribution per year
WSJ_unscraped_articles.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()
 
# -> November and December 2017 are missing and January untill April 2018 are missing. This explains the high numbers in those 
    #years 

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Wall_Street_Journal,2016,7
1,Wall_Street_Journal,2017,2
2,Wall_Street_Journal,2018,5


In [22]:
#Store the unscraped URLs so they can be rescraped! 
#WSJ_unscraped_articles.to_parquet("WSJ_missed_URLs")

## 1.4. Useful Articles

In [23]:
#First store the Articles_WSJ in a new dataframe, this way we can keep the two separate.
Articles_WSJ_Clean = Articles_WSJ.copy()

In [None]:
Articles_WSJ_Clean = Articles_WSJ_Clean.sort_values(by='Text', key=lambda x: x.str.split().str.len(), ascending=False).drop_duplicates(subset=['Date', 'Link'], keep='first')

In [None]:
Articles_WSJ_Clean

In [None]:
no_access = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]

In [None]:
no_access_year = no_access.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()
no_access_year

In [None]:
#First we remove the articles we don't have access to, this is because they belong to the WSJ Pro subscription which we don't have.
#This is typically about VC and PE, so the added value seemed limited to us.
total_no_access = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]["Text"].count()
total_unique_no_access = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"].groupby(["Date", "Link"])["Link"].count().reset_index(drop = True).sum()
print("Total NO ACCESS rows:", total_no_access, "and total unique ones:", total_unique_no_access)


remove_NO_ACCESS = ~ Articles_WSJ_Clean["Text"].str.contains("NO ACCESS")
Articles_WSJ_Clean = Articles_WSJ_Clean[remove_NO_ACCESS].reset_index(drop = True)

In [None]:
Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]

In [None]:
Articles_WSJ_Clean

In [None]:
#Do we remove the articles that contained pictures or cartoons -> we could assume the distribution of climate change is 
#equal in these articles, but we are not able to detect wether it is climate change or not. So we would have to remove them? 
#Also, this is about a neglectable portion of the articles, so not sorry wether this could potentialy effect the results


#we use an intermediate step, text cleaning of the articles happens after we removed the invalid ones
remove_blank_text_df = Articles_WSJ_Clean 
remove_blank_text_df["Text"] = remove_blank_text_df['Text'].str.strip(" ")
total_blank = remove_blank_text_df[remove_blank_text_df["Text"] == ""]

In [None]:
total_blank.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()

In [None]:
Articles_WSJ_Clean = Articles_WSJ_Clean.drop(remove_blank_text_df[remove_blank_text_df["Text"] == ""].index)

Articles_WSJ_Clean

In [None]:
PDF = Articles_WSJ_Clean[
    (Articles_WSJ_Clean["Text"] == "Download PDF") |
    (Articles_WSJ_Clean["Text"] == "See Solution Download PDF") |
    (Articles_WSJ_Clean["Text"] == "Download PDF See Solution")
].reset_index(drop=True)

In [None]:
PDF.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()

In [None]:
#Some articles did not contain text, but only a pdf that could be downloaded. We remove these for same the reasoning why we 
#removed pictures and cartoons

Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "Download PDF"].reset_index(drop = True)
Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "See Solution Download PDF"].reset_index(drop = True)
Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] != "Download PDF See Solution"].reset_index(drop = True)

In [None]:
Articles_WSJ_Clean

In [None]:
Failed_write_to = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"].str.startswith('Write to')].reset_index(drop = True)

In [None]:
Failed_write_to.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()

In [None]:
#Finally, after taking a look into other articles, we noticed that some articles were also unsuccesfully scraped. -> certain
#articles only contained the phrase "write to" which is always mentioned at the end of the article. This could be due to errors
#in the HTML or code. subset these and check! Is only around 3k articles. 

Articles_WSJ_Clean = Articles_WSJ_Clean[~Articles_WSJ_Clean["Text"].str.startswith('Write to')].reset_index(drop = True)

Articles_WSJ_Clean

In [None]:
company = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"].str.startswith('Company')]
company.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()

In [None]:
#Remove other failed articles

#Other articles from WSJ Pro, for which we got some information, not include them
Articles_WSJ_Clean = Articles_WSJ_Clean[~Articles_WSJ_Clean["Text"].str.startswith('Company')].reset_index(drop = True)

#Remove articles that have less than 25 words = trash
#Articles_WSJ_Clean = Articles_WSJ_Clean[Articles_WSJ_Clean['Text'].str.split().apply(len) > 25].sort_values(by='Text', key=lambda x: x.str.len(), ascending=False).reset_index(drop = True)

In [None]:
#Useful URLs per year
Articles_WSJ_Clean["Date"] = pd.to_datetime(Articles_WSJ_Clean['Date'], format='%d/%m/%Y')
Articles_WSJ_Clean["Year"] = Articles_WSJ_Clean['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WSJ_year_useful_Articles_1 = Articles_WSJ_Clean.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_useful_Articles_2 = Articles_WSJ_Clean.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [None]:
#Check WSJ_year_useful_Articles_1
WSJ_year_useful_Articles_1

In [None]:
#Total usefull articles
WSJ_year_useful_Articles_1["Unique_URLs_Count"].sum()

In [None]:
#WSJ_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WSJ_year_useful_Articles_1", index = False)

In [None]:
#Compare to URLs we got in the beginning
Comparison_WSJ_1 = pd.merge(WSJ_year_URLs_1, WSJ_year_useful_Articles_1, on = ["News Paper", 'Year'])
Comparison_WSJ_1.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_1["Difference"] = Comparison_WSJ_1["Unique_URLs_Count"] - Comparison_WSJ_1["Useful_Unique_URLs_Count"]

In [None]:
Comparison_WSJ_1

In [None]:
#Check WSJ_year_useful_Articles_2
WSJ_year_useful_Articles_2

In [None]:
#Total useful articles 
WSJ_year_useful_Articles_2["Unique_URLs_Count"].sum()

In [None]:
#Compare to URLs we got in the beginning
Comparison_WSJ_2 = pd.merge(WSJ_year_URLs_2, WSJ_year_useful_Articles_2, on = ["News Paper", 'Year'])
Comparison_WSJ_2.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_2["Difference"] = Comparison_WSJ_2["Unique_URLs_Count"] - Comparison_WSJ_2["Useful_Unique_URLs_Count"]

In [None]:
Comparison_WSJ_2

In [None]:
rescrape = pd.concat([no_access, total_blank])

## 1.5. Remove Duplicates

Remove the duplicates of articles that were scraped successfully scraped more than once

In [16]:
Articles_WSJ_Clean = Articles_WSJ_Clean.reset_index(drop = True)

NameError: name 'Articles_WSJ_Clean' is not defined

In [17]:
Articles_WSJ_Clean

NameError: name 'Articles_WSJ_Clean' is not defined

In [157]:
short_articles = Articles_WSJ_Clean[Articles_WSJ_Clean['Text'].str.split().apply(len) < 100].sort_values(by='Text', key=lambda x: x.str.len(), ascending=False)

In [158]:
short_articles.groupby(["Year", "Link"])["Year"].nunique().reset_index(name='Unique_URLs_Count').groupby("Year").sum()

Unnamed: 0_level_0,Unique_URLs_Count
Year,Unnamed: 1_level_1
2016,1313
2017,14998
2018,2998
2019,208
2020,233


In [159]:
rescrape = pd.concat([rescrape, short_articles])

In [177]:
rescrape = rescrape[~rescrape["Title"].str.contains("Photo")]

In [178]:
rescrape[["Link", "Date"]].to_csv("final_rescrape_URLs", index = False)

# 2. Washington Post

## 2.1. URLs

In [2]:
URLs_WP_2016 = pd.read_parquet("Washington_Post_2016_URLS")
URLs_WP_2017 = pd.read_parquet("Washington_Post_2017_URLS")
URLs_WP_2018 = pd.read_parquet("Washington_Post_2018_URLS")
URLs_WP_2019 = pd.read_parquet("Washington_Post_2019_URLS")
URLs_WP_2020 = pd.read_parquet("Washington_Post_2020_URLS")

URLs_WP = pd.concat([URLs_WP_2016, URLs_WP_2017, URLs_WP_2018, URLs_WP_2019, URLs_WP_2020]).reset_index(drop = True)

In [3]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
153173,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
153174,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
153175,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
153176,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [4]:
#Remove the duplicates
URLs_WP = URLs_WP.drop_duplicates().reset_index(drop = True)

In [5]:
#Collect URLs per year
URLs_WP["Date"] = pd.to_datetime(URLs_WP['Date'], format='%d/%m/%Y')
URLs_WP["Year"] = URLs_WP['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_URLs_1 = URLs_WP.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_URLs_2 = URLs_WP.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [6]:
#Check WP_year_URLs_1
WP_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


In [7]:
WP_year_URLs_1["Unique_URLs_Count"].sum()

153178

In [8]:
#WP_year_URLs_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_URLs_1", index = False)

In [9]:
#Check WP_year_URLs_2
WP_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


## 2.2. Articles

In [10]:
# Set the directory path
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Washington Post"

# Get a list of filenames in the directory
filenames = os.listdir(directory)

# Loop through the filenames and read each Parquet file into a DataFrame
dfs = []
for filename in filenames:
    filepath = os.path.join(directory, filename)
    df = pd.read_parquet(filepath)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
Articles_WP = pd.concat(dfs, ignore_index=True)
Articles_WP = Articles_WP.drop("Dat", axis = 1)

Articles_WP = Articles_WP.drop("Text Length", axis = 1)
Articles_WP.loc[Articles_WP["News Paper"] == "Washington Post", "News Paper"] = "Washington_Post"

In [11]:
Articles_WP

Unnamed: 0,Title,Text,Date,News Paper,Link
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/public-sa...
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/homelessl...
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,01/01/2018,Washington_Post,https://www.washingtonpost.com/business/econom...
...,...,...,...,...,...
265133,Carolyn Hax: How can I stop my stepchild’s con...,(Nick Galifianakis for The Washington Post)\nC...,24/09/2020,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...
265134,Trump’s biggest liabilities are now his own fa...,Untangling the web of Donald Trump Jr.\n3:05\n...,12/07/2017,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
265135,D.C.-area forecast: A mix of rain and light sn...,(Jim Havard)\nAdd to your saved stories\nSave\...,06/03/2018,Washington_Post,https://www.washingtonpost.com/news/capital-we...
265136,Redskins-Giants: Notes and nuggets for Washing...,Josh Norman needs just one interception to tie...,01/01/2017,Washington_Post,https://www.washingtonpost.com/news/football-i...


In [12]:
#Drop all duplicates
Articles_WP = Articles_WP.drop_duplicates().reset_index(drop = True)

In [13]:
#Scraped articles per year (based on URLs)
Articles_WP["Date"] = pd.to_datetime(Articles_WP['Date'], format='%d/%m/%Y')
Articles_WP["Year"] = Articles_WP['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_Articles_1 = Articles_WP.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_Articles_2 = Articles_WP.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [14]:
#Check WP_year_Articles_1
WP_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


In [15]:
#Check WP_year_Articles_2
WP_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


## 2.3. Unscraped Articles

In [16]:
#Select all the URLs that were scraped, but are not present in the article data set (unscraped)
WP_unscraped_articles = URLs_WP[~URLs_WP["Link"].isin(Articles_WP["Link"])].reset_index(drop = True)

In [17]:
WP_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year


In [18]:
#Check the distribution per year
WP_unscraped_articles.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()


Unnamed: 0,News Paper,Year,Unique_URLs_Count


In [19]:
#Store the unscraped URLs so they can be rescraped! 
#WP_unscraped_articles.to_parquet("WP_missed_URLs")

## 2.4. Useful Articles

In [20]:
Articles_WP_Clean = Articles_WP

In [21]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
260226,Carolyn Hax: How can I stop my stepchild’s con...,(Nick Galifianakis for The Washington Post)\nC...,2020-09-24,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
260227,Trump’s biggest liabilities are now his own fa...,Untangling the web of Donald Trump Jr.\n3:05\n...,2017-07-12,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...,2017
260228,D.C.-area forecast: A mix of rain and light sn...,(Jim Havard)\nAdd to your saved stories\nSave\...,2018-03-06,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2018
260229,Redskins-Giants: Notes and nuggets for Washing...,Josh Norman needs just one interception to tie...,2017-01-01,Washington_Post,https://www.washingtonpost.com/news/football-i...,2017


In [22]:
#Remove empty text
remove_blank_text_df = Articles_WP_Clean 
remove_blank_text_df["Text"] = remove_blank_text_df['Text'].str.strip(" ")

blanks = remove_blank_text_df[remove_blank_text_df["Text"] == ""]

In [23]:
#Remove empty text
Articles_WP_Clean = Articles_WP_Clean.drop(remove_blank_text_df[remove_blank_text_df["Text"] == ""].index).reset_index(drop = True)

Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
226437,Carolyn Hax: How can I stop my stepchild’s con...,(Nick Galifianakis for The Washington Post)\nC...,2020-09-24,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
226438,Trump’s biggest liabilities are now his own fa...,Untangling the web of Donald Trump Jr.\n3:05\n...,2017-07-12,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...,2017
226439,D.C.-area forecast: A mix of rain and light sn...,(Jim Havard)\nAdd to your saved stories\nSave\...,2018-03-06,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2018
226440,Redskins-Giants: Notes and nuggets for Washing...,Josh Norman needs just one interception to tie...,2017-01-01,Washington_Post,https://www.washingtonpost.com/news/football-i...,2017


In [24]:
Articles_WP_Clean = Articles_WP_Clean[Articles_WP_Clean['Text'].apply(lambda x: len(x.split(" ")) != 1)].reset_index(drop = True)

In [25]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
206858,Carolyn Hax: How can I stop my stepchild’s con...,(Nick Galifianakis for The Washington Post)\nC...,2020-09-24,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
206859,Trump’s biggest liabilities are now his own fa...,Untangling the web of Donald Trump Jr.\n3:05\n...,2017-07-12,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...,2017
206860,D.C.-area forecast: A mix of rain and light sn...,(Jim Havard)\nAdd to your saved stories\nSave\...,2018-03-06,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2018
206861,Redskins-Giants: Notes and nuggets for Washing...,Josh Norman needs just one interception to tie...,2017-01-01,Washington_Post,https://www.washingtonpost.com/news/football-i...,2017


In [26]:
#Remove failed articles (occure very frequent and are checked if they are failed or not)
checklist = Articles_WP_Clean["Text"].value_counts().reset_index()
checklist.columns = ["Text", "Count"]
checklist[checklist["Count"] > 3]

Articles_WP_Clean = Articles_WP_Clean[~Articles_WP_Clean["Text"].isin(checklist[checklist["Count"] > 3]["Text"])].reset_index(drop = True)

In [27]:
#Useful URLs per year
Articles_WP_Clean["Date"] = pd.to_datetime(Articles_WP_Clean['Date'], format='%d/%m/%Y')
Articles_WP_Clean["Year"] = Articles_WP_Clean['Date'].dt.year

#Unique URLs -> unique combination of date and URL
WP_year_useful_Articles_1 = Articles_WP_Clean.groupby(["News Paper","Year", "Date"])["Link"].nunique().reset_index(name='Unique_URLs_Count').groupby(["News Paper", "Year"]).sum("Unique_URLs_Count").reset_index()

#Unique URLs -> one URL on two different dates counts as one URL
WP_year_useful_Articles_2 = Articles_WP_Clean.groupby(["News Paper","Year"])["Link"].nunique().reset_index(name='Unique_URLs_Count')

In [28]:
WP_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,47864
1,Washington_Post,2017,48374
2,Washington_Post,2018,28214
3,Washington_Post,2019,10177
4,Washington_Post,2020,13588


In [29]:
WP_year_useful_Articles_1["Unique_URLs_Count"].sum()

148217

In [30]:
#WP_year_useful_Articles_1.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/WP_year_useful_Articles_1", index = False)

In [31]:
#Compare to URLs we got in the beginning
Comparison_WP_1 = pd.merge(WP_year_URLs_1, WP_year_useful_Articles_1, on = ["News Paper", 'Year'])
Comparison_WP_1.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_1["Difference"] = Comparison_WP_1["Unique_URLs_Count"] - Comparison_WP_1["Useful_Unique_URLs_Count"]

In [32]:
Comparison_WP_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,47864,1892
1,Washington_Post,2017,50442,48374,2068
2,Washington_Post,2018,28952,28214,738
3,Washington_Post,2019,10275,10177,98
4,Washington_Post,2020,13753,13588,165


In [33]:
WP_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,47864
1,Washington_Post,2017,48374
2,Washington_Post,2018,28214
3,Washington_Post,2019,10177
4,Washington_Post,2020,13588


In [34]:
WP_year_useful_Articles_2["Unique_URLs_Count"].sum()

148217

In [35]:
#Compare to URLs we got in the beginning
Comparison_WP_2 = pd.merge(WP_year_URLs_2, WP_year_useful_Articles_2, on = ["News Paper", 'Year'])
Comparison_WP_2.columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_2["Difference"] = Comparison_WP_2["Unique_URLs_Count"] - Comparison_WP_2["Useful_Unique_URLs_Count"]

In [36]:
Comparison_WP_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,47864,1892
1,Washington_Post,2017,50442,48374,2068
2,Washington_Post,2018,28952,28214,738
3,Washington_Post,2019,10275,10177,98
4,Washington_Post,2020,13753,13588,165


## 1.5. Remove Duplicates

In [37]:
#Keep the articles with the longest text
Articles_WP_Clean = Articles_WP_Clean.sort_values(by='Text', key=lambda x: x.str.split().str.len(), ascending=False).drop_duplicates(subset=['Date', 'Link'], keep='first')

In [38]:
Articles_WP_Clean["Text Length"] = Articles_WP_Clean["Text"].str.split().str.len()

In [44]:
Final_WP = Articles_WP_Clean[Articles_WP_Clean["Text Length"] > 150].reset_index(drop = True)

In [46]:
Final_WP.drop("Text Length", axis = 1).to_parquet("C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/WP_Final_Articles.parquet")