# 0. Packages & Functions

## 0.1. Packages

In [2]:
import pandas as pd
import os

## 0.2. Functions

# 1. Wall Street Journal

## 1.1. URLs

In [None]:
# Yearly URL dumps of the Wall Street Journal, keyed by year.
# The file names are kept exactly as they are stored on disk.
wsj_url_files = {
    2016: "Wall_Street_Journal_2016_URLS(1)",
    2017: "Wall_Street_Journal_2017_URLS (2)",
    2018: "Wall_Street_Journal_2018_URLS",
    2019: "Wall_Street_Journal_2019_URLS",
    2020: "Wall_Street_Journal_2020_URLS",
}
wsj_url_frames = {year: pd.read_parquet(name) for year, name in wsj_url_files.items()}

# Keep the per-year frames available under their own names.
URLs_WSJ_2016 = wsj_url_frames[2016]
URLs_WSJ_2017 = wsj_url_frames[2017]
URLs_WSJ_2018 = wsj_url_frames[2018]
URLs_WSJ_2019 = wsj_url_frames[2019]
URLs_WSJ_2020 = wsj_url_frames[2020]

# Stack the years in chronological order and renumber the rows from 0.
URLs_WSJ = pd.concat(list(wsj_url_frames.values()), ignore_index=True)

In [None]:
URLs_WSJ

In [None]:
# Remove the duplicate rows and the URLs that carry an error marker.

# duplicates: keep the first occurrence of every fully identical row
URLs_WSJ = URLs_WSJ.drop_duplicates()
URLs_WSJ = URLs_WSJ.reset_index(drop=True)

# error: True for every link that does NOT contain 'mod=error_page'
is_error_url = URLs_WSJ["Link"].str.contains("mod=error_page")
mask_WSJ_URLs_error = ~is_error_url
URLs_WSJ = URLs_WSJ.loc[mask_WSJ_URLs_error].reset_index(drop=True)

In [None]:
URLs_WSJ

In [None]:
# Collect URLs per year.
# Parse the day/month/year strings into datetimes and derive the calendar year.
URLs_WSJ["Date"] = pd.to_datetime(URLs_WSJ["Date"], format="%d/%m/%Y")
URLs_WSJ["Year"] = URLs_WSJ["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WSJ_year_URLs_1 = (
    URLs_WSJ.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_URLs_2 = (
    URLs_WSJ.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [None]:
#Check WSJ_year_URLs_1
WSJ_year_URLs_1

In [None]:
#Check WSJ_year_URLs_2
WSJ_year_URLs_2

## 1.2. Articles

In [None]:
# Folder that holds the scraped Wall Street Journal articles (one Parquet file per entry)
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Wall Street Journal"

# Every entry of the folder is read as a Parquet file, in the order os.listdir returns them
filenames = os.listdir(directory)
dfs = [pd.read_parquet(os.path.join(directory, filename)) for filename in filenames]

# Stack all files into one frame with a fresh 0..n-1 index and drop the
# "index" column that the stored files carry
Articles_WSJ = pd.concat(dfs, ignore_index=True).drop(columns="index")

In [None]:
Articles_WSJ

In [None]:
# Drop all duplicates and the URLs with an error marker
# (the error was only discovered after the URLs had been scraped).

# duplicates: keep the first occurrence of every fully identical row
Articles_WSJ = Articles_WSJ.drop_duplicates()
Articles_WSJ = Articles_WSJ.reset_index(drop=True)

# error: True for every link that does NOT contain 'mod=error_page'
is_error_article = Articles_WSJ["Link"].str.contains("mod=error_page")
mask_WSJ_Articles_error = ~is_error_article
Articles_WSJ = Articles_WSJ.loc[mask_WSJ_Articles_error].reset_index(drop=True)

In [None]:
Articles_WSJ

In [None]:
# Scraped articles per year (based on URLs).
# Parse the day/month/year strings into datetimes and derive the calendar year.
Articles_WSJ["Date"] = pd.to_datetime(Articles_WSJ["Date"], format="%d/%m/%Y")
Articles_WSJ["Year"] = Articles_WSJ["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WSJ_year_Articles_1 = (
    Articles_WSJ.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_Articles_2 = (
    Articles_WSJ.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [None]:
#Check WSJ_year_Articles_1
WSJ_year_Articles_1

In [None]:
#Check WSJ_year_Articles_2
WSJ_year_Articles_2

In [None]:
# Count how often every (Date, Link) pair occurs in the scraped articles ...
date_link_pairs = Articles_WSJ[["Date", "Link"]]
test = date_link_pairs.value_counts().reset_index(name="Count")

# ... and keep the pairs that occur more than once
link = test.loc[test["Count"].gt(1)]

In [None]:
link

In [None]:
#Articles_WSJ = Articles_WSJ.drop(Articles_WSJ[Articles_WSJ[["Link", "Date"]].duplicated() == True].index)

## 1.3. Unscraped Articles

In [None]:
# Select all collected URLs whose link does not occur in the article data set (unscraped)
wsj_scraped_links = Articles_WSJ["Link"]
wsj_is_unscraped = ~URLs_WSJ["Link"].isin(wsj_scraped_links)
WSJ_unscraped_articles = URLs_WSJ.loc[wsj_is_unscraped].reset_index(drop=True)

In [None]:
WSJ_unscraped_articles

In [None]:
# Check the distribution per year: distinct unscraped links per day, summed per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
(
    WSJ_unscraped_articles.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# -> November and December 2017 are missing, and January until April 2018 are missing.
#    This explains the high numbers in those years.

In [None]:
#Store the unscraped URLs so they can be rescraped! 
#WSJ_unscraped_articles.to_parquet("WSJ_missed_URLs")

## 1.4. Useful Articles

In [None]:
# First store Articles_WSJ in a new dataframe, so the two can be kept separate.
# .copy() is required for that: a plain assignment would only create a second
# name for the same object, and in-place changes would show up in both.
Articles_WSJ_Clean = Articles_WSJ.copy()

In [None]:
Articles_WSJ_Clean

In [None]:
# First we remove the articles we don't have access to; this is because they belong to the
# WSJ Pro subscription, which we don't have.
# This is typically about VC and PE, so the added value seemed limited to us.
no_access_rows = Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]
total_no_access = no_access_rows["Text"].count()
# Number of distinct (Date, Link) pairs among those rows. Summing the per-group
# row counts would just reproduce total_no_access.
total_unique_no_access = len(no_access_rows.groupby(["Date", "Link"]).size())
print("Total NO ACCESS rows:", total_no_access, "and total unique ones:", total_unique_no_access)


# NOTE(review): this filter drops every text that *contains* "NO ACCESS", while the
# counts above only cover texts that are exactly "NO ACCESS" - confirm this is intended.
remove_NO_ACCESS = ~ Articles_WSJ_Clean["Text"].str.contains("NO ACCESS")
Articles_WSJ_Clean = Articles_WSJ_Clean[remove_NO_ACCESS].reset_index(drop = True)

In [None]:
Articles_WSJ_Clean[Articles_WSJ_Clean["Text"] == "NO ACCESS"]

In [None]:
Articles_WSJ_Clean

In [None]:
# Should we remove the articles that only contained pictures or cartoons? We could assume
# the share of climate change content is the same in these articles, but we are not able to
# detect whether they are about climate change or not, so we remove them.
# This is a negligible portion of the articles, so it should hardly affect the results.

# Strip leading/trailing spaces (spaces only, no other whitespace) and drop the rows whose
# text is empty afterwards. The stripped text is written to a derived frame via assign(),
# so no other frame is modified as a side effect; the kept rows are renumbered from 0.
stripped_text = Articles_WSJ_Clean["Text"].str.strip(" ")
Articles_WSJ_Clean = (
    Articles_WSJ_Clean.assign(Text=stripped_text)
    .loc[stripped_text != ""]
    .reset_index(drop=True)
)

Articles_WSJ_Clean

In [None]:
# Some articles did not contain text, but only a PDF that could be downloaded. We remove
# these for the same reason we removed pictures and cartoons.

# Texts that consist of nothing but the PDF download placeholder
pdf_only_texts = [
    "Download PDF",
    "See Solution Download PDF",
    "Download PDF See Solution",
]
is_pdf_only = Articles_WSJ_Clean["Text"].isin(pdf_only_texts)
Articles_WSJ_Clean = Articles_WSJ_Clean.loc[~is_pdf_only].reset_index(drop=True)

In [None]:
Articles_WSJ_Clean

In [None]:
# Finally, after looking into other articles, we noticed that some articles were also
# scraped unsuccessfully: certain articles only contained the phrase "Write to", which is
# always mentioned at the end of an article. This could be due to errors in the HTML or in
# the code. Subset these and check! It is only around 3k articles.

# Drop every article whose text starts with 'Write to'
starts_with_write_to = Articles_WSJ_Clean["Text"].str.startswith("Write to")
Articles_WSJ_Clean = Articles_WSJ_Clean.loc[~starts_with_write_to].reset_index(drop=True)

Articles_WSJ_Clean

In [None]:
# Useful URLs per year.
# Parse Date with the day/month/year format and derive the calendar year.
Articles_WSJ_Clean["Date"] = pd.to_datetime(Articles_WSJ_Clean["Date"], format="%d/%m/%Y")
Articles_WSJ_Clean["Year"] = Articles_WSJ_Clean["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WSJ_year_useful_Articles_1 = (
    Articles_WSJ_Clean.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WSJ_year_useful_Articles_2 = (
    Articles_WSJ_Clean.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [None]:
#Check WSJ_year_useful_Articles_1
WSJ_year_useful_Articles_1

In [None]:
#Total usefull articles
WSJ_year_useful_Articles_1["Unique_URLs_Count"].sum()

In [None]:
# Compare the useful articles to the URLs we collected in the beginning.
# Inner join on newspaper and year, then relabel the columns by position.
comparison_columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_1 = WSJ_year_URLs_1.merge(
    WSJ_year_useful_Articles_1, on=["News Paper", "Year"]
).set_axis(comparison_columns, axis=1)

# URLs that did not end up as a useful article
Comparison_WSJ_1["Difference"] = Comparison_WSJ_1["Unique_URLs_Count"].sub(
    Comparison_WSJ_1["Useful_Unique_URLs_Count"]
)

In [None]:
Comparison_WSJ_1

In [None]:
#Check WSJ_year_useful_Articles_2
WSJ_year_useful_Articles_2

In [None]:
#Total useful articles 
WSJ_year_useful_Articles_2["Unique_URLs_Count"].sum()

In [None]:
# Compare the useful articles to the URLs we collected in the beginning.
# Inner join on newspaper and year, then relabel the columns by position.
comparison_columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WSJ_2 = WSJ_year_URLs_2.merge(
    WSJ_year_useful_Articles_2, on=["News Paper", "Year"]
).set_axis(comparison_columns, axis=1)

# URLs that did not end up as a useful article
Comparison_WSJ_2["Difference"] = Comparison_WSJ_2["Unique_URLs_Count"].sub(
    Comparison_WSJ_2["Useful_Unique_URLs_Count"]
)

In [None]:
Comparison_WSJ_2

# 2. Washington Post

## 2.1. URLs

In [3]:
# Yearly URL dumps of the Washington Post, keyed by year.
wp_url_files = {
    2016: "Washington_Post_2016_URLS",
    2017: "Washington_Post_2017_URLS",
    2018: "Washington_Post_2018_URLS",
    2019: "Washington_Post_2019_URLS",
    2020: "Washington_Post_2020_URLS",
}
wp_url_frames = {year: pd.read_parquet(name) for year, name in wp_url_files.items()}

# Keep the per-year frames available under their own names.
URLs_WP_2016 = wp_url_frames[2016]
URLs_WP_2017 = wp_url_frames[2017]
URLs_WP_2018 = wp_url_frames[2018]
URLs_WP_2019 = wp_url_frames[2019]
URLs_WP_2020 = wp_url_frames[2020]

# Stack the years in chronological order; every frame keeps its own row labels here.
URLs_WP = pd.concat(list(wp_url_frames.values()))

In [4]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
13748,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
13749,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
13750,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
13751,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [5]:
# Remove the duplicates: keep the first occurrence of every fully identical row,
# then renumber the remaining rows from 0.
URLs_WP = URLs_WP.drop_duplicates()
URLs_WP = URLs_WP.reset_index(drop=True)

In [6]:
URLs_WP

Unnamed: 0,Date,News Paper,Link
0,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/capitals-i...
1,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/checkpoint...
2,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-fix/wp...
3,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/the-watch/...
4,29/02/2016,Washington_Post,https://www.washingtonpost.com/news/early-lead...
...,...,...,...
153173,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
153174,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
153175,31/12/2020,Washington_Post,https://www.washingtonpost.com/politics/secret...
153176,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [7]:
# Collect URLs per year.
# Parse the day/month/year strings into datetimes and derive the calendar year.
URLs_WP["Date"] = pd.to_datetime(URLs_WP["Date"], format="%d/%m/%Y")
URLs_WP["Year"] = URLs_WP["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WP_year_URLs_1 = (
    URLs_WP.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WP_year_URLs_2 = (
    URLs_WP.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [8]:
#Check WP_year_URLs_1
WP_year_URLs_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


In [9]:
#Check WP_year_URLs_2
WP_year_URLs_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49756
1,Washington_Post,2017,50442
2,Washington_Post,2018,28952
3,Washington_Post,2019,10275
4,Washington_Post,2020,13753


## 2.2. Articles

In [46]:
# Folder that holds the scraped Washington Post articles (one Parquet file per entry)
directory = "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Washington Post"

# Every entry of the folder is read as a Parquet file, in the order os.listdir returns them
filenames = os.listdir(directory)
dfs = [pd.read_parquet(os.path.join(directory, filename)) for filename in filenames]

# Stack all files into one frame with a fresh 0..n-1 index and drop the "Dat" column
Articles_WP = pd.concat(dfs, ignore_index=True).drop(columns="Dat")

In [47]:
Articles_WP

Unnamed: 0,Title,Text,Date,News Paper,Link
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/public-sa...
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",01/01/2018,Washington_Post,https://www.washingtonpost.com/local/homelessl...
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,01/01/2018,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,01/01/2018,Washington_Post,https://www.washingtonpost.com/business/econom...
...,...,...,...,...,...
175628,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/new-year-...
175629,National Digest: Nashville woman warned police...,,31/12/2020,Washington_Post,https://www.washingtonpost.com/national/nation...
175630,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,31/12/2020,Washington_Post,https://www.washingtonpost.com/local/trump-jan...
175631,2021’s call to Reconstruction,One of the singularly important intellectual d...,31/12/2020,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...


In [48]:
# Drop all duplicates: keep the first occurrence of every fully identical row,
# then renumber the remaining rows from 0.
Articles_WP = Articles_WP.drop_duplicates()
Articles_WP = Articles_WP.reset_index(drop=True)

In [49]:
# Scraped articles per year (based on URLs).
# Parse the day/month/year strings into datetimes and derive the calendar year.
Articles_WP["Date"] = pd.to_datetime(Articles_WP["Date"], format="%d/%m/%Y")
Articles_WP["Year"] = Articles_WP["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WP_year_Articles_1 = (
    Articles_WP.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WP_year_Articles_2 = (
    Articles_WP.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [50]:
#Check WP_year_Articles_1
WP_year_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


In [51]:
#Check WP_year_Articles_2
WP_year_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,49664
1,Washington_Post,2017,49800
2,Washington_Post,2018,27888
3,Washington_Post,2019,10233
4,Washington_Post,2020,13697


## 2.3. Unscraped Articles

In [52]:
# Select all collected URLs whose link does not occur in the article data set (unscraped)
wp_scraped_links = Articles_WP["Link"]
wp_is_unscraped = ~URLs_WP["Link"].isin(wp_scraped_links)
WP_unscraped_articles = URLs_WP.loc[wp_is_unscraped].reset_index(drop=True)

In [53]:
WP_unscraped_articles

Unnamed: 0,Date,News Paper,Link,Year
0,2016-07-22,Washington_Post,https://www.washingtonpost.com/entertainment/t...,2016
1,2016-07-24,Washington_Post,https://www.washingtonpost.com/news/morning-mi...,2016
2,2016-07-26,Washington_Post,https://www.washingtonpost.com/politics/in-tig...,2016
3,2016-07-27,Washington_Post,https://www.washingtonpost.com/news/energy-env...,2016
4,2016-07-28,Washington_Post,https://www.washingtonpost.com/local/born-befo...,2016
...,...,...,...,...
1891,2020-12-05,Washington_Post,https://www.washingtonpost.com/world/europe/ge...,2020
1892,2020-12-11,Washington_Post,https://www.washingtonpost.com/local/obituarie...,2020
1893,2020-12-17,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020
1894,2020-12-23,Washington_Post,https://www.washingtonpost.com/lifestyle/advic...,2020


In [54]:
# Check the distribution per year: distinct unscraped links per day, summed per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
(
    WP_unscraped_articles.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)


Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,92
1,Washington_Post,2017,642
2,Washington_Post,2018,1064
3,Washington_Post,2019,42
4,Washington_Post,2020,56


In [55]:
#Store the unscraped URLs so they can be rescraped! 
#WP_unscraped_articles.to_parquet("WP_missed_URLs")

## 2.4. Useful Articles

In [81]:
# Work on an independent copy. A plain assignment would only alias Articles_WP, so
# in-place edits of the clean frame (e.g. rewriting its "Text" column) would also
# change Articles_WP.
Articles_WP_Clean = Articles_WP.copy()

In [82]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
173312,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
173313,National Digest: Nashville woman warned police...,,2020-12-31,Washington_Post,https://www.washingtonpost.com/national/nation...,2020
173314,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
173315,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [84]:
# Remove empty text.
# Strip leading/trailing spaces (spaces only, no other whitespace) and drop the rows whose
# text is empty afterwards. The stripped text is written to a derived frame via assign(),
# so no other frame that shares data with Articles_WP_Clean is modified as a side effect.
stripped_text = Articles_WP_Clean["Text"].str.strip(" ")
Articles_WP_Clean = (
    Articles_WP_Clean.assign(Text=stripped_text)
    .loc[stripped_text != ""]
    .reset_index(drop=True)
)

Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
139527,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
139528,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
139529,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
139530,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [85]:
# Keep only the articles whose text contains at least one space, i.e. drop the
# texts that consist of a single token.
has_space = [" " in text for text in Articles_WP_Clean["Text"]]
Articles_WP_Clean = Articles_WP_Clean.loc[has_space].reset_index(drop=True)

In [93]:
Articles_WP_Clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Clues to how a young man went from ‘classic te...,"Sean Andrew Duncan, 21, of Sterling, will be a...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2018
1,‘Homelessly in Love’ tells stories of love ami...,"The documentary ""Homelessly in Love"" tells sto...",2018-01-01,Washington_Post,https://www.washingtonpost.com/local/homelessl...,2018
2,Looking for a star for your winter garden? Try...,Winterberry holly. (Adrian Higgins/TWP)\nComme...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
3,7 questions to ask yourself before taking on a...,(bee32/Getty Images/iStockphoto)\nComment\n0\n...,2018-01-01,Washington_Post,https://www.washingtonpost.com/lifestyle/home/...,2018
4,Congress will return to a full slate of diffic...,The GOP is starting 2018 with a lofty legislat...,2018-01-01,Washington_Post,https://www.washingtonpost.com/business/econom...,2018
...,...,...,...,...,...,...
119967,"By the end of 2020, we were supposed to have m...",In an alternate-universe version of 2020 — and...,2020-12-31,Washington_Post,https://www.washingtonpost.com/entertainment/b...,2020
119968,"Shivering in line for a swab up the nose, but ...",It was dusk on the third-to-last day of a terr...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/new-year-...,2020
119969,Jan. 6 protests multiply as Trump continues to...,Protests planned in support of President Trump...,2020-12-31,Washington_Post,https://www.washingtonpost.com/local/trump-jan...,2020
119970,2021’s call to Reconstruction,One of the singularly important intellectual d...,2020-12-31,Washington_Post,https://www.washingtonpost.com/opinions/2021s-...,2020


In [138]:
# Remove failed articles: texts that occur very frequently (these were checked by hand
# to see whether they are failed scrapes or not).

# One row per distinct text together with the number of articles that share it
checklist = Articles_WP_Clean["Text"].value_counts().rename_axis("Text").reset_index(name="Count")

# Texts shared by more than 3 articles are dropped
frequent_texts = checklist.loc[checklist["Count"] > 3, "Text"]
is_frequent_text = Articles_WP_Clean["Text"].isin(frequent_texts)
Articles_WP_Clean = Articles_WP_Clean.loc[~is_frequent_text].reset_index(drop=True)

In [141]:
# Useful URLs per year.
# Parse Date with the day/month/year format and derive the calendar year.
Articles_WP_Clean["Date"] = pd.to_datetime(Articles_WP_Clean["Date"], format="%d/%m/%Y")
Articles_WP_Clean["Year"] = Articles_WP_Clean["Date"].dt.year

# Unique URLs -> unique combination of date and URL:
# count the distinct links per day, then add the daily counts up per year.
# The count column is selected explicitly before summing, because the first
# positional argument of GroupBy.sum() is `numeric_only`, not a column name.
WP_year_useful_Articles_1 = (
    Articles_WP_Clean.groupby(["News Paper", "Year", "Date"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
    .groupby(["News Paper", "Year"])["Unique_URLs_Count"]
    .sum()
    .reset_index()
)

# Unique URLs -> one URL on two different dates counts as one URL
WP_year_useful_Articles_2 = (
    Articles_WP_Clean.groupby(["News Paper", "Year"])["Link"]
    .nunique()
    .reset_index(name="Unique_URLs_Count")
)

In [142]:
WP_year_useful_Articles_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,40513
1,Washington_Post,2017,31232
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [147]:
WP_year_useful_Articles_1["Unique_URLs_Count"].sum()

116980

In [144]:
# Compare the useful articles to the URLs we collected in the beginning.
# Inner join on newspaper and year, then relabel the columns by position.
comparison_columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_1 = WP_year_URLs_1.merge(
    WP_year_useful_Articles_1, on=["News Paper", "Year"]
).set_axis(comparison_columns, axis=1)

# URLs that did not end up as a useful article
Comparison_WP_1["Difference"] = Comparison_WP_1["Unique_URLs_Count"].sub(
    Comparison_WP_1["Useful_Unique_URLs_Count"]
)

In [145]:
Comparison_WP_1

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,40513,9243
1,Washington_Post,2017,50442,31232,19210
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462


In [148]:
WP_year_useful_Articles_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count
0,Washington_Post,2016,40513
1,Washington_Post,2017,31232
2,Washington_Post,2018,21889
3,Washington_Post,2019,10055
4,Washington_Post,2020,13291


In [149]:
WP_year_useful_Articles_2["Unique_URLs_Count"].sum()

116980

In [150]:
# Compare the useful articles to the URLs we collected in the beginning.
# Inner join on newspaper and year, then relabel the columns by position.
comparison_columns = ["News Paper", "Year", "Unique_URLs_Count", "Useful_Unique_URLs_Count"]
Comparison_WP_2 = WP_year_URLs_2.merge(
    WP_year_useful_Articles_2, on=["News Paper", "Year"]
).set_axis(comparison_columns, axis=1)

# URLs that did not end up as a useful article
Comparison_WP_2["Difference"] = Comparison_WP_2["Unique_URLs_Count"].sub(
    Comparison_WP_2["Useful_Unique_URLs_Count"]
)

In [151]:
Comparison_WP_2

Unnamed: 0,News Paper,Year,Unique_URLs_Count,Useful_Unique_URLs_Count,Difference
0,Washington_Post,2016,49756,40513,9243
1,Washington_Post,2017,50442,31232,19210
2,Washington_Post,2018,28952,21889,7063
3,Washington_Post,2019,10275,10055,220
4,Washington_Post,2020,13753,13291,462
