In [1]:
#We import the necessary libraries.
import pandas as pd
from tqdm import tqdm
import pandas as pd
import json
from pandas import json_normalize
import re
from collections import Counter
import numpy as np
import sys
from scipy import stats
import matplotlib.pyplot as plt

## Reading Lagged File

In [2]:
#We read the created 'New Publications' file in script I, which excluded old articles.
df = pd.read_csv("New Publications.csv")

In [3]:
#We check for duplicates.
print(df.duplicated().value_counts())

False    17784352
dtype: int64


In [4]:
#We order the dataframe on date within each publication (this is how the file was already structured, but just to be sure).
df = df.groupby(["Newspaper", "Landing page"]).apply(lambda x: x.sort_values("Timestamp date")).reset_index(drop=True)

In [5]:
#We add the value one as a column, the difference between rows is namely 1 hour across publications.
df["Hour"] = 1
#We take the sum of this column cumulatively within each publication, thus creating the hours passed.
df["Hours passed"] = df.groupby(["Newspaper", "Landing page"])["Hour"].cumsum()
#We drop the original column.
df = df.drop(columns = ["Hour"])

In [6]:
#We create a function that calculates the percentage across columns.
def get_percentage(col1, col2): 
    return col1 / col2 * 100

In [7]:
#We also take the sum of views cumulatively within each publication.
df["Added views"] = df.groupby(["Newspaper", "Landing page"])["Views"].cumsum()
#We also calculate the total sum of views.
df["Total views"] = df.groupby(["Newspaper", "Landing page"])["Views"].transform("sum")
#We calculate the percentage between those for each row.
df["Percentage"] = get_percentage(df["Added views"], df["Total views"])

In [8]:
#We inspect the data.
df.head()

Unnamed: 0,Newspaper,Landing page,Timestamp date,Views,Publication date,Hours passed,Added views,Total views,Percentage
0,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 19:00:00,0,2021-05-26 19:00:00,1,0,3,0.0
1,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 20:00:00,0,2021-05-26 19:00:00,2,0,3,0.0
2,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 21:00:00,0,2021-05-26 19:00:00,3,0,3,0.0
3,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 22:00:00,0,2021-05-26 19:00:00,4,0,3,0.0
4,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 23:00:00,0,2021-05-26 19:00:00,5,0,3,0.0


In [9]:
#We also create a column with the time of the day, thus only keeping the hour of the datetime object.
df["Time of the day"] = df["Timestamp date"].apply(lambda x: x[11:-6])

## Shelf-life

In [10]:
#We combine our existing data with a subset of it. Namely we subset our data to only include rows in which atleast 90% of the views have been reached.
#We than group this by publication, and take the first row. The hours passed thus now indicates the amount of hours it took to reach the 90%-mark.
df = df.merge(df[df.Percentage >= 90][["Newspaper", "Landing page", "Hours passed"]].groupby(["Newspaper", "Landing page"]).nth(0)\
.reset_index().rename(columns = {"Hours passed": "Shelf-life"}), how="left", on=["Newspaper", "Landing page"])

## Half-life

In [11]:
#We do the same but now for half-life: the first moment that 50% of the views was reached.
df = df.merge(df[df.Percentage >= 50][["Newspaper", "Landing page", "Hours passed"]].groupby(["Newspaper", "Landing page"]).nth(0)\
.reset_index().rename(columns = {"Hours passed": "Half-life"}), how="left", on=["Newspaper", "Landing page"])

## Hours till Peak

In [12]:
#We look for the highest amount of views reached within one hour for each publication.
df["Peak views"] = df.groupby(["Newspaper", "Landing page"])["Views"].transform("max")

In [13]:
#In a similar fashion as for the 'life'-metrics, we look for how many hours it took to for the first time reach its maximum amount of views ever.
df = df.merge(df[df.Views == df["Peak views"]][["Newspaper", "Landing page", "Hours passed"]].groupby(["Newspaper", "Landing page"]).nth(0)\
.reset_index().rename(columns = {"Hours passed": "Hours till peak"}), how="left", on=["Newspaper", "Landing page"])

In [14]:
#We quickly inspect the created columns.
df.groupby(["Newspaper", "Landing page"]).nth(0).reset_index()[["Shelf-life", "Half-life", "Hours till peak"]].describe().astype(int)

Unnamed: 0,Shelf-life,Half-life,Hours till peak
count,46550,46550,46550
mean,168,31,15
std,324,110,85
min,1,1,1
25%,8,2,1
50%,34,6,2
75%,136,17,5
max,2115,2072,2072


## Popularity Rank

In [15]:
#We create a ranking metric, in which within each hours of each newspaper the articles are ranked on the basis of their views.
df["Popularity rank"] = df.groupby(["Newspaper", "Timestamp date"])["Views"].rank(method="min", ascending=False)

In [16]:
#We sort the data on views to see if the ranking makes sense (it does).
df.sort_values("Views", ascending=False)

Unnamed: 0,Newspaper,Landing page,Timestamp date,Views,Publication date,Hours passed,Added views,Total views,Percentage,Time of the day,Shelf-life,Half-life,Peak views,Hours till peak,Popularity rank
11901883,Noordhollandsdagblad,/cnt/dmf20210430_111098,2021-05-01 10:00:00,12532,2021-04-30 19:00:00,16,33715,122780,27.459684,10,77,49,12532,16,1.0
7958040,Leidschdagblad,/cnt/dmf20210706_50014093,2021-07-06 20:00:00,12093,2021-07-06 20:00:00,1,12093,16550,73.069486,20,4,1,12093,1,1.0
17741027,Noordhollandsdagblad,/cnt/dmf20210706_50014093,2021-07-06 21:00:00,10569,2021-07-06 20:00:00,2,13835,36983,37.409080,21,21,4,10569,2,1.0
14101276,Noordhollandsdagblad,/cnt/dmf20210516_61210176,2021-05-16 20:00:00,9123,2021-05-16 19:00:00,2,17460,112701,15.492320,20,25,15,9123,2,1.0
12128607,Noordhollandsdagblad,/cnt/dmf20210501_18794464,2021-05-01 09:00:00,8792,2021-05-01 08:00:00,2,17042,46276,36.826865,09,24,4,8792,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6100968,Leidschdagblad,/cnt/dmf20210423_60516251,2021-05-25 09:00:00,0,2021-04-23 18:00:00,760,237,263,90.114068,09,743,15,55,4,88.0
6100969,Leidschdagblad,/cnt/dmf20210423_60516251,2021-05-25 10:00:00,0,2021-04-23 18:00:00,761,237,263,90.114068,10,743,15,55,4,100.0
6100970,Leidschdagblad,/cnt/dmf20210423_60516251,2021-05-25 11:00:00,0,2021-04-23 18:00:00,762,237,263,90.114068,11,743,15,55,4,93.0
6100971,Leidschdagblad,/cnt/dmf20210423_60516251,2021-05-25 12:00:00,0,2021-04-23 18:00:00,763,237,263,90.114068,12,743,15,55,4,100.0


## Average Popularity Rank in 1st 2h

In [17]:
#Per publication we caluclate the mean score of the popularity rank over the first two hours.
df["Popularity rank in 1st 2h"] = df.groupby(["Newspaper", "Landing page"])["Popularity rank"].transform(lambda x: x.head(2).mean())

## Views in 1st 2h

In [18]:
#We do the same but than for views.
df["Views in 1st 2h"] = df.groupby(["Newspaper", "Landing page"])["Views"].transform(lambda x: x.head(2).sum())

In [19]:
#We inspect the data.
df.groupby(["Newspaper", "Landing page"]).head(2)

Unnamed: 0,Newspaper,Landing page,Timestamp date,Views,Publication date,Hours passed,Added views,Total views,Percentage,Time of the day,Shelf-life,Half-life,Peak views,Hours till peak,Popularity rank,Popularity rank in 1st 2h,Views in 1st 2h
0,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 19:00:00,0,2021-05-26 19:00:00,1,0,3,0.000000,19,38,28,1,22,61.0,64.0,0
1,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 20:00:00,0,2021-05-26 19:00:00,2,0,3,0.000000,20,38,28,1,22,67.0,64.0,0
38,Gooieneemlander,/cnt/dmf20200501_53506347,2021-07-05 18:00:00,4,2021-07-05 18:00:00,1,4,13,30.769231,18,19,3,4,1,24.0,31.0,5
39,Gooieneemlander,/cnt/dmf20200501_53506347,2021-07-05 19:00:00,1,2021-07-05 18:00:00,2,5,13,38.461538,19,19,3,4,1,38.0,31.0,5
65,Gooieneemlander,/cnt/dmf20201026_62507126,2021-04-22 10:00:00,1,2021-04-22 10:00:00,1,1,2,50.000000,10,5,1,1,1,31.0,44.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17784329,Noordhollandsdagblad,/cnt/dmf20210711_96015867,2021-07-11 18:00:00,3,2021-07-11 17:00:00,2,4,7,57.142857,18,7,2,3,2,98.0,124.0,4
17784335,Noordhollandsdagblad,/cnt/dmf20210711_98210529,2021-07-11 03:00:00,0,2021-07-11 03:00:00,1,0,2,0.000000,03,14,2,1,2,62.0,45.0,1
17784336,Noordhollandsdagblad,/cnt/dmf20210711_98210529,2021-07-11 04:00:00,1,2021-07-11 03:00:00,2,1,2,50.000000,04,14,2,1,2,28.0,45.0,1
17784349,Noordhollandsdagblad,/cnt/dmf20210711_99077945,2021-07-11 21:00:00,21,2021-07-11 21:00:00,1,21,486,4.320988,21,3,2,345,2,37.0,21.0,366


In [20]:
#We create an empty list.
popularity = []

#We take the mean views in the first two hours.
mean = np.mean(df["Views in 1st 2h"])

#We loop over the 'views in the 1st 2h'-column.
for item in tqdm(df["Views in 1st 2h"]):
    #We check if it is above or below average and append the answer to the empty list.
    if item > mean:
        popularity.append(True)
    else:
        popularity.append(False)

100%|██████████| 17784352/17784352 [00:07<00:00, 2298653.48it/s]


In [21]:
#We add the created list as a column.
df["Popularity"] = popularity
popularity = []
Counter(df.Popularity)

Counter({False: 15245555, True: 2538797})

In [22]:
#We inspect the data.
df.head()

Unnamed: 0,Newspaper,Landing page,Timestamp date,Views,Publication date,Hours passed,Added views,Total views,Percentage,Time of the day,Shelf-life,Half-life,Peak views,Hours till peak,Popularity rank,Popularity rank in 1st 2h,Views in 1st 2h,Popularity
0,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 19:00:00,0,2021-05-26 19:00:00,1,0,3,0.0,19,38,28,1,22,61.0,64.0,0,False
1,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 20:00:00,0,2021-05-26 19:00:00,2,0,3,0.0,20,38,28,1,22,67.0,64.0,0,False
2,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 21:00:00,0,2021-05-26 19:00:00,3,0,3,0.0,21,38,28,1,22,60.0,64.0,0,False
3,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 22:00:00,0,2021-05-26 19:00:00,4,0,3,0.0,22,38,28,1,22,58.0,64.0,0,False
4,Gooieneemlander,/cnt/dmf20190705_81476516,2021-05-26 23:00:00,0,2021-05-26 19:00:00,5,0,3,0.0,23,38,28,1,22,46.0,64.0,0,False


## Standardized Views per Publication

In [23]:
#We standardize the views within each publication and add it as a new column.
df["Z_Views"] = df.groupby(["Newspaper", "Landing page"])["Views"].transform(lambda x: stats.zscore(x))

## Slope Before Peak

In [24]:
#We calculate the slope till the peak. We do so by merging the dataframe with a subset of it.
#This subset only includes the passed hours till the peak is reached.
#Than the data is groupped by publication.
#The exact slope is than calculated within each publication.
#This is thus simply done by calculating the difference between the views in the first hour and the last hour (the hour of the peak).
#And dividing it by the difference in hours.
df = df.merge(df[(df["Hours passed"] <= df["Hours till peak"])]\
       .groupby(["Newspaper", "Landing page"])\
       .apply(lambda x: (x.Z_Views.iloc[-1] - x.Z_Views.iloc[0]) / (x["Hours passed"].iloc[-1] - x["Hours passed"].iloc[0])).reset_index()\
       .rename(columns = {0: "Slope before peak"}), how="left", on=["Newspaper", "Landing page"])

  .apply(lambda x: (x.Z_Views.iloc[-1] - x.Z_Views.iloc[0]) / (x["Hours passed"].iloc[-1] - x["Hours passed"].iloc[0])).reset_index()\


In [25]:
#We quickly inspect it.
df["Slope before peak"].describe()

count    1.320451e+07
mean     6.937747e+00
std      7.615506e+00
min      2.103780e-03
25%      1.034062e+00
50%      4.157347e+00
75%      1.048805e+01
max      4.362373e+01
Name: Slope before peak, dtype: float64

## Slope After Peak till Shelf-life

In [26]:
#We do the same but now starting from the peak till shelf-life.
df = df.merge(df[(df["Hours passed"] >= df["Hours till peak"]) & (df["Hours passed"] <= df["Shelf-life"])]\
       .groupby(["Newspaper", "Landing page"])\
       .apply(lambda x: (x.Z_Views.iloc[-1] - x.Z_Views.iloc[0]) / (x["Hours passed"].iloc[-1] - x["Hours passed"].iloc[0])).reset_index()\
       .rename(columns = {0: "Slope after peak"}), how="left", on=["Newspaper", "Landing page"])

  .apply(lambda x: (x.Z_Views.iloc[-1] - x.Z_Views.iloc[0]) / (x["Hours passed"].iloc[-1] - x["Hours passed"].iloc[0])).reset_index()\


In [27]:
#And we inspect it again.
df["Slope after peak"].describe()

count    1.746918e+07
mean    -3.787015e-01
std      8.665576e-01
min     -3.556453e+01
25%     -4.840733e-01
50%     -8.260033e-02
75%     -1.050703e-02
max      0.000000e+00
Name: Slope after peak, dtype: float64

## Reading Meta File

In [28]:
#We create a list of all newspapers.
newspapers = ["gooieneemlander", "haarlemsdagblad", "ijmuidercourant", "leidschdagblad", "noordhollandsdagblad"]

#We create an empty list.
df2 = []

#We loop over the newspapers.
for n in tqdm(newspapers):
    #And for each newspaper we read in its respective json file.
    with open(f"{n}.json", "r") as f:
        for line in f:
            #We load it line for line.
            line = json.loads(line)
            #We append it line for line.
            df2.append(line)

#We normalize the data.
df2 = json_normalize(df2)

100%|██████████| 5/5 [00:16<00:00,  3.27s/it]


In [29]:
def extract_columns(listofdicts):
    normalizeddict = {}
    for entry in listofdicts:
        if "name" in entry:
            normalizeddict[entry["name"]] = entry.get("content", None)
        elif "property" in entry:
            normalizeddict[entry["property"]] = entry.get("content", None)
        else:
            continue
    return normalizeddict

In [30]:
df2 = df2.join(pd.DataFrame.from_records(df2["meta"].apply(extract_columns))).drop("meta", axis=1)

In [31]:
#We inspect all the columns.
df2.columns

Index(['url', 'meta-script.interactiefPrefix', 'meta-script.brooklynapiurl',
       'meta-script.readLaterServiceUrl', 'meta-script.froomleServiceUrl',
       'meta-script.access_loginshown', 'meta-script.article_author',
       'meta-script.article_bodycharcount', 'meta-script.article_positive',
       'meta-script.article_cimpage', 'meta-script.article_cimpage_free',
       'meta-script.article_cityname', 'meta-script.article_contenttype',
       'meta-script.article_dossier', 'meta-script.article_id',
       'meta-script.article_introcharcount', 'meta-script.article_sectionid',
       'meta-script.article_section', 'meta-script.article_maintag',
       'meta-script.article_paper', 'meta-script.article_publicationdate',
       'meta-script.article_publicationtime',
       'meta-script.article_ispluslayout', 'meta-script.article_source',
       'meta-script.article_sourceoforigin', 'meta-script.article_taglist',
       'meta-script.article_title', 'meta-script.article_type',
       'm

In [32]:
#We use a regular expression to extract the landing page.
df2["Landing page"] = df2["url"].apply(lambda x: re.search(r"\/cnt\/dmf(\d+)_\d+", x.lower()).group(0))

In [33]:
#We quickly sort our dataset on the length of the landing page, to see if the streamlining went correctly,
#Thus if we do not have very short or very long landing page ids.
df2.sort_values(by="Landing page", key=lambda x: x.str.len())[["Landing page"]]

Unnamed: 0,Landing page
12448,/cnt/dmf20191229_491
120634,/cnt/dmf20191229_491
60606,/cnt/dmf20200812_110
107952,/cnt/dmf20200812_110
35769,/cnt/dmf20200812_110
...,...
73599,/cnt/dmf20190604_32241827
73600,/cnt/dmf20210331_18308698
73602,/cnt/dmf20180926_49230990
73629,/cnt/dmf20200803_19491311


In [34]:
#We also extract the newspapers from the url.
df2["Newspaper"] = df2["url"].apply(lambda x:x[x.find("www.") + len("www."):x.find(".nl")].title())

In [35]:
Counter(df2.Newspaper)

Counter({'Gooieneemlander': 24438,
         'Haarlemsdagblad': 26273,
         'Ijmuidercourant': 16433,
         'Leidschdagblad': 30476,
         'Noordhollandsdagblad': 111875})

In [56]:
df_txt = df2[["url", "meta-script.article_title", "description"]]

In [58]:
df_txt.to_csv("text_of_publications.csv")

## Trimming File

In [36]:
#We only keep the needed columns.
df2 = df2[["meta-script.article_author", "meta-script.article_bodycharcount", "meta-script.article_introcharcount", 
           "meta-script.page_sectiontrees", "meta-script.article_has_inline_media", 
           "cXenseParse:mhu-article_ispaidcontent", "Newspaper", "Landing page", "meta-script.article_publicationdate"]]

In [37]:
#We rename some columns.
df2.rename(columns = {"meta-script.article_author": "Author",
                      "meta-script.article_bodycharcount": "Body character count",
                      "meta-script.article_introcharcount": "Intro character count",
                      "meta-script.page_sectiontrees": "Section trees (aggregate)",
                      "meta-script.article_has_inline_media": "Inline media",
                      "cXenseParse:mhu-article_ispaidcontent": "Paid content",
                      "meta-script.article_publicationdate": "Publication date"}, inplace = True)

In [38]:
#We inspect the data.
df2.head()

Unnamed: 0,Author,Body character count,Intro character count,Section trees (aggregate),Inline media,Paid content,Newspaper,Landing page,Publication date
0,ANP,4888.0,302.0,nhd/sport|hd/sport|ge/sport|yc/sport|ld/sport|...,True,False,Gooieneemlander,/cnt/dmf20210612_49952447,20210612
1,Internetredactie,839.0,230.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210611_74078660,20210611
2,Susanne van Velzen,3403.0,426.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,True,Gooieneemlander,/cnt/dmf20210609_6603454,20210609
3,Internetredactie,1807.0,182.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210609_60025534,20210609
4,Leander Mascini,2146.0,262.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210608_17288936,20210608


In [39]:
#We inspect the number of duplicates.
print(df2.duplicated().value_counts())

False    208640
True        855
dtype: int64


In [40]:
#We remove the duplicates.
df2.drop_duplicates(inplace=True)

## Number of Sections and Categorical Dummies

In [41]:
#We create an empty list.
section_trees = []

#We loop over the newspapers and trees for each publication.
for newspaper, trees in tqdm(zip(df2["Newspaper"], df2["Section trees (aggregate)"]), total=len(df2)):
    #We check which newspaper it is and define its respective abbreviation. 
    #(This abbreviation comes back in section trees and is therefore needed).
    if newspaper == "Gooieneemlander":
        abbr = "ge"
    elif newspaper == "Haarlemsdagblad":
        abbr = "hd"
    elif newspaper == "Ijmuidercourant":
        abbr = "yc"
    elif newspaper == "Leidschdagblad":
        abbr = "ld"
    elif newspaper == "Noordhollandsdagblad":
        abbr = "nhd"
    
    #We create another empty list.
    clean = []
    
    #If trees is a string (thus not empty) we will loop over it.
    if type(trees) == str:
        #We will first split it on '|', the symbol which seperates the different trees/paths an article was on.
        for tree in trees.split("|"):
            #We than split up the path in its seperate layers ('nhd/sport' becomes [nhd, sport]).
            #And we check if one of those layers equals the abbreviation of the newspaper the publication was on.
            #If so we append the original tree to the clean list.
            #The purpose of doing so is to only keep the trees were on the newspaper of the respective publication.
            #And thus to exclude the trees that refer to the same article, but that originate from a different newspaper.
            if any(layer == abbr for layer in tree.split("/")):
                clean.append(tree)
        
        #We append this clean list to the overarching list.
        section_trees.append(clean)
    
    #If it is not a string we will append nothing.
    else:
        section_trees.append("")

100%|██████████| 208640/208640 [00:01<00:00, 125970.12it/s]


In [42]:
#We add this list as a column.
df2["Section trees"] = section_trees
section_trees =[]

In [43]:
#We inspect the data and can now see a cleaned version of the section trees column.
df2.head()

Unnamed: 0,Author,Body character count,Intro character count,Section trees (aggregate),Inline media,Paid content,Newspaper,Landing page,Publication date,Section trees
0,ANP,4888.0,302.0,nhd/sport|hd/sport|ge/sport|yc/sport|ld/sport|...,True,False,Gooieneemlander,/cnt/dmf20210612_49952447,20210612,"[ge/sport, ge]"
1,Internetredactie,839.0,230.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210611_74078660,20210611,"[ge/regio/gooi-en-eemland-, ge]"
2,Susanne van Velzen,3403.0,426.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,True,Gooieneemlander,/cnt/dmf20210609_6603454,20210609,"[ge/regio/gooi-en-eemland-, ge]"
3,Internetredactie,1807.0,182.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210609_60025534,20210609,"[ge/regio/gooi-en-eemland-, ge]"
4,Leander Mascini,2146.0,262.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210608_17288936,20210608,"[ge/regio/gooi-en-eemland-, ge]"


In [44]:
#We create a list of the abbreviations.
newspapers = ["nhd", "hd", "ge", "yc", "ld"]

#As well as an empty list.
frontpage = []

#We loop over the trees of the cleaned up section trees column.
for trees in tqdm(df2["Section trees"].values):
    #If one of those equals an abbreviation, it indicates that the article has been on the frontpage.
    #We append the respective answers to the list.
    if any(tree in newspapers for tree in trees):
        frontpage.append(True)
    else:
        frontpage.append(False)

100%|██████████| 208640/208640 [00:00<00:00, 567897.26it/s]


In [45]:
#We inspect the list.
Counter(frontpage)

Counter({True: 79654, False: 128986})

In [46]:
#We add it as a column.
df2["Frontpage"] = frontpage
frontpage = []

## Other Sections

In [47]:
#We create a function to create dummy variables if a publication is published on the section yes or no.
def section_dummy(section_name):
    #We define an empty list.
    dummy_list = []
    
    #We loop over the section trees of a publication.
    for trees in tqdm(df2["Section trees"].values):
        #We check if the section name is present in any of the trees.
        if any(section_name in tree for tree in trees):
            #If so we append true to the list.
            dummy_list.append(True)
        else:
            #If not we append false to it.
            dummy_list.append(False)
    
    #We check if the dummy list equals the original dataframe.
    print(len(dummy_list) == len(df2))
    #We print a counter.
    print(Counter(dummy_list))
    #We add it as a column.
    df2[section_name.title()] = dummy_list
    #We empty up the list.
    dummy_list = []

In [48]:
#We do this for all relevant sections.
section_dummy("uitgelicht")

100%|██████████| 208640/208640 [00:00<00:00, 633851.33it/s]

True
Counter({False: 203309, True: 5331})





In [49]:
section_dummy("keuze-van-de-redactie")

100%|██████████| 208640/208640 [00:00<00:00, 639188.52it/s]

True
Counter({False: 206085, True: 2555})





In [50]:
section_dummy("sport")

100%|██████████| 208640/208640 [00:00<00:00, 631850.89it/s]

True
Counter({False: 194761, True: 13879})





In [51]:
section_dummy("regio")

100%|██████████| 208640/208640 [00:00<00:00, 617375.98it/s]

True
Counter({True: 119182, False: 89458})





In [52]:
section_dummy("achtergrond")

100%|██████████| 208640/208640 [00:00<00:00, 632558.37it/s]

True
Counter({False: 204976, True: 3664})





In [53]:
section_dummy("lifestyle")

100%|██████████| 208640/208640 [00:00<00:00, 612542.02it/s]

True
Counter({False: 205589, True: 3051})





In [54]:
section_dummy("buitenland")

100%|██████████| 208640/208640 [00:00<00:00, 607581.89it/s]

True
Counter({False: 198209, True: 10431})





In [55]:
section_dummy("binnenland")

100%|██████████| 208640/208640 [00:00<00:00, 624951.95it/s]

True
Counter({False: 186637, True: 22003})





In [56]:
section_dummy("opinie-column")

100%|██████████| 208640/208640 [00:00<00:00, 619300.58it/s]

True
Counter({False: 206039, True: 2601})





In [57]:
section_dummy("cultuur")

100%|██████████| 208640/208640 [00:00<00:00, 638960.30it/s]

True
Counter({False: 207476, True: 1164})





In [58]:
#We translate all of the column names.
df2.rename(columns={"Uitgelicht": "Highlighted", "Keuze-Van-De-Redactie": "Recommended", "Regio": "Regional", 
                    "Achtergrond": "Background", "Buitenland": "Foreign",
                    "Binnenland": "Domestic", "Opinie-Column": "Opinion", "Cultuur": "Culture"}, inplace=True)

In [59]:
df2.head()

Unnamed: 0,Author,Body character count,Intro character count,Section trees (aggregate),Inline media,Paid content,Newspaper,Landing page,Publication date,Section trees,...,Highlighted,Recommended,Sport,Regional,Background,Lifestyle,Foreign,Domestic,Opinion,Culture
0,ANP,4888.0,302.0,nhd/sport|hd/sport|ge/sport|yc/sport|ld/sport|...,True,False,Gooieneemlander,/cnt/dmf20210612_49952447,20210612,"[ge/sport, ge]",...,False,False,True,False,False,False,False,False,False,False
1,Internetredactie,839.0,230.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210611_74078660,20210611,"[ge/regio/gooi-en-eemland-, ge]",...,False,False,False,True,False,False,False,False,False,False
2,Susanne van Velzen,3403.0,426.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,True,Gooieneemlander,/cnt/dmf20210609_6603454,20210609,"[ge/regio/gooi-en-eemland-, ge]",...,False,False,False,True,False,False,False,False,False,False
3,Internetredactie,1807.0,182.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210609_60025534,20210609,"[ge/regio/gooi-en-eemland-, ge]",...,False,False,False,True,False,False,False,False,False,False
4,Leander Mascini,2146.0,262.0,nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210608_17288936,20210608,"[ge/regio/gooi-en-eemland-, ge]",...,False,False,False,True,False,False,False,False,False,False


## Number of Sections (Including Frontpage)

In [60]:
#We create an empty list.
number_of_sections = []

#We loop over the sections in section trees.
for sections in tqdm(df2["Section trees"]):
    #If it is empty we append nothing.
    if sections == "":
        number_of_sections.append("")
    #Otherwise we append the length of it.
    else:
        number_of_sections.append(len(sections))

100%|██████████| 208640/208640 [00:00<00:00, 1027012.14it/s]


In [61]:
#We inspect the list.
Counter(number_of_sections)

Counter({2: 47669,
         3: 2675,
         1: 156868,
         '': 1146,
         4: 220,
         7: 5,
         6: 15,
         5: 41,
         8: 1})

In [62]:
#We add the list as a column.
df2["Number of sections (incl. frontpage)"] = number_of_sections
number_of_sections = []

## Number of Sections (Excluding Frontpage)

In [63]:
#We now do exactly the same however we exclude the sections that are solely an abbreviation.
#These sections indicate being on the frontpage, however, the number of sections highly correlate with being on the frontpage.
#We thus exclude them from the length.
number_of_sections_V2 = []

newspapers = ["nhd", "hd", "ge", "yc", "ld"]

for sections in tqdm(df2["Section trees"]):
    if sections == "":     
        number_of_sections_V2.append("")
    else:
        #Thus if the section is in newspapers (the list of abbreviations, the proxy for frontpage), it is excluded from the count.
        number_of_sections_V2.append(len([x for x in sections if x not in newspapers]))

100%|██████████| 208640/208640 [00:00<00:00, 570719.85it/s]


In [64]:
#We inspect the list.
#Ofcourse zero now corresponds to being on the frontpage and thus it needs to be made missing in the analysis without controls.
Counter(number_of_sections_V2)

Counter({1: 171957,
         2: 4290,
         0: 30865,
         '': 1146,
         3: 310,
         6: 8,
         5: 21,
         4: 42,
         7: 1})

In [65]:
#We add the list as a column.
df2["Number of sections (excl. frontpage)"] = number_of_sections_V2
number_of_sections_V2 = []

## Author / Byline

In [66]:
#We print the most common authors.
print((Counter(df2.Author).most_common(100)))

[('ANP Producties', 32413), ('', 14403), ('Internetredactie', 8736), ('Van onze verslaggever', 3512), ('ANP', 2734), ('Eddie de Paepe', 1780), ('Bart Boele', 1572), ('Martijn Gijsbertsen', 1486), ('Leander Mascini', 1349), ('Pieter van Hove', 1312), ('Annemarie de Jong', 1302), ('Loman Leefmans', 1274), ('Susanne van Velzen', 1256), ('Eric Molenaar', 1247), ('Wessel Mekking', 1244), ('Ed Brouwer', 1210), ('Kees de Boer', 1181), ('Tanja Koopen', 1154), (nan, 1146), ('Eric Lorier', 1109), ('Arnold Aarts', 1103), ('Leontien van Engelen', 1101), ('Marieta Kroft', 1096), ('Richard Stekelenburg', 1078), ('Henk Runhaar', 1066), ('Arthur de Mijttenaere', 1058), ('Cees Beemster', 1040), ('Robert Jan van der Woud', 1017), ('Marten Visser', 1013), ('Marlies Vording', 993), ('Hans van Keken', 985), ('Delano Weltevreden', 976), ('Peter Schat', 949), ('Paul de Vlieger', 947), ('Casper Duin', 946), ('Joyce Huibers', 930), ('Milo Lambers', 923), ('Frans van den Berg', 913), ('Redacteur West-Friesland'

In [67]:
#We create an empty list.
authors = []

#We loop over the authors column.
for item in tqdm(df2.Author.values):
    #We use several keywords to unravel what kind of author it was.
    if "redac" in str(item):
        authors.append("editor(s)")
    elif "ANP" in str(item):
        authors.append("ANP (news agency)")
    elif "verslag" in str(item):
        authors.append("our reporter(s)")
    elif item == "":
        authors.append("unknown")
    else:
        authors.append("individual reporter(s)")

100%|██████████| 208640/208640 [00:00<00:00, 574778.43it/s]


In [68]:
#We add it as a column.
df2["Author"] = authors
authors = []
#We inspect the column.
Counter(df2.Author)

Counter({'ANP (news agency)': 35191,
         'editor(s)': 9389,
         'individual reporter(s)': 145056,
         'unknown': 14403,
         'our reporter(s)': 4601})

## Character Count

In [69]:
#We sum the body and intro character count together.
df2["Character count"] = df2["Body character count"] + df2["Intro character count"]

In [70]:
#We drop the seperate columns.
df2.drop(columns=["Body character count", "Intro character count"], inplace=True)

In [71]:
#We inspect the data.
df2.head()

Unnamed: 0,Author,Section trees (aggregate),Inline media,Paid content,Newspaper,Landing page,Publication date,Section trees,Frontpage,Highlighted,...,Regional,Background,Lifestyle,Foreign,Domestic,Opinion,Culture,Number of sections (incl. frontpage),Number of sections (excl. frontpage),Character count
0,ANP (news agency),nhd/sport|hd/sport|ge/sport|yc/sport|ld/sport|...,True,False,Gooieneemlander,/cnt/dmf20210612_49952447,20210612,"[ge/sport, ge]",True,False,...,False,False,False,False,False,False,False,2,1,5190.0
1,editor(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210611_74078660,20210611,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,True,False,False,False,False,False,False,2,1,1069.0
2,individual reporter(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,True,Gooieneemlander,/cnt/dmf20210609_6603454,20210609,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,True,False,False,False,False,False,False,2,1,3829.0
3,editor(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210609_60025534,20210609,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,True,False,False,False,False,False,False,2,1,1989.0
4,individual reporter(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210608_17288936,20210608,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,True,False,False,False,False,False,False,2,1,2408.0


## Day of the Week

In [72]:
#We create an empty list.
day_of_the_week = []

#We loop over the publication dates.
for item in tqdm(df2["Publication date"].values):
    #We turn it into a timstamp.
    datetime_object = pd.Timestamp(item)
    #We extract the name of the day.
    day = datetime_object.day_name()
    #We append it to our list.
    day_of_the_week.append(day)

100%|██████████| 208640/208640 [00:02<00:00, 71704.20it/s]


In [73]:
#We add it as a column.
df2["Day of the week"] = day_of_the_week
day_of_the_week = []

## Inline Media

In [74]:
#We turn the inline media column into a boolean value.
df2["Inline media"] = df2["Inline media"].astype("category").cat.rename_categories({"True": True, "False": False})

## Paid Content

In [75]:
#We do the same for paid content.
df2["Paid content"] = df2["Paid content"].astype("category").cat.rename_categories({"true": True, "false": False})

In [76]:
#We inspect the data.
df2.head()

Unnamed: 0,Author,Section trees (aggregate),Inline media,Paid content,Newspaper,Landing page,Publication date,Section trees,Frontpage,Highlighted,...,Background,Lifestyle,Foreign,Domestic,Opinion,Culture,Number of sections (incl. frontpage),Number of sections (excl. frontpage),Character count,Day of the week
0,ANP (news agency),nhd/sport|hd/sport|ge/sport|yc/sport|ld/sport|...,True,False,Gooieneemlander,/cnt/dmf20210612_49952447,20210612,"[ge/sport, ge]",True,False,...,False,False,False,False,False,False,2,1,5190.0,Saturday
1,editor(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210611_74078660,20210611,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,False,False,False,False,False,False,2,1,1069.0,Friday
2,individual reporter(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,True,Gooieneemlander,/cnt/dmf20210609_6603454,20210609,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,False,False,False,False,False,False,2,1,3829.0,Wednesday
3,editor(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210609_60025534,20210609,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,False,False,False,False,False,False,2,1,1989.0,Wednesday
4,individual reporter(s),nhd/regio/gooi|ge/regio/gooi-en-eemland-|ge,True,False,Gooieneemlander,/cnt/dmf20210608_17288936,20210608,"[ge/regio/gooi-en-eemland-, ge]",True,False,...,False,False,False,False,False,False,2,1,2408.0,Tuesday


In [77]:
#We sort the columns by their names.
df2.columns.sort_values()

Index(['Author', 'Background', 'Character count', 'Culture', 'Day of the week',
       'Domestic', 'Foreign', 'Frontpage', 'Highlighted', 'Inline media',
       'Landing page', 'Lifestyle', 'Newspaper',
       'Number of sections (excl. frontpage)',
       'Number of sections (incl. frontpage)', 'Opinion', 'Paid content',
       'Publication date', 'Recommended', 'Regional', 'Section trees',
       'Section trees (aggregate)', 'Sport'],
      dtype='object')

In [78]:
#We inspect the columns of the lagged dataset.
df.columns.sort_values()

Index(['Added views', 'Half-life', 'Hours passed', 'Hours till peak',
       'Landing page', 'Newspaper', 'Peak views', 'Percentage', 'Popularity',
       'Popularity rank', 'Popularity rank in 1st 2h', 'Publication date',
       'Shelf-life', 'Slope after peak', 'Slope before peak',
       'Time of the day', 'Timestamp date', 'Total views', 'Views',
       'Views in 1st 2h', 'Z_Views'],
      dtype='object')

## Merging CSV with JSON files.

In [79]:
#We merge the files (excluding publication date, which is in there twice).
df3 = df.merge(df2.loc[:, ~df2.columns.isin(["Publication date"])], how="left", on=["Newspaper", "Landing page"])

In [80]:
df = []
df2 = []

In [81]:
#We sort the columns by their names.
df3 = df3.reindex(sorted(df3.columns), axis=1)

In [82]:
#We look at all the columns.
df3.columns

Index(['Added views', 'Author', 'Background', 'Character count', 'Culture',
       'Day of the week', 'Domestic', 'Foreign', 'Frontpage', 'Half-life',
       'Highlighted', 'Hours passed', 'Hours till peak', 'Inline media',
       'Landing page', 'Lifestyle', 'Newspaper',
       'Number of sections (excl. frontpage)',
       'Number of sections (incl. frontpage)', 'Opinion', 'Paid content',
       'Peak views', 'Percentage', 'Popularity', 'Popularity rank',
       'Popularity rank in 1st 2h', 'Publication date', 'Recommended',
       'Regional', 'Section trees', 'Section trees (aggregate)', 'Shelf-life',
       'Slope after peak', 'Slope before peak', 'Sport', 'Time of the day',
       'Timestamp date', 'Total views', 'Views', 'Views in 1st 2h', 'Z_Views'],
      dtype='object')

In [83]:
#We remove those that we do not longer need.
df3.drop(columns=["Added views", "Peak views", "Percentage", "Section trees (aggregate)", "Z_Views"], inplace=True)

In [84]:
#We inspect it one more time.
df3.head()

Unnamed: 0,Author,Background,Character count,Culture,Day of the week,Domestic,Foreign,Frontpage,Half-life,Highlighted,...,Section trees,Shelf-life,Slope after peak,Slope before peak,Sport,Time of the day,Timestamp date,Total views,Views,Views in 1st 2h
0,unknown,False,3286.0,False,Wednesday,False,False,True,28,False,...,"[ge/extra/opinie-column, ge]",38,0.0,0.176591,False,19,2021-05-26 19:00:00,3,0,0
1,unknown,False,3286.0,False,Wednesday,False,False,True,28,False,...,"[ge/extra/opinie-column, ge]",38,0.0,0.176591,False,20,2021-05-26 20:00:00,3,0,0
2,unknown,False,3286.0,False,Wednesday,False,False,True,28,False,...,"[ge/extra/opinie-column, ge]",38,0.0,0.176591,False,21,2021-05-26 21:00:00,3,0,0
3,unknown,False,3286.0,False,Wednesday,False,False,True,28,False,...,"[ge/extra/opinie-column, ge]",38,0.0,0.176591,False,22,2021-05-26 22:00:00,3,0,0
4,unknown,False,3286.0,False,Wednesday,False,False,True,28,False,...,"[ge/extra/opinie-column, ge]",38,0.0,0.176591,False,23,2021-05-26 23:00:00,3,0,0


In [85]:
#We calculate the size of the frame as well as the zero values overall and within the first 72 hours.
print(f"Size of the dataframe: {(sys.getsizeof(df3) / 10**9):.2f} GB")
print(f"Zero values overall: {df3['Views'].value_counts()[0]}")
print(f"Zero values in the first 72 hours: {df3[df3['Hours passed'] <= 72]['Views'].value_counts()[0]}")

Size of the dataframe: 13.84 GB
Zero values overall: 16823012
Zero values in the first 72 hours: 1618869


In [86]:
#We write the file away.
df3.to_csv("Newspapers_Complete.csv", index=False)