In [1]:
import pandas as pd

In [2]:
# Locations of the two input files, relative to this notebook.
full_data_path = '../data/full_data.csv'
scraped_data_path = '../data/scraping_no_duplicates.csv'

# df_full: the complete dataset; df_scraped: the scraped page content.
df_full = pd.read_csv(full_data_path)
df_scraped = pd.read_csv(scraped_data_path)

Preprocessing of scraped dataframe, similar to df_full

In [3]:
# Normalise the column names of the scraped frame: lower-case every name first,
# then map the scraper's names onto the names used in the rest of this notebook.
scraped_column_names = {
    #'impressions': 'page_impressions',
    'page_efahrer_id': 'page_id',
    'page_canonical_url': 'url',
    'author': 'author_scraped',
    'current_title': 'h1',
}

df_scraped = (
    df_scraped
    .rename(columns=str.lower)
    .rename(columns=scraped_column_names)
)

# Summary of the resulting columns, non-null counts and dtypes.
df_scraped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6899 entries, 0 to 6898
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   page_id         6899 non-null   int64 
 1   url             6899 non-null   object
 2   h1              6899 non-null   object
 3   abstract        6899 non-null   object
 4   author_scraped  6899 non-null   object
 5   words           6899 non-null   int64 
 6   last_update     6899 non-null   object
 7   image_url       6899 non-null   object
dtypes: int64(2), object(6)
memory usage: 431.3+ KB


Checking whether the same ids & urls are in both dataframes

In [4]:
# Collect the keys of both frames as sets so that they can be compared.
page_ids_full, page_ids_scraped = set(df_full['page_id']), set(df_scraped['page_id'])
page_url_full, page_url_scraped = set(df_full['url']), set(df_scraped['url'])

# Keys that occur in the full dataset but not in the scraped one.
page_ids_difference = page_ids_full - page_ids_scraped
page_url_difference = page_url_full - page_url_scraped

# Report the page ID coverage ...
print('Page IDs')
print('Number of IDs that no data was scraped for:', len(page_ids_difference))
print(' ')

# ... and the URL coverage, together with the distinct URL counts of both frames.
print('Page URLs')
print('Number URLs that no data was scraped for:', len(page_url_difference))
print('Number of URLs in complete dataset:', len(df_full['url'].unique()))
print('Number of URLs in scraped dataset:', len(df_scraped['url'].unique()))

Page IDs
Number of IDs that no data was scraped for: 0
 
Page URLs
Number URLs that no data was scraped for: 5530
Number of URLs in complete dataset: 12429
Number of URLs in scraped dataset: 6899


<span style="color:red">For each page ID we have scraped data, but not for every URL. We assume that the page content is the same for all URLs that share a page ID.</span>

Next, we merge the scraped content into the full dataset that holds the performance data.

In [5]:
# Left join: every row of df_full is kept, and the scraped columns are attached
# wherever both the page ID and the URL match. Rows without a match get NaN in
# the scraped columns.
col_to_merge = ['page_id', 'url']
df_full_scraped = df_full.merge(df_scraped, how='left', on=col_to_merge)
df_full_scraped.head()

Unnamed: 0,old_index,page_id,date,publish_date,word_count,url,page_name,classification_product,classification_type,title,...,page_impressions,clickouts,external_clicks,external_impressions,h1,abstract,author_scraped,words,last_update,image_url
0,65055,1037,2024-03-13,2024-03-10,827.0,https://efahrer.chip.de/e-wissen/aufladen_1037,efa-1037 | Ladestationen für Elektroautos,E-Auto,Ratgeber,Ladestationen für Elektroautos,...,15.0,0.0,22.0,343.0,,,,,,
1,34072,1037,2024-03-13,2024-03-10,827.0,https://efahrer.chip.de/e-wissen/elektrofahrze...,efa-1037 | Ladestationen für Elektroautos,E-Auto,Ratgeber,Ladestationen für Elektroautos,...,41.0,0.0,22.0,343.0,Elektrofahrzeug-Ladestation: Kosten und Anbiet...,"Im Grunde kann man sein E-Auto überall laden, ...",Eva Goldschald,1544.0,2024-03-10,https://im-efahrer.chip.de/files/5ffed1cb4c3dc...
2,736,1037,2024-03-14,2024-03-10,827.0,https://efahrer.chip.de/e-wissen/elektrofahrze...,efa-1037 | Ladestationen für Elektroautos,E-Auto,Ratgeber,Ladestationen für Elektroautos,...,113.0,1.0,79.0,1494.0,Elektrofahrzeug-Ladestation: Kosten und Anbiet...,"Im Grunde kann man sein E-Auto überall laden, ...",Eva Goldschald,1544.0,2024-03-10,https://im-efahrer.chip.de/files/5ffed1cb4c3dc...
3,114772,1037,2024-03-15,2024-03-10,827.0,https://efahrer.chip.de/e-wissen/elektrofahrze...,efa-1037 | Ladestationen für Elektroautos,E-Auto,Ratgeber,Ladestationen für Elektroautos,...,53.0,1.0,16.0,431.0,Elektrofahrzeug-Ladestation: Kosten und Anbiet...,"Im Grunde kann man sein E-Auto überall laden, ...",Eva Goldschald,1544.0,2024-03-10,https://im-efahrer.chip.de/files/5ffed1cb4c3dc...
4,19316,1037,2024-03-16,2024-03-10,827.0,https://efahrer.chip.de/e-wissen/elektrofahrze...,efa-1037 | Ladestationen für Elektroautos,E-Auto,Ratgeber,Ladestationen für Elektroautos,...,51.0,1.0,17.0,602.0,Elektrofahrzeug-Ladestation: Kosten und Anbiet...,"Im Grunde kann man sein E-Auto überall laden, ...",Eva Goldschald,1544.0,2024-03-10,https://im-efahrer.chip.de/files/5ffed1cb4c3dc...


In [6]:
# True for every row that received a scraped `words` value from the merge.
# NOTE(review): this only tests that `words` is present; it does not compare it
# with `word_count` from the full dataset, although the printed label below
# suggests a word-count comparison. Confirm which check is intended.
df_full_scraped['check_merge'] = df_full_scraped['words'].notna()

has_abstract = df_full_scraped['abstract'].notna()
has_words = df_full_scraped['check_merge']

print('Number of scraped pages', len(df_scraped))
print('Number of page IDs, that contain scraped data:', len(df_full_scraped.loc[has_abstract, 'page_id'].unique()))
print('Number of pages that were merged correctly based on word count:', len(df_full_scraped.loc[has_words, 'page_id'].unique()))

Number of scraped pages 6899
Number of page IDs, that contain scraped data: 6899
Number of pages that were merged correctly based on word count: 6899


In [7]:
# Flag the rows whose scraped `last_update` equals the dataset's `publish_date`
# (this overwrites the `check_merge` flag of the previous check).
df_full_scraped['check_merge'] = df_full_scraped['last_update'].eq(df_full_scraped['publish_date'])

# Page IDs with at least one row on which both dates agree; every other scraped
# page is counted as having a different update date.
pages_with_matching_date = df_full_scraped.loc[df_full_scraped['check_merge'], 'page_id'].unique()

print('Number of pages that have a different update date compared to the publish date:', len(df_scraped) - len(pages_with_matching_date))

Number of pages that have a different update date compared to the publish date: 1652


<span style="color:red">Some pages have a publishing date that differs from the scraped update date. This could be because the scraped date is entered manually, whereas <code>publish_date</code> in the data set is metadata from the actual day of publishing.</span>
In the following, the publishing date from the original data set is used instead of the scraped date.

Cleaning up for the EDA file by dropping unnecessary columns.

<p><b>This is the file for the first EDA. For further analysis, other columns should be included again.</b></p>

For the first round of EDA, we work with three target variables: impressions, clicks and CTR (click-through rate). We create the last of these in the next step:

In [8]:
# Click-through rate in percent: external clicks per external impression.
# A row with zero impressions has no defined CTR. Masking the zero denominator
# makes such rows NaN instead of +/-inf (clicks / 0), which would otherwise
# distort every mean, sum or plot computed on `ctr` later on.
external_impressions_nonzero = df_full_scraped['external_impressions'].where(
    df_full_scraped['external_impressions'] != 0
)
df_full_scraped['ctr'] = df_full_scraped['external_clicks'] / external_impressions_nonzero * 100

In [11]:
# Display the column index of the merged frame to see which columns are available.
df_full_scraped.columns

Index(['old_index', 'page_id', 'date', 'publish_date', 'word_count', 'url',
       'page_name', 'classification_product', 'classification_type', 'title',
       'authors', 'daily_likes', 'daily_dislikes', 'video_play',
       'page_impressions', 'clickouts', 'external_clicks',
       'external_impressions', 'h1', 'abstract', 'author_scraped', 'words',
       'last_update', 'image_url', 'check_merge', 'ctr'],
      dtype='object')

In [9]:
# Reduce the merged frame to one row per page ID.
# The merge matched on page ID *and* URL, so rows whose URL was not scraped carry
# NaN in the scraped columns (h1, abstract, image_url, ...). A plain
# `keep='first'` can therefore keep such an empty row even though scraped content
# exists for the same page ID. Placing the rows with scraped content first makes
# `keep='first'` pick one of them whenever the page has any; the original row
# order is preserved within both groups.
has_scraped_content = df_full_scraped['abstract'].notna()
df_eda = pd.concat(
    [df_full_scraped[has_scraped_content], df_full_scraped[~has_scraped_content]]
).drop_duplicates('page_id', keep='first')

# Compare the number of unique pages with the number of rows in the merged frame.
print(df_eda.shape)
print(df_full_scraped.shape)

(6899, 26)
(132845, 26)


Cleaning up for the EDA file.

<p><b>This is the file for the first EDA. For further analysis, other columns should be included again.</b></p>

In [None]:
# Dropping columns that are not of interest in the first EDA
# Columns kept for the first EDA, in the order in which they should appear.
# Selecting exactly these columns also discards the ones that are not of interest
# here (url, old_index, author_scraped, words, last_update, check_merge,
# page_name). No separate drop is needed, so the cell can be re-run without
# raising a KeyError for columns that are already gone.
new_order = [
    # identifiers and dates
    'page_id', 'date', 'publish_date',
    # text content
    'title', 'h1', 'abstract',
    # classification and meta data
    'classification_product', 'classification_type', 'authors', 'word_count', 'image_url',
    # performance data
    'daily_likes', 'daily_dislikes', 'video_play', 'page_impressions', 'clickouts',
    'external_clicks', 'external_impressions', 'ctr',
]

df_eda = df_eda[new_order]

Export file for EDA

In [None]:
# Create file unique_IDs_merged, which contains all performance, meta and scraped data in one merged dataset per unique ID. In other words, all available data for each unique article, excluding different versions of the articles.
# Write unique_IDs_merged.csv: the performance, meta and scraped data merged into
# one dataset with a single row per unique page ID (different versions of an
# article are not included). The row index is not written to the file.
df_eda.to_csv(
    '../data/unique_IDs_merged.csv',
    index=False,
    encoding='utf-8',
)