In [1]:
import pandas as pd

In [2]:
df_perf = pd.read_csv('../data/full_data.csv')
df_scrape = pd.read_csv('../data/full_scraped.csv')

### Finetune versions

In [3]:
# Create ongoing version_id that is unique for each version page_id combination
df_perf['version_id_ong'] = df_perf['page_id'].astype(str) + '_' + df_perf['version_id'].astype(str)
df_perf[['version_id_ong', 'page_id', 'version_id']].head()

Unnamed: 0,version_id_ong,page_id,version_id
0,1037_0,1037,0
1,1037_0,1037,0
2,1037_0,1037,0
3,1037_0,1037,0
4,1037_0,1037,0


Thomas:
We have three groups which depend on different concatenated unique keys:

#### Group 1: external_impressions and external_clicks:

page_id
date

#### Group 2: video_play, page_impressions, clickouts:

page_id
date
URL
Author (edge case)

#### Group 3: daily_likes, daily_dislikes:

page_id
date
publishe_date

In [4]:
pd.set_option('display.max_colwidth', 20)

# Columns that differ on a daily basis and need to be aggregated with a certain rule
col_agg_1 = ['external_clicks', 'external_impressions']
col_agg_2 = ['video_play', 'page_impressions', 'clickouts']
col_agg_3 = ['daily_likes', 'daily_dislikes']

# Columns that don't need to be aggregated but are the same for each version
all_columns = df_perf.columns.tolist()
col = [c for c in all_columns if c not in col_agg_1 and c not in col_agg_2 and c not in col_agg_3]
# this includes: ['old_index', 'page_id', 'date', 'url', 'version_id', 'publish_date', 'word_count', 'words_scraped', 'classification_product', 'classification_type', 'page_name', 'authors', 'author_scraped', 'title', 'h1', 'abstract', 'last_update', 'image_url', 'version_id_ong']

# Aggregate by version for columns with simple duplicates
df_agg = df_perf[col].groupby('version_id_ong').first()

# Aggregate by version for columns which need to be aggregated with a certain rule
#tbd

#df_agg.head()

### Target Variables:
Impressions, Clicks, CTR (click-through-rate). The latter we create in the next step:

In [5]:
# Calculate the Click through rate based on external clicks and impressions
df_perf['ctr'] = df_perf['external_clicks'] / df_perf['external_impressions'] *100

### Features:
Category, 
Image, 
H1, 
Abstract, 
URL, 
Title, 
Word count

Extract the last part of the URL to analyze it and inhibit duplicate data with classification_type

In [6]:
# Function to extract last part of URL and clean it
def extract_last_part(url):
    url_text = url.rsplit('/', 1)[-1]
    cleaned_url = url_text.split('_')[0]
    cleaned_url_list = cleaned_url.split('-')
    return cleaned_url_list

# Apply the function to create a new column
df_scrape['url_text'] = df_scrape['url'].apply(extract_last_part)

# Sum up all list items per ongoing Version ID and merge with original df
df_feat = pd.merge(df_scrape, df_scrape.groupby('page_id')['url_text'].apply(lambda x: list(set(sum(x, [])))).reset_index(name='merged_url'), on='page_id', how='left')

In [7]:
#Transform media column
def media_type(df, media_type):
    if 'img-wrapper' in media_type or any(item in media_type for item in ['image-gallery', 'mb-lg-7', 'mb-8']):
        return 'img'
    elif any(item in media_type for item in ['mb-3', 'video-player', 'recobar']):
        return 'video'
    else:
        return 'other'

df_feat['media_type'] = df_scrape['media_type'].apply(lambda x: media_type(df_feat, x))

In [8]:
df_feat.columns

Index(['page_id', 'url', 'h1', 'author', 'date', 'abstract', 'meta_title',
       'meta_description', 'meta_image_url', 'media_type', 'page_img_size',
       'url_text', 'merged_url'],
      dtype='object')

In [9]:
# Title length
df_feat['meta_title_len'] = df_feat['meta_title'].str.len()

# Meta description length
df_feat['meta_desc_len'] = df_feat['meta_description'].str.len()

# H1 length
df_feat['h1_len'] = df_feat['h1'].str.len()

# Abstract length
df_feat['abstract_len'] = df_feat['abstract'].str.len()
# URL length
df_feat['merged_url_len'] = df_feat['merged_url'].str.len()

In [10]:
df_perf.columns

Index(['old_index', 'page_id', 'date', 'url', 'version_id', 'publish_date',
       'word_count', 'words_scraped', 'classification_product',
       'classification_type', 'page_name', 'authors', 'author_scraped',
       'title', 'h1', 'abstract', 'last_update', 'image_url', 'daily_likes',
       'daily_dislikes', 'video_play', 'page_impressions', 'clickouts',
       'external_clicks', 'external_impressions', 'version_id_ong', 'ctr'],
      dtype='object')

In [11]:
df_feat.columns

Index(['page_id', 'url', 'h1', 'author', 'date', 'abstract', 'meta_title',
       'meta_description', 'meta_image_url', 'media_type', 'page_img_size',
       'url_text', 'merged_url', 'meta_title_len', 'meta_desc_len', 'h1_len',
       'abstract_len', 'merged_url_len'],
      dtype='object')

In [12]:
col = ['page_id', 'url', 'date', 'version_id', 'publish_date',
       'word_count', 'classification_product', 'classification_type', 'page_name', 'authors','title',
       'daily_likes', 'daily_dislikes', 'video_play', 'page_impressions',
       'clickouts', 'external_clicks', 'external_impressions','ctr']

on = ['url','page_id']

df_merge = pd.merge(df_perf[col],df_feat,how='left',on='page_id')
df_merge.drop(['url_x'], axis=1, inplace=True)
df_merge.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,132835,132836,132837,132838,132839,132840,132841,132842,132843,132844
page_id,1037,1037,1037,1037,1037,1037,1037,1039,1039,1040,...,1018766,1018766,1018767,1018767,1018768,1018768,1018770,1018771,1018776,1018782
url_x,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,...,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....,https://efahrer....
date_x,2024-03-13,2024-03-13,2024-03-14,2024-03-15,2024-03-16,2024-03-17,2024-03-18,2024-03-17,2024-03-17,2024-02-22,...,2024-03-22,2024-03-23,2024-03-22,2024-03-23,2024-03-22,2024-03-23,2024-03-23,2024-03-23,2024-03-23,2024-03-23
version_id,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
publish_date,2024-03-10,2024-03-10,2024-03-10,2024-03-10,2024-03-10,2024-03-10,2024-03-10,2022-05-05,2022-05-05,2024-02-21,...,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01
word_count,827.0,827.0,827.0,827.0,827.0,827.0,827.0,1066.0,1066.0,466.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
classification_product,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,E-Auto,...,E-Auto,E-Auto,Solarspeicher,Solarspeicher,Energie,Energie,E-Auto,Wallbox/Laden,Verkehr,Solaranlagen
classification_type,Ratgeber,Ratgeber,Ratgeber,Ratgeber,Ratgeber,Ratgeber,Ratgeber,News,News,News,...,News,News,Deal,Deal,News,News,News,Kaufberatung,News,News
page_name,efa-1037 | Lades...,efa-1037 | Lades...,efa-1037 | Lades...,efa-1037 | Lades...,efa-1037 | Lades...,efa-1037 | Lades...,efa-1037 | Lades...,efa-1039 | Elekt...,efa-1039 | Elekt...,efa-1040 | Gesch...,...,efa-1018766 | Te...,efa-1018766 | Te...,efa-1018767 | Am...,efa-1018767 | Am...,efa-1018768 | De...,efa-1018768 | De...,efa-1018770 | Ba...,efa-1018771 | Wa...,efa-1018776 | E-...,efa-1018782 | „K...
authors,Eva Goldschald,Eva Goldschald,Eva Goldschald,Eva Goldschald,Eva Goldschald,Eva Goldschald,Eva Goldschald,Moritz Diethelm,Moritz Diethelm,Marius Eichfelder,...,Karl Lüdecke,Karl Lüdecke,Kai Gosejohann,Kai Gosejohann,Gero Gröschel,Gero Gröschel,Karl Lüdecke,Katrin Lehmann,Karl Lüdecke,Sepp Reitberger


### Further Feature Engineering

Add all keywords from URLs and group them in a list per version

In [None]:
# One hot encode category & type


### NLP

Extract the most important words from title, h1, abstract, url with NLP (e.g. TF-IDF )

In [14]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/clara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/clara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/clara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def remove_stopwords(text):
    if isinstance(text, str):        
        words = word_tokenize(text)
        # Remove punctuation and special characters
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove stopwords
        return ' '.join([word for word in words if word.lower() not in stop_words])
    else:
        return text

def remove_stopwords_from_columns(df, columns):
    for col in columns:
        df[col] = df[col].apply(remove_stopwords)
    return df

columns_to_clean = ['h1', 'abstract', 'meta_title', 'meta_description']
df_cleaned = remove_stopwords_from_columns(df_feat, columns_to_clean)

In [16]:
pd.set_option('display.max_colwidth', None)
df_cleaned.head(2)

Unnamed: 0,page_id,url,h1,author,date,abstract,meta_title,meta_description,meta_image_url,media_type,page_img_size,url_text,merged_url,meta_title_len,meta_desc_len,h1_len,abstract_len,merged_url_len
0,1012169,https://efahrer.chip.de/news/zdf-doku-analysiert-e-autos-die-argumente-dagegen-sind-alt-und-widerlegt_1012169,ZDF-Doku analysiert E-Autos : Argumente dagegen alt widerlegt,Sepp Reitberger,02. April 2023,"Kommentar EFAHRER-Chefredakteur Sepp Reitberger neuesten ZDF-Doku Elektromobilität , Frage Titel schon zeigt , Macher längst falsch abgebogen . Verkehrswende Rahmen Energiewende betrachten , einfach sinnlos .",ZDF-Doku analysiert E-Autos : Argumente dagegen alt widerlegt,"Kommentar EFAHRER-Chefredakteur Sepp Reitberger neuesten ZDF-Doku Elektromobilität , Frage Titel schon zeigt , ...",https://im-efahrer.chip.de/files/bildschirmfoto-2023-03-30-um-141313-64257d422159a.jpg?imPolicy=IfOrientation&width=1200&height=630&color=%23000000&hash=3eaeda7b0cceb1f9b9ced8beae24e5c7188503ef3556adcaaa4d6ae3c2507655,img,710px,"[zdf, doku, analysiert, e, autos, die, argumente, dagegen, sind, alt, und, widerlegt]","[dagegen, alt, doku, und, argumente, sind, widerlegt, analysiert, autos, die, zdf, e]",73,156,73,277.0,12
1,1017691,https://efahrer.chip.de/news/des-einen-freud-des-andern-leid-bundesland-fuehrt-solar-euro-ein_1017691,"Freud , Leid : Bundesland führt `` Solar-Euro ''",Kai Gosejohann,05. Februar 2024,Gemeinden Bundesland freuen : erhalten ab 2025 `` Solar-Euro '' Betreibern Freiflächen-Photovoltaikanlagen . genau bedeutet Gemeinden Investoren ?,"Freud , Leid : Bundesland führt `` Solar-Euro ''",Gemeinden Bundesland freuen : erhalten ab 2025 `` Solar-Euro '' Betreibern Freiflächen-Photovoltaikanlagen . ...,https://im-efahrer.chip.de/files/photovoltaic-g5ace9b35e-1280-65c0df8f7ae4c.jpg?imPolicy=IfOrientation&width=1200&height=630&color=%23000000&hash=2b2561be03936c129ec8655239a50c21e429624eda70afcbaef904a00db2d075,video,,"[des, einen, freud, des, andern, leid, bundesland, fuehrt, solar, euro, ein]","[euro, bundesland, andern, leid, einen, freud, ein, fuehrt, solar, des]",67,156,67,206.0,10
