In [1]:
import pandas as pd

In [2]:
# Load the two input CSV files used throughout the notebook
# (paths are relative to the notebook's working directory).
df_perf = pd.read_csv('../data/full_data.csv')
df_scrape = pd.read_csv('../data/full_scraped.csv')

### Finetune versions

In [3]:
# Build a version key that is unique across pages: "<page_id>_<version_id>"
page_key = df_perf['page_id'].astype(str)
version_key = df_perf['version_id'].astype(str)
df_perf['version_id_ong'] = page_key.str.cat(version_key, sep='_')
df_perf.loc[:, ['version_id_ong', 'page_id', 'version_id']].head()

Unnamed: 0,version_id_ong,page_id,version_id
0,1037_0,1037,0
1,1037_0,1037,0
2,1037_0,1037,0
3,1037_0,1037,0
4,1037_0,1037,0


Thomas:
We have three groups which depend on different concatenated unique keys:

#### Group 1: external_impressions and external_clicks:

page_id
date

#### Group 2: video_play, page_impressions, clickouts:

page_id
date
URL
Author (edge case)

#### Group 3: daily_likes, daily_dislikes:

page_id
date
publish_date

In [4]:
pd.set_option('display.max_colwidth', 20)

# Metric columns that vary from day to day and will need their own aggregation rule
col_agg_1 = ['external_clicks', 'external_impressions']
col_agg_2 = ['video_play', 'page_impressions', 'clickouts']
col_agg_3 = ['daily_likes', 'daily_dislikes']

# Every remaining column is treated as constant within a version
all_columns = df_perf.columns.tolist()
metric_columns = set(col_agg_1) | set(col_agg_2) | set(col_agg_3)
col = [c for c in all_columns if c not in metric_columns]
# this includes: ['old_index', 'page_id', 'date', 'url', 'version_id', 'publish_date', 'word_count', 'words_scraped', 'classification_product', 'classification_type', 'page_name', 'authors', 'author_scraped', 'title', 'h1', 'abstract', 'last_update', 'image_url', 'version_id_ong']

# Collapse the constant columns to one row per version;
# first() keeps the first non-null value of each column within a group
df_agg = df_perf[col].groupby('version_id_ong').first()

# TODO: aggregate the metric columns (col_agg_1 / col_agg_2 / col_agg_3) per version
#tbd

#df_agg.head()

### Target Variables:
Impressions, Clicks, CTR (click-through-rate). The latter we create in the next step:

In [5]:
# Click-through rate in percent: external clicks per external impression, times 100
click_share = df_perf['external_clicks'].div(df_perf['external_impressions'])
df_perf['ctr'] = click_share.mul(100)

### Features:
Category, 
Image, 
H1, 
Abstract, 
URL, 
Title, 
Word count

Extract the last part of the URL so it can be analyzed, and avoid duplicating information that is already captured by classification_type.

In [6]:
# Function to extract last part of URL and clean it
def extract_last_part(url):
    """Return the hyphen-separated words of a URL's final path segment.

    Takes whatever follows the last '/', discards everything from the first
    '_' onwards, and splits what is left on '-'.

    Parameters
    ----------
    url : str
        The URL (or any string) to process.

    Returns
    -------
    list of str
        The pieces of the trimmed final segment.
    """
    slug = url.rpartition('/')[2]
    slug_head = slug.partition('_')[0]
    return slug_head.split('-')

# Split every URL into the list of words in its last path segment
df_scrape['url_text'] = df_scrape['url'].apply(extract_last_part)

# Collect the distinct URL words per page_id and attach them to every row of that page.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is the same on
# every run (iterating a set of strings is not stable across interpreter sessions), and the
# nested generator avoids the repeated list copying of sum(x, []).
merged_url_per_page = (
    df_scrape.groupby('page_id')['url_text']
    .apply(lambda token_lists: list(dict.fromkeys(word for words in token_lists for word in words)))
    .reset_index(name='merged_url')
)
df_feat = pd.merge(df_scrape, merged_url_per_page, on='page_id', how='left')

In [7]:
#Transform media column
def media_type(df, media_type):
    """Map a raw media marker value to 'img', 'video' or 'other'.

    Parameters
    ----------
    df : any
        Not used by the classification; kept so existing callers keep working.
    media_type : str or container
        Raw value to classify. Markers are looked up with ``in``, i.e. as
        substrings when the value is a string.

    Returns
    -------
    str
        'img' if any image marker is present, otherwise 'video' if any video
        marker is present, otherwise 'other'. Missing values (None or a float
        such as NaN) are classified as 'other'.
    """
    # A missing cell read from CSV arrives as float NaN; `in` on it would raise TypeError.
    if media_type is None or isinstance(media_type, float):
        return 'other'

    img_markers = ('img-wrapper', 'image-gallery', 'mb-lg-7', 'mb-8')
    video_markers = ('mb-3', 'video-player', 'recobar')

    # Image markers take precedence over video markers.
    if any(marker in media_type for marker in img_markers):
        return 'img'
    if any(marker in media_type for marker in video_markers):
        return 'video'
    return 'other'

# Classify the raw media_type values from df_scrape and store the result on df_feat.
# The assignment aligns on the row index — NOTE(review): this assumes df_feat (built by
# merging df_scrape) still has the same row index as df_scrape; confirm.
df_feat['media_type'] = df_scrape['media_type'].apply(lambda x: media_type(df_feat, x))

In [8]:
# Display the columns currently available on df_feat
df_feat.columns

Index(['page_id', 'url', 'h1', 'author', 'date', 'abstract',
       'main_text_length', 'meta_title', 'meta_description', 'meta_image_url',
       'media_type', 'page_img_size', 'url_text', 'merged_url'],
      dtype='object')

In [9]:
# Length features: new column name -> source column.
# For the text columns .str.len() is the character count; merged_url holds a list per row,
# so its length is the number of list items.
length_features = {
    'meta_title_len': 'meta_title',        # Title length
    'meta_desc_len': 'meta_description',   # Meta description length
    'h1_len': 'h1',                        # H1 length
    'abstract_len': 'abstract',            # Abstract length
    'merged_url_len': 'merged_url',        # URL length
}

for len_col, text_col in length_features.items():
    df_feat[len_col] = df_feat[text_col].str.len()

In [10]:
# Performance columns carried into the merged frame
col = [
    'page_id', 'url', 'date', 'version_id', 'publish_date',
    'word_count', 'classification_product', 'classification_type', 'page_name', 'authors', 'title',
    'daily_likes', 'daily_dislikes', 'video_play', 'page_impressions',
    'clickouts', 'external_clicks', 'external_impressions', 'ctr',
]

# NOTE(review): `on` is defined but the merge below joins on 'page_id' only — confirm which key is intended
on = ['url','page_id']

# Left-join the scraped features onto the performance rows. Non-key columns present in both
# frames receive the default _x/_y suffixes; only url_x is removed afterwards.
df_merge = (
    df_perf[col]
    .merge(df_feat, how='left', on='page_id')
    .drop(columns=['url_x'])
)


### Further Feature Engineering

### NLP

Extract the most important words from title, h1, abstract, and url with NLP (e.g. TF-IDF).

In [None]:
# Keep only the identifier and text columns for the NLP step
non_text_columns = [
    'author', 'date', 'meta_image_url', 'media_type', 'page_img_size', 'url_text',
    'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len', 'merged_url_len',
]
df_nlp = df_feat.drop(columns=non_text_columns)
df_nlp.head()

Unnamed: 0,page_id,url,h1,abstract,meta_title,meta_description,merged_url
0,1012169,https://efahrer....,ZDF-Doku analysi...,Ein Kommentar vo...,ZDF-Doku analysi...,Ein Kommentar vo...,"[argumente, und,..."
1,1017691,https://efahrer....,"Des einen Freud,...",Einige Gemeinden...,"Des einen Freud,...",Einige Gemeinden...,"[freud, solar, d..."
2,1016983,https://efahrer....,160 Kilometer mi...,Als Journalist u...,160 Kilometer mi...,Als Journalist u...,"[dem, bike, pend..."
3,101399,https://efahrer....,Laden eines Elek...,Wer sein Elektro...,Ladestation für ...,Wer sein Elektro...,"[hause, laden, a..."
4,104461,https://efahrer....,Leasing-Deal für...,Den Mazda MX-30 ...,Leasing-Deal für...,Den Mazda MX-30 ...,"[leasing, 150, s..."


In [None]:
# Download the NLTK resources requested by this notebook.
# NOTE(review): "wordnet", sent_tokenize and TfidfVectorizer are not referenced in the
# cells visible here — confirm they are still needed.
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

# German stopword list as a set, for fast membership tests
stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/clara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/clara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/clara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def remove_stopwords(text):
    """Remove punctuation and stopwords from a text value.

    Parameters
    ----------
    text : str, list, or any
        * str  - ASCII punctuation (``string.punctuation``) is deleted, the
          result is tokenized with ``word_tokenize`` and tokens whose
          lower-cased form is in ``stop_words`` are dropped.
        * list - treated as an already tokenized text; string items whose
          lower-cased form is in ``stop_words`` are dropped.
        * anything else (e.g. NaN) is returned unchanged.

    Returns
    -------
    str, list, or the input value
        A space-joined string for string input, a new list for list input,
        otherwise the value that was passed in.
    """
    if isinstance(text, str):
        # Strip punctuation BEFORE tokenizing so the cleaned text is what gets tokenized;
        # doing it afterwards leaves the punctuation tokens in the result.
        cleaned = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(cleaned)
        return ' '.join(word for word in words if word.lower() not in stop_words)

    if isinstance(text, list):
        # Token lists: filter element-wise, leaving non-string items untouched.
        return [
            token for token in text
            if not (isinstance(token, str) and token.lower() in stop_words)
        ]

    return text

def remove_stopwords_from_columns(df, columns):
    """Run ``remove_stopwords`` over every cell of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with the cleaned values.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column_name in columns:
        cleaned = df[column_name].map(remove_stopwords)
        df[column_name] = cleaned
    return df

# Columns to run through stopword removal (the same list drives the vectorization loop later).
# merged_url holds a list of tokens per row rather than a string —
# NOTE(review): confirm remove_stopwords treats that cell type as intended.
columns_to_clean = ['h1','abstract','meta_title','meta_description','merged_url']
df_nlp = remove_stopwords_from_columns(df_nlp, columns_to_clean)

In [None]:
# Summary statistics; describe() reports numeric columns only by default
df_nlp.describe()

Unnamed: 0,page_id
count,6865.0
mean,932072.4
std,261383.5
min,30.0
25%,1011990.0
50%,1014101.0
75%,1016311.0
max,1018782.0


In [None]:
# Vectorizer input: text columns only, with missing values replaced by empty strings
df_nlp_vec = df_nlp.drop(columns=['url', 'page_id']).fillna('')

In [None]:
# Sanity check: number of missing values left in each column
df_nlp_vec.isna().sum() #info()

h1                  0
abstract            0
meta_title          0
meta_description    0
merged_url          0
dtype: int64

In [None]:
def vectorize_text(column, df, prefix=None):
    """Replace a text column with bag-of-words count columns.

    Fits a ``CountVectorizer`` on ``df[column]``, appends one count column per
    vocabulary term and drops the original column. ``df`` itself is not
    modified; a new frame is returned.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` to vectorize.
    df : pandas.DataFrame
        Input frame.
    prefix : str, optional
        When given, it is prepended to every generated feature name so that
        terms coming from different source columns stay distinguishable.
        The default (``None``) keeps the bare vocabulary terms as names.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the count columns, without ``column``.
    """
    # CountVectorizer expects every document to be a string; cells that hold a
    # collection of tokens are joined with spaces first.
    documents = df[column].map(
        lambda cell: ' '.join(map(str, cell)) if isinstance(cell, (list, tuple, set)) else cell
    )

    vect = CountVectorizer()
    transformed = vect.fit_transform(documents)

    feature_names = vect.get_feature_names_out()
    if prefix is not None:
        feature_names = [f'{prefix}{name}' for name in feature_names]

    # Dense count matrix as a frame that shares the input's row index
    df_transformed = pd.DataFrame(transformed.toarray(), columns=feature_names, index=df.index)

    # Append the counts, then remove the source column.
    # Note: drop() removes every column labelled `column`, including a generated one of that name.
    return pd.concat([df, df_transformed], axis=1).drop(columns=[column])

# Vectorize each cleaned text column in turn and write a CSV after every step
for item in columns_to_clean:
    # df_nlp_vec is replaced on each pass, so it accumulates the count columns of all
    # columns processed so far and still contains the text columns not yet vectorized.
    df_nlp_vec = vectorize_text(column=item, df=df_nlp_vec)
    filename = f'../data/nlp_features_{item}.csv'
    # NOTE(review): each file therefore holds the accumulated frame, not only the features of
    # `item`, and the same term coming from two source columns yields two identically named
    # columns. The row index is not written and page_id/url were dropped earlier, so the file
    # carries no page identifier — confirm this is intended.
    df_nlp_vec.to_csv(filename, encoding='utf-8', index=False)