In [1]:
import pandas as pd
import nltk
import pickle
import numpy as np

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, PowerTransformer
import string

stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/clara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/clara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/clara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Extract the most important words from title, h1, abstract, url with NLP (e.g. TF-IDF )

In [2]:
df = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/data_nlp_A.csv')

In [3]:
df.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'total_likes_n_days', 'daily_likes_median', 'total_dislikes_n_days',
       'daily_dislikes_median', 'video_play', 'page_impressions', 'clickouts',
       'ctr', 'mean_version_lifetime', 'publ_freq', 'ext_impr_norm',
       'urls_per_age', 'urls_per_days', 'h1', 'scraped_author', 'date_scraped',
       'abstract', 'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'title_has_colon', 'clickbait_prob',
       'clickbait_label', 'clickbait_prob_raw', 'google_trend_prob',
       'google_trend_label', 'google_trend_score', 'video_player_types',
   

In [12]:
df['external_impressions'].mean()

608686.581511372

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 58 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   page_id                 6815 non-null   int64  
 1   n_days                  6815 non-null   int64  
 2   date_min                6815 non-null   object 
 3   n_urls                  6815 non-null   int64  
 4   date_max                6815 non-null   object 
 5   age                     6815 non-null   int64  
 6   url                     6815 non-null   object 
 7   no_versions             6815 non-null   int64  
 8   last_publish_date       6815 non-null   object 
 9   word_count              6815 non-null   float64
 10  classification_product  6815 non-null   object 
 11  classification_type     6815 non-null   object 
 12  page_name               6815 non-null   object 
 13  title                   6815 non-null   object 
 14  author_list             6815 non-null   

In [63]:
df.author_list.unique()#head()

array(['Eva Goldschald', 'Moritz Diethelm', 'Marius Eichfelder',
       'Sven Schulz', 'Sepp Reitberger', 'Irmgard Butter',
       'Lisa Brack;Michi Jo Standl', 'Nicole Hery-Mossmann',
       'Moritz Diethelm;Benjamin Dizdar;Dominik Zarychta',
       'Lars Schwichtenberg;Lisa Brack',
       'Moritz Diethelm;Felix Bausch;Dominik Zarychta',
       'Felix Bausch;Benjamin Dizdar;Moritz Diethelm',
       'Martin Wittler;Büro504', 'Mara Egeling',
       'Martin Wittler;Dominik Zarychta;Büro 504', 'Katrin Lehmann',
       'Dominik Zarychta;Max Wiesmüller;Max Dockhorn', 'EFAHRER.com',
       'Josef Reitberger', 'Lisa Brack', 'EFAHRER.com;Eva Goldschald',
       'Franziska Albrecht', 'Moritz Diethelm;Dominik Zarychta',
       'Irmgard Butter;Dominik Zarychta;Mara Egeling', 'Sebastian Barsch',
       'Tobias Stahl', 'Kira Welling',
       'Annika Moesl;Lisa Brack;Michi Jo Standl',
       'Kira Welling, Annika Mösl', 'Maximilian Becker',
       'Nicole Hery-Moßmann', 'Svenja Schrade;Annika Mösl',

In [64]:
len(df['scraped_author'].unique())

219

### Scale target variable

In [71]:
scaler = PowerTransformer()
scaler_impr = PowerTransformer()

# Transform the target variable
df['external_impressions_scaled'] = scaler.fit_transform(df[['external_impressions']])
df['external_clicks_scaled'] = scaler.fit_transform(df[['external_clicks']])
df['ctr_scaled'] = scaler.fit_transform(df[['ctr']])
df['ext_impr_norm_scaled'] = scaler_impr.fit_transform(df[['ext_impr_norm']].values.reshape(-1, 1))

# Specify the file path for the specific transformer
file_path_specific = '/Users/clara/Desktop/neuefische/d-drivers/notebooks/power_transformer_ext_impr.pkl'

# Open the file in write binary mode and save the transformer
with open(file_path_specific, 'wb') as file:
    pickle.dump(scaler_impr, file)

#df['likes_scaled'] = scaler.fit_transform(df[['likes_n_days']])

### Remove stop words

In [72]:
def remove_stopwords(text):
    if isinstance(text, str):        
        words = word_tokenize(text)
        # Remove punctuation and special characters
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove stopwords
        return ' '.join([word for word in words if word.lower() not in stop_words])
    else:
        return text

def remove_stopwords_from_columns(df, columns):
    for col in columns:
        df[col] = df[col].apply(remove_stopwords)
    return df

columns_to_clean = ['h1','abstract','meta_title','meta_description']
df = remove_stopwords_from_columns(df, columns_to_clean)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6815 entries, 0 to 6814
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   page_id                      6815 non-null   int64  
 1   n_days                       6815 non-null   int64  
 2   date_min                     6815 non-null   object 
 3   n_urls                       6815 non-null   int64  
 4   date_max                     6815 non-null   object 
 5   age                          6815 non-null   int64  
 6   url                          6815 non-null   object 
 7   no_versions                  6815 non-null   int64  
 8   last_publish_date            6815 non-null   object 
 9   word_count                   6815 non-null   float64
 10  classification_product       6815 non-null   object 
 11  classification_type          6815 non-null   object 
 12  page_name                    6815 non-null   object 
 13  title             

### One hot encode categorical values

In [74]:
df.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'total_likes_n_days', 'daily_likes_median', 'total_dislikes_n_days',
       'daily_dislikes_median', 'video_play', 'page_impressions', 'clickouts',
       'ctr', 'mean_version_lifetime', 'publ_freq', 'ext_impr_norm',
       'urls_per_age', 'urls_per_days', 'h1', 'scraped_author', 'date_scraped',
       'abstract', 'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'media_type', 'page_img_size', 'merged_url',
       'meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len',
       'merged_url_len', 'title_has_colon', 'clickbait_prob',
       'clickbait_label', 'clickbait_prob_raw', 'google_trend_prob',
       'google_trend_label', 'google_trend_score', 'video_player_types',
   

In [75]:
categorical = ['sentiment_abstract','sentiment_meta_title','video_player_types','clickbait_label','title_has_colon','media_type']
df_encoded = pd.get_dummies(df, columns=categorical, prefix= categorical,drop_first=True)

In [76]:
df_encoded.columns

Index(['page_id', 'n_days', 'date_min', 'n_urls', 'date_max', 'age', 'url',
       'no_versions', 'last_publish_date', 'word_count',
       'classification_product', 'classification_type', 'page_name', 'title',
       'author_list', 'external_clicks', 'external_impressions',
       'total_likes_n_days', 'daily_likes_median', 'total_dislikes_n_days',
       'daily_dislikes_median', 'video_play', 'page_impressions', 'clickouts',
       'ctr', 'mean_version_lifetime', 'publ_freq', 'ext_impr_norm',
       'urls_per_age', 'urls_per_days', 'h1', 'scraped_author', 'date_scraped',
       'abstract', 'scraped_word_count', 'meta_title', 'meta_description',
       'meta_image_url', 'page_img_size', 'merged_url', 'meta_title_len',
       'meta_desc_len', 'h1_len', 'abstract_len', 'merged_url_len',
       'clickbait_prob', 'clickbait_prob_raw', 'google_trend_prob',
       'google_trend_label', 'google_trend_score', 'confidence_abstract',
       'confidence_meta_title', 'external_impressions_scaled'

In [77]:
df_encoded[['page_id', 'n_urls','word_count',
       'classification_product', 'classification_type', 
       'author_list', 'mean_version_lifetime','ext_impr_norm',
       'urls_per_age', 'urls_per_days', 'h1', 'scraped_author', 
       'abstract', 'meta_title', 'meta_description', 'meta_title_len',
       'meta_desc_len', 'h1_len', 'abstract_len', 'merged_url_len',
       'clickbait_prob', 'clickbait_prob_raw', 'google_trend_prob',
       'google_trend_label', 'google_trend_score', 
       'external_impressions_scaled','ext_impr_norm_scaled',
       'external_clicks_scaled', 'ctr_scaled', 'sentiment_abstract_neutral',
       'sentiment_abstract_positive', 'sentiment_meta_title_neutral',
       'sentiment_meta_title_positive', 'video_standard_and_widget',
       'video_widget', 'not_clickbait', 'title_has_colon_True',
       'media_type_other', 'media_type_video']]

KeyError: "['video_standard_and_widget', 'video_widget', 'not_clickbait'] not in index"

In [53]:
import json

df_encoded.rename({
        'scraped_author': "Author last"}, axis=1, inplace=True)

In [54]:
authors_map = json.load(open('/Users/clara/Desktop/neuefische/d-drivers/data/codes/authors_json.json', 'r', encoding='utf-8'))
authors_map = {key.lower(): value for key, value in authors_map.items()}

df_encoded['Author last'] = df_encoded['Author last'].str.lower()
df_encoded['Author last'] = df_encoded['Author last'].str.replace('/', ',')
df_encoded['Author last'] = df_encoded['Author last'].str.replace(' & ', ', ')
df_encoded['Author last'] = df_encoded['Author last'].str.replace(' und ', ', ')

df_encoded['Authors'] = df_encoded['Author last'].str.lower()
df_encoded['Authors'] = df_encoded['Authors'].str.replace(';', ',')

for auth in authors_map.keys():
    df_encoded['Author last'] = df_encoded['Author last'].str.replace(auth, authors_map[auth.lower()])
    df_encoded['Authors'] = df_encoded['Authors'].str.replace(auth, authors_map[auth.lower()])


In [55]:
df_encoded.rename(columns={"video_player_types_2 - Standard and Widget" : "video_standard_and_widget",
                   "video_player_types_3 - Widget" : "video_widget",
                   "clickbait_label_Not Clickbait": "not_clickbait",
                   "Author last": "Author_last"
           }, inplace=True)

In [56]:
df_encoded.clickbait_prob

0      -0.742681
1      -0.856208
2       0.773124
3      -0.827747
4      -0.780902
          ...   
6810   -0.952400
6811   -0.947575
6812   -0.738150
6813   -0.826913
6814   -0.873315
Name: clickbait_prob, Length: 6815, dtype: float64

In [57]:
df_encoded.to_csv('/Users/clara/Desktop/neuefische/d-drivers/data/preprocessing_nlp_v4.csv', encoding='utf-8', index=False)

In [58]:
import pygwalker as pyg

walker = pyg.walk(df_encoded)

Box(children=(HTML(value='<div id="ifr-pyg-0006164d6631603dhmZ2I8CQTX0jFUNl" style="height: auto">\n    <head>…