In [None]:
import pandas as pd
import pygwalker as pyg

# Load the source CSV; the two listed columns are parsed as datetimes on read.
df = pd.read_csv(
    '../data/data_nlp_A.csv',
    parse_dates=['last_publish_date', 'date_min'],
)

In [None]:
# Attach a constant-valued column named 'const' (every row gets 10).
df = df.assign(const=10)

In [None]:
# Source column -> display label for the feature view.
# The insertion order of this mapping is also the column order of df_features.
feature_labels = {
    'page_id': "ID",
    'external_impressions': "Page impressions",
    'ctr': "Click-through",
    'video_player_types': "Video player types",
    'media_type': "Media type",
    'meta_title': "Meta title",
    'sentiment_meta_title': "Sentiment: meta title",
    'sentiment_abstract': "Sentiment abstract",
    'clickbait_label': "Clickbait",
    'clickbait_prob': "Clickbait confidence",
    'abstract': "Abstract",
}

# Select the columns, relabel them, and append a constant column of ones.
df_features = (
    df[list(feature_labels)]
    .rename(columns=feature_labels)
    .assign(const=1)
)

In [None]:
# Open the PyGWalker explorer on the feature table, using the spec file
# stored next to the notebook and with the cloud tool hidden.
walker = pyg.walk(
    df_features,
    spec='./eda_features_sl.json',
    show_cloud_tool=False,
)

In [None]:
# Display the column index of the raw frame (reference for the selections below).
df.columns

In [None]:
# Source column -> display label for the overview table.
# The insertion order of this mapping is also the column order of df_overview.
# Currently left out: 'google_trend_prob', 'google_trend_label',
# 'google_trend_score', 'last_publish_date' and 'word_count'.
overview_labels = {
    'page_id': "ID",
    'external_impressions': "Page impressions",
    'ctr': "Click-through",
    'date_min': "Earliest date",
    'age': "Age (days)",
    'scraped_word_count': "Word count current",
    'classification_product': "Topic",
    'classification_type': "Type",
    'n_days': "N readings",
    'no_versions': "N versions",
    'date_scraped': "Last update date",
    'mean_version_lifetime': "Mean version lifetime",
    'n_urls': "N URLs total",
    'author_list': "Authors",
    'scraped_author': "Author last",
    'total_likes_n_days': "Likes total",
    'daily_likes_median': "Likes daily median",
    'total_dislikes_n_days': "Dislikes total",
    'daily_dislikes_median': "Dislikes daily median",
    'video_play': "Video plays",
    'media_type': "Media type",
    'meta_title_len': "Meta title length",
    'meta_desc_len': "Meta description length",
    'h1_len': "H1 length",
    'abstract_len': "Abstract length",
    'merged_url_len': "Unique words in URL",
    'title_has_colon': "Title has colon",
    'clickbait_prob': "Clickbait confidence",
    'clickbait_label': "Clickbait",
    'video_player_types': "Video player type",
    'sentiment_abstract': "Sentiment: abstract",
    'confidence_abstract': "Sentiment conf: abstract",
    'sentiment_meta_title': "Sentiment: title",
    'confidence_meta_title': "Sentiment conf: title",
}

# Select the overview columns from the raw frame and relabel them in one pass.
df_overview = df[list(overview_labels)].rename(columns=overview_labels)

In [None]:
import json

# Load the author -> alias mapping. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage collection.
with open('../data/authors.json', 'r', encoding='utf-8') as authors_file:
    authors_map = json.load(authors_file)

# Lower-case every key so that later lookups can be done on lower-cased names.
authors_map = {key.lower(): value for key, value in authors_map.items()}

In [None]:
# Build a lower-cased copy of "Author last" in which '/', ' & ' and ' und '
# separators are rewritten to commas.
df_overview['Author last mask'] = (
    df_overview['Author last']
    .str.lower()
    .str.replace('/', ',')
    .str.replace(' & ', ', ')
    .str.replace(' und ', ', ')
)

In [None]:
# Whole-value pass: a cell that is exactly equal to a key of authors_map is
# swapped for the mapped alias. Keys are already lower-cased when the map is built.
author_mask = df_overview['Author last mask']
for auth, alias in authors_map.items():
    author_mask = author_mask.replace(auth, alias)
df_overview['Author last mask'] = author_mask

In [None]:
# Substring pass: every occurrence of an author name inside a cell is replaced
# by its alias. regex=False makes the name a literal string; with the regex
# default of older pandas versions, characters such as '.', '(' or '+' in a
# name would be interpreted as pattern syntax and mis-match or raise.
for auth, alias in authors_map.items():
    df_overview['Author last mask'] = (
        df_overview['Author last mask'].str.replace(auth, alias, regex=False)
    )

In [None]:
# Display the distinct masked author strings to check the replacements by eye.
df_overview['Author last mask'].unique()

In [None]:
def map_names_to_aliases(names_list, authors_map=authors_map):
    """Translate a list of author names into one comma-separated alias string.

    Each name is stripped of surrounding whitespace and lower-cased before it
    is looked up in ``authors_map``. Names without an entry are kept in that
    normalised form; empty fragments (e.g. from a trailing separator) are
    dropped.

    Parameters
    ----------
    names_list : list of str, or a missing value
        Names to translate. ``None`` / ``NaN`` (what ``Series.str.split``
        yields for missing rows) is returned unchanged instead of raising.
    authors_map : dict, optional
        Mapping from lower-cased author name to alias. Defaults to the
        notebook-level ``authors_map`` as it exists when this cell is run.

    Returns
    -------
    str or missing value
        The aliases joined with ``', '``, or ``names_list`` itself when it
        is missing.
    """
    # Missing rows arrive as None or float NaN, neither of which is iterable.
    if names_list is None or isinstance(names_list, float):
        return names_list

    aliases_list = []
    for name in names_list:
        # Strip first: splitting "a, b" on ',' leaves " b", which would never
        # match a key and would produce a double space after re-joining.
        name = name.strip().lower()
        if not name:
            continue
        if name in authors_map:
            # ''.join leaves a string alias unchanged and concatenates the
            # items when the mapped value is a sequence of strings.
            aliases_list.append(''.join(authors_map[name]))
        else:
            # No alias known: keep the normalised name.
            aliases_list.append(name)
    return ', '.join(aliases_list)

In [None]:
# Split the ';'-separated author string into a list of names, then collapse
# each list into a comma-separated alias string.
df_overview['Authors'] = (
    df_overview['Authors']
    .str.split(';')
    .apply(map_names_to_aliases)
)

In [None]:
# Display the (lower-cased) author names that have an alias entry.
authors_map.keys()

In [None]:
# Treat '/' like ',' as a separator, split into individual names and trim each
# fragment. Without the trim, " jane doe" (from "john doe, jane doe") cannot
# match a key of authors_map, and re-running the cell keeps adding spaces.
author_last_parts = (
    df_overview['Author last']
    .str.replace('/', ',', regex=False)
    .str.split(',')
    .apply(lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts)
)
df_overview['Author last'] = author_last_parts.apply(map_names_to_aliases)

# No further whole-cell replace pass is done here: map_names_to_aliases already
# resolves every name, and the loop that used to follow computed
# `.str.lower().replace(...)` without storing the result, so it had no effect.

df_overview['Author last'].unique()

In [None]:
# Replace cells that are exactly equal to a key of authors_map with the alias.
# Assign the result back explicitly: calling replace(..., inplace=True) on the
# attribute-accessed column is a chained assignment, which pandas warns about
# and which does not update df_overview when Copy-on-Write is enabled.
df_overview['Authors'] = df_overview['Authors'].replace(authors_map)

In [None]:
# Display the distinct values of the "Authors" column after alias mapping.
df_overview.Authors.unique()

In [None]:
# Open the PyGWalker explorer on the overview table, using the spec file
# stored next to the notebook and with the cloud tool hidden.
walker2 = pyg.walk(
    df_overview,
    spec='./summary.json',
    show_cloud_tool=False,
)