# EDA on the full dataset

The full dataset includes the scraped data and the version counts.

It was obtained with `get_df.py`.

## Load data

In [2]:
import pandas as pd
# Show every column and the untruncated content of each cell when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Register pandas' date formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()

import numpy as np
import seaborn as sns
import missingno as mgn

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import pygwalker as pyg

# Make "simple_white" the default template for every plotly figure below.
pio.templates.default = "simple_white"

In [3]:
# Read the aggregated dataset with both date columns parsed as datetimes,
# then discard the leftover 'old_index' column.
df = pd.read_csv(
    '../data/aggr_data.csv',
    parse_dates=['date', 'publish_date'],
).drop(columns='old_index')

In [None]:
# Keep only the rows of page 106523 for a closer look.
page106523 = df[df['page_id'] == 106523]

In [7]:
# Open the PyGWalker explorer on the rows of page 106523.
pyg.walk(df.query('page_id == 106523'))

Box(children=(HTML(value='<div id="ifr-pyg-00061546850649b4SdCDgQYLu0ZrW6tH" style="height: auto">\n    <head>…

<pygwalker.api.pygwalker.PygWalker at 0x28a134650>

In [None]:
# Display every row that belongs to page 106523.
df.query('page_id == 106523')

In [None]:
# Replace `df` with the full dataset (date columns parsed as datetimes),
# again without the leftover 'old_index' column.
df = pd.read_csv(
    '../data/full_data.csv',
    parse_dates=['date', 'publish_date'],
).drop(columns='old_index')

In [None]:
# Draw missingno's matrix view of `df`; the trailing semicolon keeps the
# notebook from echoing the call's return value.
mgn.matrix(df);

In [None]:
# List the column names of the full dataset.
df.columns

## Exploring the influence of content changes

In [None]:
# Largest version_id present anywhere in the dataset.
df['version_id'].max()

In [None]:
# Pages whose highest version_id exceeds this value are treated as
# frequently updated in the cells below.
COUNT_THRESH = 30

In [None]:
# Highest version_id reached by each page ...
vers_ = df.groupby('page_id').version_id.max()
# ... and the pages that exceed the threshold.
vers_.loc[vers_.gt(COUNT_THRESH)]

In [None]:
# Ids of the pages whose highest version_id is above the threshold.
pages_frequent = vers_.loc[vers_.gt(COUNT_THRESH)].index

# All rows that belong to one of those pages.
df_frequent = df[df['page_id'].isin(pages_frequent)]

In [None]:
# Shape of the array of distinct page ids (only 7 pages, great!).
df_frequent['page_id'].unique().shape

In [None]:
# List the column names of the frequently-updated subset.
df_frequent.columns

In [None]:
# Distinct (page, date, external impressions) combinations.
df_frequent.loc[:, ['page_id', 'date', 'external_impressions']].drop_duplicates()

In [None]:
# Remove exact duplicate rows, then replace missing values with 0 in the
# numeric columns only. A blanket fillna(0) would also write the integer 0
# into datetime and text columns, which turns datetime columns into object
# dtype and breaks the date-based min/max and grouping done further down.
df_frequent = df_frequent.drop_duplicates()
numeric_cols = df_frequent.select_dtypes(include='number').columns
df_frequent = df_frequent.fillna(dict.fromkeys(numeric_cols, 0))

In [None]:
# Columns selected as version / user-side features.
version_user_side_ftrs = [
    # page and version identification
    'page_id',
    'date',
    'url',
    'version_id',
    'publish_date',
    # content size and on-page counters
    'word_count',
    'daily_likes',
    'daily_dislikes',
    'video_play',
    'page_impressions',
    'clickouts',
    # external counters
    'external_clicks',
    'external_impressions',
]

In [None]:
# Earliest date recorded for the frequently-updated pages.
df_frequent['date'].min()

In [None]:
# Daily calendar from the first to the last date seen in df_frequent.
timeline = pd.date_range(
    df_frequent['date'].min(),
    df_frequent['date'].max(),
    freq='D',
)
timeline

In [None]:
# External clicks summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['external_clicks']
    .sum()
)
data

In [None]:
# External impressions summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['external_impressions']
    .sum()
)

# One bar colour per page; ids are cast to str so they are coloured as
# discrete categories instead of on a continuous scale.
px.bar(
    data,
    x='date',
    y='external_impressions',
    color=data['page_id'].astype(str),
    title=f'Ext. page impressions for pages with more than {COUNT_THRESH} versions',
)

In [None]:
# Internal page impressions summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['page_impressions']
    .sum()
)

# One bar colour per page (ids cast to str for discrete colouring).
px.bar(
    data,
    x='date',
    y='page_impressions',
    color=data['page_id'].astype(str),
    title=f'Internal (Total) impressions for pages with more than {COUNT_THRESH} versions',
)

In [None]:
# External clicks summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['external_clicks']
    .sum()
)

# One bar colour per page (ids cast to str for discrete colouring).
px.bar(
    data,
    x='date',
    y='external_clicks',
    color=data['page_id'].astype(str),
    title=f'Ext. page clicks for pages with more than {COUNT_THRESH} versions',
)

In [None]:
# NOTE(review): this cell repeats the internal page-impressions chart that
# already appears two cells earlier - confirm whether both are needed.
# Internal page impressions summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['page_impressions']
    .sum()
)

# One bar colour per page (ids cast to str for discrete colouring).
px.bar(
    data,
    x='date',
    y='page_impressions',
    color=data['page_id'].astype(str),
    title=f'Internal (Total) impressions for pages with more than {COUNT_THRESH} versions',
)

In [None]:
# Video plays summed per page and day.
data = (
    df_frequent
    .groupby(['page_id', 'date'], as_index=False)['video_play']
    .sum()
)

# One bar colour per page (ids cast to str for discrete colouring).
px.bar(
    data,
    x='date',
    y='video_play',
    color=data['page_id'].astype(str),
    title=f'Video plays for pages with more than {COUNT_THRESH} versions',
)

In [None]:
# Page inspected in this cell; defined once so the row filter and the chart
# title cannot drift apart.
PAGE_ID = 105259

# Daily internal impressions of that page, split by content version.
# Only the plotted columns are selected: keeping page_id in the frame would
# make the groupby sum add up the identifier itself.
data = (
    df_frequent
    .loc[df_frequent['page_id'] == PAGE_ID, ['date', 'version_id', 'page_impressions']]
    .groupby(['date', 'version_id'], as_index=False)
    .sum()
)

# version_id stays numeric, so versions are coloured on a continuous scale.
px.bar(
    data,
    x='date',
    y='page_impressions',
    color='version_id',
    title=f'Internal (Total) impressions for page {PAGE_ID} - all versions',
)

In [None]:
# Rows of page 105259, restricted to the four columns of interest.
data = df_frequent.loc[
    df_frequent['page_id'] == 105259,
    ['page_id', 'date', 'page_impressions', 'version_id'],
]

In [None]:
# Distinct page ids left in `data`.
data['page_id'].unique()

In [None]:
# Distinct page ids among the frequently-updated pages.
df_frequent['page_id'].unique()

In [8]:
# Line chart of external clicks over time, one line per page that has enough
# distinct dates to be worth drawing.

# Minimum number of distinct dates a page needs to be plotted.
MIN_DATES = 70

# Working copy with an extra column holding the weekday name of each date.
dft = df.copy()
dft['day_of_week'] = dft['date'].dt.day_name()

# Number of distinct dates per page, and the pages with at least MIN_DATES.
date_count_per_id = dft.groupby('page_id')['date'].nunique()
freq_amount_of_dates_above_70 = date_count_per_id[date_count_per_id >= MIN_DATES]

# Select the qualifying pages once (instead of re-filtering the whole frame
# for every page) and order the rows chronologically: a 'lines' trace joins
# points in row order, so unsorted dates would draw back-and-forth segments.
# NOTE(review): if a page has several rows per date they are plotted as
# separate points - confirm whether they should be summed first.
plot_rows = (
    dft[dft['page_id'].isin(freq_amount_of_dates_above_70.index)]
    .sort_values('date', kind='stable')
)

fig = go.Figure()

for page_id, page_rows in plot_rows.groupby('page_id'):
    # hoverinfo='skip' is intentionally not set: it would suppress the hover
    # box altogether and hide the custom template below. The empty
    # <extra></extra> tag is what removes the default trace-name box.
    hover_text = (
        f'<b>Page ID:</b> {page_id}<br>'
        '<b>Date:</b> %{x} (%{x|%A})<br>'
        '<b>External Clicks:</b> %{y}<extra></extra>'
    )
    fig.add_trace(
        go.Scatter(
            x=page_rows['date'],
            y=page_rows['external_clicks'],
            mode='lines',
            hovertemplate=hover_text,
            showlegend=False,
        )
    )

fig.update_layout(
    title='External Clicks over Time for Different Page IDs',
    xaxis_title='Date',
    yaxis_title='External Clicks',
    hovermode='closest',
    showlegend=False,
    xaxis=dict(
        tickmode='auto',
        nticks=20,
        tickformat='%Y-%m-%d (%a)',  # date plus abbreviated weekday
    ),
)

# Show figure
fig.show()