In [None]:
import pandas as pd
# Show every column and the full cell content when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Register the pandas date converters with matplotlib for date-aware plotting.
pd.plotting.register_matplotlib_converters()

import numpy as np
import seaborn as sns
import missingno as mgn  # missing-value matrix plots (mgn.matrix)

import plotly.express as px

Reading the data with standard preprocessing steps

In [None]:
# Both data deliveries are read and normalised in exactly the same way, so the
# rename mapping lives in one place instead of being copy-pasted per frame.
COLUMN_RENAMES = {
    'impressions': 'page_impressions',
    'page_efahrer_id': 'page_id',
    'published_at': 'publishing_date',
    'page_canonical_url': 'url',
    'page_author': 'authors',
}


def load_delivery(path, sheet_name='data'):
    """Read one delivery workbook and harmonise its column names.

    Column labels are lower-cased first and then mapped through
    ``COLUMN_RENAMES``; labels without an entry in the mapping are kept as-is.

    Parameters
    ----------
    path : str
        Location of the Excel workbook.
    sheet_name : str, default 'data'
        Sheet that holds the table.

    Returns
    -------
    pandas.DataFrame
        The sheet content with normalised column labels.
    """
    raw = pd.read_excel(path, sheet_name=sheet_name)
    return raw.rename(columns=str.lower).rename(columns=COLUMN_RENAMES)


df1 = load_delivery('../data/data_d-drivers_2024-03-24.xlsx')
df2 = load_delivery('../data/data_d-drivers_2024-03-26.xlsx')

What makes each entry unique?

In [None]:
# Rows that share the candidate key with at least one other row.
# keep=False flags every member of a duplicate group, not only the repeats.
candidate_key = ['page_id', 'date', 'url', 'authors', 'word_count']
shares_key = df1.duplicated(subset=candidate_key, keep=False)
df1[shares_key]

By experimenting with different sets of columns, I found that the following combination identifies each entry:

> 'page_id', 'date', 'url', 'authors', 'word_count'

* NOT the `page_name`: it is totally broken
* NOT the `publishing_date`: sometimes the articles changed several times during the day and the word count changed, so `publishing_date` does not capture all combinations

Keying the rows on all of these columns makes every entry unique except for a single article: 1018299 (rows 66544 and 78658). The second of these rows is a faulty record with all values missing, so it can simply be dropped.

In [None]:
# Remove the faulty duplicate of article 1018299 (row label 78658).
# Reassigning (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run after the row is already gone.
df1 = df1.drop(index=78658, errors='ignore')

Merge on those:

In [None]:
# Columns that together identify a single row; used as the merge key.
key_columns = [
    'page_id',
    'date',
    'url',
    'authors',
    'word_count',
]

What columns are in `df1` and not in `df2`?

In [None]:
# Column labels unique to each delivery: second-only first, then first-only.
cols_first, cols_second = df1.columns, df2.columns
print(cols_second.difference(cols_first))
print(cols_first.difference(cols_second))

So the page impressions are only present in the 2nd delivery, and the clickouts only in the first delivery. 
We will merge the *first into the second*, so from the first dataset we only need the key columns and the one column that is unique to it.

In [None]:
# Keep only the merge key plus the single metric that is unique to this delivery.
wanted_columns = [*key_columns, 'clickouts']
df1 = df1[wanted_columns]

What pages are in df1 and not in df2 and the other way around?

In [None]:
# Page ids found in only one of the two deliveries (first-only, then second-only).
ids_first = df1.set_index('page_id').index
ids_second = df2.set_index('page_id').index
print(ids_first.difference(ids_second))
print(ids_second.difference(ids_first))

> The new data delivery includes all pages from the first one + 8 new ones.

What `URLs` are in df1 and not in df2 and the other way around?

In [None]:
# URLs found in only one of the two deliveries (first-only, then second-only).
urls_first = df1.set_index('url').index
urls_second = df2.set_index('url').index
print(urls_first.difference(urls_second))
print(urls_second.difference(urls_first))

> 22 new URLs

Which dates are new?

In [None]:
# Dates found in only one of the two deliveries (first-only, then second-only).
dates_first = df1.set_index('date').index
dates_second = df2.set_index('date').index
print(dates_first.difference(dates_second))
print(dates_second.difference(dates_first))

> The day between the first and the second data deliveries, makes a lot of sense :D

In [None]:
# The merge joins on key_columns, so uniqueness has to hold for the key itself:
# rows that share a key but differ in 'clickouts' are not full-row duplicates,
# yet they would still multiply rows in the merge.
if df1.duplicated(subset=key_columns).any():
    print('Duplicated entries present: merging will blow up the data frame size')
else:
    print('No duplicates left in the first dataframe')

In [None]:
# The key is unique exactly when no row repeats an earlier row's key values.
key_is_unique = not df2.duplicated(subset=key_columns).any()
if key_is_unique:
    print('No duplicates are in the second dataframe')
else:
    print('Duplicated entries present: merging will blow up the data frame size')

In [None]:
# keep=False marks all rows of each duplicated key, so every copy is shown.
repeated_key = df2.duplicated(subset=key_columns, keep=False)
df2.loc[repeated_key]

Same mistake is in `df2`: remove it.

In [None]:
# Remove the faulty duplicate row (row label 40600) from the second delivery.
# Reassigning (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run after the row is already gone.
df2 = df2.drop(index=40600, errors='ignore')

In [None]:
# Re-run the uniqueness check on the merge key after the faulty row was removed.
n_repeated_keys = int(df2.duplicated(subset=key_columns).sum())
if n_repeated_keys == 0:
    print('No duplicates are in the second dataframe')
else:
    print('Duplicated entries present: merging will blow up the data frame size')

Yuppi, now we are ready to merge!

## Merging

Using a `left` merge onto `df2`, because we already know that `df1` is malformed

In [None]:
# Left merge: keep every row of the newer delivery (df2) and attach the
# 'clickouts' of the first delivery wherever the key matches.
# validate='one_to_one' makes pandas raise a MergeError if the key is duplicated
# on either side, instead of silently multiplying rows.
# In principle even page_id is redundant in the key, because each url carries
# the page id as its suffix.
df = pd.merge(left=df2, right=df1, on=key_columns, how='left', validate='one_to_one')

df

In [None]:
# Shape of the left-hand frame of the merge.
# NOTE(review): presumably shown to compare its row count with the merged `df`
# displayed above — confirm.
df2.shape

In [None]:
# Missing-value matrix of the merged frame; the trailing ';' suppresses the
# text repr of the returned object.
mgn.matrix(df);

### Merging in the scraped data

In [None]:
## To be continued

---

## Feature engineering

### Article (content) versions

We want to label each version.
* Version changes when there is a new `publication date`
* Version changes when there is a new `word count`
* Version does NOT change with a change in `URL`
* Version does NOT change with a change in the `date` column

In [None]:
# Only the columns needed for versioning; 'date' is still required for imputation.
# `df` carries the harmonised names assigned when the deliveries were loaded
# ('page_id', 'publishing_date'), whereas the versioning cells that follow use
# the raw export names, so the two columns are mapped back here. Selecting the
# raw names directly from `df` raises a KeyError.
df_cnt = (
    df[['page_id', 'date', 'publishing_date', 'word_count']]
    .rename(columns={'page_id': 'page_efahrer_id', 'publishing_date': 'published_at'})
    .sort_values(['page_efahrer_id', 'date', 'published_at'])
)
df_cnt

In [None]:
# Missing-value matrix of the versioning subset; the trailing ';' suppresses the
# text repr of the returned object.
mgn.matrix(df_cnt);

There are some rows where the publication date changed but the `word count` was not updated!

In [None]:
# Row labels where a publication date is present but the word count is missing.
word_count_missing = df_cnt['word_count'].isna()
publ_date_present = df_cnt['published_at'].notna()
wcna_idx = df_cnt.loc[word_count_missing & publ_date_present].index
wcna_idx

NOT the other way around:

In [None]:
# The reverse case: a word count is present but the publication date is missing.
reverse_case = df_cnt['word_count'].notna() & df_cnt['published_at'].isna()
df_cnt.loc[reverse_case].index

The best assumption is that the word count did not change (significantly?), so we still forward-fill it.

In [None]:
# Forward-fill the content columns within each article only. A plain
# DataFrame.ffill() runs over the whole (sorted) frame, so the last values of
# one article would leak into the leading gaps of the next article.
# Leading gaps of an article therefore stay missing; 'page_efahrer_id' and
# 'date' are left untouched.
content_cols = ['published_at', 'word_count']
df_cnt = df_cnt.copy()
df_cnt[content_cols] = df_cnt.groupby('page_efahrer_id')[content_cols].ffill()

In [None]:
# Rows whose word count was missing before the forward fill (labels collected
# in wcna_idx), looked up again in the filled frame.
sus = df_cnt.loc[wcna_idx]

#### Versions

In [None]:
def _encode_publication_dates(publ_dates):
    """Return the integer codes pd.factorize assigns to the given publication dates."""
    codes, _ = pd.factorize(publ_dates)
    return codes


# Encode the publication dates separately for every article.
df_cnt['publ_at_enc'] = (
    df_cnt
    .groupby('page_efahrer_id')['published_at']
    .transform(_encode_publication_dates)
)
df_cnt

How many versions does each article have?

In [None]:
# publ_at_enc is a zero-based code per article, so the highest code is one less
# than the number of distinct publication dates: add 1 to turn it into a count.
# (Missing dates are coded -1 by pd.factorize, so an article without any
# publication date ends up with 0 versions.)
by_article = df_cnt.groupby('page_efahrer_id')
n_versions = by_article['publ_at_enc'].max().add(1).rename('Number of versions')
first_publ_date = by_article['published_at'].min().rename('First publication date')
to_plot = n_versions.to_frame().join(first_publ_date)
to_plot

In [None]:
# One point per article: version count against the date of its first publication.
px.scatter(
    to_plot,
    x='Number of versions',
    y='First publication date',
)

Article with 61 (!!!) versions:

In [None]:
# Columns of `df` to plot per article. `df` uses the names assigned when the
# deliveries were loaded, so the URL and impressions columns are addressed as
# 'url' and 'page_impressions'; the raw export names no longer exist in `df`.
metrics_cols = ['url', 'daily_likes',
               'daily_dislikes', 'page_impressions', 'video_play',
               'discover_clicks', 'discover_impressions']

In [None]:
# All rows of article 105259, ordered by day and URL. `df` uses the names
# assigned when the deliveries were loaded, hence 'page_id' and 'url'.
article105259 = df[df['page_id'] == 105259].sort_values(['date', 'url'])
# Keep a single row per day: the first one in the (date, url) ordering.
article_first_url = article105259[metrics_cols + ['date']].drop_duplicates(subset=['date'], keep='first')

In [None]:
import matplotlib.pyplot as plt  # NOTE(review): consider moving this to the imports cell at the top

# A pandas bar plot draws one bar per row at integer positions 0..n-1, so tick
# locations have to be row positions; timestamps passed as `xticks` land far
# away from the bars.
bar_data = article_first_url.reset_index(drop=True)
bar_dates = pd.to_datetime(bar_data['date'])

# Label the first bar of every second calendar month, i.e. the spacing the
# former two-month date range was aiming for.
month_no = bar_dates.dt.year * 12 + bar_dates.dt.month
is_tick = month_no.ne(month_no.shift()) & ((month_no - month_no.min()) % 2 == 0)

axes = bar_data.plot(kind='bar', x='date', y=metrics_cols, subplots=True, figsize=(6, 12))
fig = axes  # original name kept in case later cells use it; it holds the Axes array, not a Figure

# With subplots=True pandas shares the x axis between the panels by default, so
# configuring the bottom panel is enough.
bottom_ax = axes[-1]
bottom_ax.set_xticks(bar_data.index[is_tick.to_numpy()].tolist())
# '%b' is the portable code for the abbreviated month name ('%h' is only a
# platform-specific alias of it).
bottom_ax.set_xticklabels([d.strftime('%a\n%d\n%b\n%Y') for d in bar_dates[is_tick]]);
#plt.xticks(ticks=df[['impressions', 'published_at']].resample('W', on='published_at').max().index);
#plt.xticks(ticks=pd.date_range(df.date.min(), df.date.max(), freq='2M'),
#           labels=pd.date_range(df.date.min(), df.date.max(), freq='2M'));