# Imports and data

In [None]:
import os
os.environ["JAVA_HOME"] = "/lib/jvm/java-11-openjdk-amd64"
# Because otherwise custom modules import errors
import sys
from tqdm import tqdm
sys.path.append('../')
os.makedirs('../figures_report/volume_dynamics', exist_ok=True)
os.makedirs('../interm_results/volume_dynamics', exist_ok=True)
import random

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql.types import ArrayType, IntegerType

import plotly.graph_objs as go
import plotly.express as px

from scipy import stats
from scipy import spatial

In [None]:
from more_itertools import consecutive_groups
from itertools import chain

In [None]:
import pyspark

In [None]:
from src.ranking_helpers import *
from src.make_and_plot import*
from src.pages_groups_extraction import*
from src.data_aggregation import*

## Initialize context 

In [None]:
conf = pyspark.SparkConf().setMaster("local[5]").setAll([
                                   ('spark.driver.memory','120G'),
                                   ('spark.executor.memory', '120G'),
                                   ('spark.driver.maxResultSize', '0'),
                                    ('spark.executor.cores', '5'),
                                    ('spark.local.dir', '/scratch/descourt/spark')
                                ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# create the context
sc = spark.sparkContext
sc.setLogLevel('ERROR')

## Download data

Note: the French processed data is stored under `processed_data/fr/pageviews_fr_2015-2023.parquet`. If you want to use it, remember to adapt the language-specific analysis cells and the plot output paths!

In [None]:
dfs = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_2015-2023.parquet").withColumn('date', to_date(col('date'), 'yyyy-MM')).withColumn('project', lit('en'))

In [None]:
dfs_uptonov = dfs.where(dfs.date <= to_date(lit('2022-11'), 'yyyy-MM')).cache()

In [None]:
df_metadata = spark.read.parquet('/scratch/descourt/metadata/akhils_data/wiki_nodes_bsdk_phili_2022-11_en.parquet').withColumn('creation_date', to_date(col('creation_date'), 'yyyy-MM'))

In [None]:
dfs_change = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_articles_ev_2023-03.parquet")

## Match page id changes 

Over time, the page id of a given article might change. We do not handle these changes here, since they concern only a small fraction of the articles. We nevertheless keep this section to understand why some pages are missing when matching our data with the metadata, and what kind of pages they are.

In [None]:
dfs_change = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_articles_ev_2022-11.parquet")\
                       .select(to_date(col('last_date'), 'yyyy-MM').alias('last_date'),
                               to_date(col('first_date'), 'yyyy-MM').alias('first_date'),
                              'page_ids',
                              'last_page_id',
                              'last_name')

In [None]:
dfs_change_missing = dfs_uptonov.join(df_metadata, on = 'page_id', how='leftanti')\
                                .join(dfs_change, dfs_change.page_ids == dfs_uptonov.page_id)\
                                .withColumn('age_in_months', months_between('date', 'first_date')).cache()

In [None]:
sum_missing = dfs_change_missing.select('tot_count_views', 'age_in_months').summary().cache()

In [None]:
dfs_uptonov.count()

In [None]:
sum_missing.show()

In [None]:
dfs_change_missing.where('date = "2022-11-01"').select('tot_count_views', 'age_in_months').summary().show()

For later analysis

In [None]:
dfs_change_filt = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_articles_ev_2023-03.parquet")

In [None]:
dfs_uptomar23 = dfs.join(dfs_change, dfs_change.page_ids == dfs.page_id).cache()

## Process data
Remove unused columns

In [None]:
df_high_volume = extract_volume(dfs, high=True).select('date', 'tot_count_views', 'page', 'page_id', 'rank').cache()

In [None]:
df_high_volume = df_high_volume.join(dfs_change, dfs_change.page_ids == df_high_volume.page_id).select('date', 'tot_count_views', 'last_name', col('last_page_id').alias('page_id'), 'first_date', 'last_date').cache()

In [None]:
df_low_volume = extract_volume(dfs, high=False).select('date', 'tot_count_views', 'page', 'page_id', 'rank').cache()

In [None]:
df_low_volume = df_low_volume.join(dfs_change, dfs_change.page_ids == df_low_volume.page_id).select('date', 'tot_count_views', 'last_name', col('last_page_id').alias('page_id'), 'first_date', 'last_date').cache()

# Characteristic time

In [None]:
# We need to convert each date into an integer for following analysis. 
# So we express each date as the difference in number of months between this date and the latest possible date, ie. march 23

df_app_h = df_high_volume.select('page_id', 
                                 months_between(to_date(lit('2023-03'), 'yyyy-MM'), col('date')).alias('date'))\
                         .groupBy('page_id').agg(collect_set('date').alias('dates_core')).cache()
df_app_l = df_low_volume.select('page_id', 
                                months_between(to_date(lit('2023-03'), 'yyyy-MM'), col('date')).alias('date'))\
                         .groupBy('page_id').agg(collect_set('date').alias('dates_tail')).cache()
df_app_all = dfs.select('page_id', 
                        months_between(to_date(lit('2023-03'), 'yyyy-MM'), col('date')).alias('date'))\
                         .groupBy('page_id').agg(collect_set('date').alias('dates_all')).cache()

## How much last views change over time

In [None]:
df_views_core = df_high_volume.groupBy('date').agg(min('tot_count_views').alias('min_views')).toPandas()
df_views_core.sort_values('date').set_index('date').plot()

In [None]:
# Largest monthly view count among tail pages (upper view bound of the tail for each month).
# `max` is the Spark aggregate brought in by `from pyspark.sql.functions import *`.
# The column is named 'max_views' because it holds a maximum; it was mislabelled 'min_views',
# which made the plot legend misleading.
df_views_tail = df_low_volume.groupBy('date').agg(max('tot_count_views').alias('max_views')).toPandas()
df_views_tail.sort_values('date').set_index('date').plot()

In [None]:
df_nbpages_tail = df_low_volume.groupBy('date').agg(count('*').alias('nb_pages')).toPandas()
df_nbpages_tail.sort_values('date').set_index('date').plot()

## How many times do articles appear (not necessarily consecutively) in the core or the tail ?

Articles appearing 93 times (the number of months from July 2015 to March 2023, both included) make up the stable part of the core / tail / whole volume.


In [None]:
df_occ = dfs.join(dfs.groupBy('page_id').agg(count('*').alias('nb_months_in')), on='page_id')\
            .select('date', 'page_id', 'tot_count_views', 'rank', 'nb_months_in',
                    when(col('nb_months_in') == 93, True).otherwise(False).alias('is_common'),
                    when(col('nb_months_in') == 1, True).otherwise(False).alias('is_unique')).cache()
df_occ_agg = df_occ.groupBy('date').agg( (sum(when(col('is_common'),1).otherwise(0))/ count('*')).alias('perc_common'),
                                         (sum(when(col('is_unique'),1).otherwise(0)) / count('*')).alias('perc_unique'),
                                        count('*').alias('tot_pages')).toPandas()

In [None]:
# Per-page number of months spent in the core, then per-month share of pages that are
# always present (93 months) or present only once.
# 'rank' is intentionally not selected: df_high_volume no longer carries that column once it
# has been joined with dfs_change and re-projected, so selecting it raises an AnalysisException.
df_high_occ = df_high_volume.join(df_high_volume.groupBy('page_id').agg(count('*').alias('nb_months_in')), on='page_id')\
            .select('date', 'page_id', 'tot_count_views', 'nb_months_in',
                    when(col('nb_months_in') == 93, True).otherwise(False).alias('is_common'),
                    when(col('nb_months_in') == 1, True).otherwise(False).alias('is_unique')).cache()
df_high_occ_agg = df_high_occ.groupBy('date').agg( (sum(when(col('is_common'),1).otherwise(0))/ count('*')).alias('perc_common'),
                                         (sum(when(col('is_unique'),1).otherwise(0)) / count('*')).alias('perc_unique'),
                                        count('*').alias('tot_pages')).toPandas()

In [None]:
# Per-page number of months spent in the tail, then per-month share of pages that are
# always present (93 months) or present only once.
# 'rank' is intentionally not selected: df_low_volume no longer carries that column once it
# has been joined with dfs_change and re-projected, so selecting it raises an AnalysisException.
df_low_occ = df_low_volume.join(df_low_volume.groupBy('page_id').agg(count('*').alias('nb_months_in')), on='page_id')\
            .select('date', 'page_id', 'tot_count_views', 'nb_months_in',
                    when(col('nb_months_in') == 93, True).otherwise(False).alias('is_common'),
                    when(col('nb_months_in') == 1, True).otherwise(False).alias('is_unique')).cache()
df_low_occ_agg = df_low_occ.groupBy('date').agg( (sum(when(col('is_common'),1).otherwise(0))/ count('*')).alias('perc_common'),
                                         (sum(when(col('is_unique'),1).otherwise(0)) / count('*')).alias('perc_unique'),
                                        count('*').alias('tot_pages')).toPandas()

In [None]:
df_occ_agg['perc_left'] = 1 - df_occ_agg['perc_common'] - df_occ_agg['perc_unique']
df_occ_agg['Volume'] = 'Baseline'
df_high_occ_agg['perc_left'] = 1 - df_high_occ_agg['perc_common'] - df_high_occ_agg['perc_unique']
df_high_occ_agg['Volume'] = 'Core'
df_low_occ_agg['perc_left'] = 1 - df_low_occ_agg['perc_common'] - df_low_occ_agg['perc_unique']
df_low_occ_agg['Volume'] = 'Tail'

In [None]:
# Only the import is kept in this cell: `df_plot_1` is built in the next cell, which also
# computes its 'now' column, so using it here raised a NameError on a top-to-bottom run.
import datetime

In [None]:
df_plot_1 = pd.concat([df_low_occ_agg, df_high_occ_agg, df_occ_agg])
df_plot_1['now'] = df_plot_1['date'].apply(lambda x : (datetime.date(2023, 3, 1) -x).days)
df_plot_1['now'] = df_plot_1['now'] // 60

data = df_plot_1.loc[df_plot_1['Volume'] == 'Core'].sort_values('now', ascending=False)
fig = px.line(df_plot_1.sort_values(by='date', ascending=False), x='date', y='perc_common', color='Volume')

fig.update_layout(    xaxis_title=dict(text='Date', font=dict(size=20)),
    yaxis_title=dict(text='Percentage of common pages', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    legend = dict(font=dict(size=20)),
    legend_title = dict(font=dict(size=20)),
    height=800,
    width=800)
fig.show()
fig.write_image("../figures_report/volume_dynamics/perc_common_decrease_log_fr.pdf")

In [None]:
fig = px.line(df_plot_1.sort_values('date'), x='date', y='perc_unique', color='Volume')
fig.update_layout(    xaxis_title=dict(text='Date', font=dict(size=20)),
    yaxis_title=dict(text='Percentage of unique pages', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    legend = dict(font=dict(size=20)),
    legend_title = dict(font=dict(size=20)),
    height=800,
    width=800)
fig.show()
fig.write_image("../figures_report/volume_dynamics/perc_unique_decrease_fr.pdf")

In [None]:
df_plot_1.groupby('Volume').apply(lambda x : x.describe())

## Life time in the core

For the remaining 43% of articles, which:
- were present for at most 92 months

Compute
* Average lifetime per article and std
* Max lifetime per article distribution
* Consecutive occurrences distribution

In [None]:
def find_consecutive(dates):
    """
    Return the lengths of the runs of consecutive values found in `dates`.

    The values are sorted first; two neighbouring values belong to the same run
    when they differ by exactly 1. Values must therefore be whole numbers
    (ints, or floats holding integer values such as 3.0).

    Parameters
    ----------
    dates : iterable of numbers, or None
        Month indices of one page. None (e.g. a null array column) and empty
        input both yield an empty list.

    Returns
    -------
    list of int
        One length per run, in ascending order of the run's values,
        e.g. [1, 2, 3, 7, 8] -> [3, 2].
    """
    if dates is None:
        return []

    run_lengths = []
    previous_key = None
    for position, value in enumerate(sorted(dates)):
        # `position - value` stays constant as long as values increase by exactly 1,
        # so a change of key marks the start of a new run.
        key = position - value
        if run_lengths and key == previous_key:
            run_lengths[-1] += 1
        else:
            run_lengths.append(1)
        previous_key = key
    return run_lengths
    
find_consecutive_udf = udf(find_consecutive, ArrayType(IntegerType()))

In [None]:
df_lifetime_h = df_app_h.withColumn('lifetimes', find_consecutive_udf(col('dates_core')))\
                      .select(explode('lifetimes').alias('lifetimes'), 'page_id', size('dates_core').alias('nb_occurrences'),
                              size('lifetimes').alias('consecutive_occurrences')).cache()

In [None]:
df_lifetime_l = df_app_l.withColumn('lifetimes', find_consecutive_udf(col('dates_tail')))\
                      .select(explode('lifetimes').alias('lifetimes'), 'page_id', size('dates_tail').alias('nb_occurrences'),
                              size('lifetimes').alias('consecutive_occurrences')).cache()

In [None]:
df_lifetime_all = df_app_all.withColumn('lifetimes', find_consecutive_udf(col('dates_all')))\
                      .select(explode('lifetimes').alias('lifetimes'), 'page_id', size('dates_all').alias('nb_occurrences'),
                              size('lifetimes').alias('consecutive_occurrences')).cache()

### What is the distribution of the maximum lifetime in the core across the entire period?

In [None]:
# Compute the max lifetime per article in core and make the distribution
w = Window.partitionBy('page_id').orderBy(desc('lifetimes'))
df_lifetime_h = df_lifetime_h.select('page_id', first('lifetimes').over(w).alias('max_lifetime'), 'consecutive_occurrences')\
                     .distinct().cache()
df_char_agg = df_lifetime_h.groupBy('max_lifetime')\
                     .agg(count('*').alias('nb_pages')).toPandas()
tot_pages = df_char_agg['nb_pages'].sum()
df_char_agg['frac_pages'] = df_char_agg['nb_pages'] / tot_pages * 100

In [None]:
# Compute the max lifetime per article in tail and make the distribution
df_lifetime_l = df_lifetime_l.select('page_id', first('lifetimes').over(w).alias('max_lifetime'), 'consecutive_occurrences')\
                     .distinct().cache()
df_char_agg_l = df_lifetime_l.groupBy('max_lifetime')\
                     .agg(count('*').alias('nb_pages')).toPandas()
tot_pages_l = df_char_agg_l['nb_pages'].sum()
df_char_agg_l['frac_pages'] = df_char_agg_l['nb_pages'] / tot_pages_l * 100

In [None]:
# Compute the max lifetime per article for entire volume and make the distribution
df_lifetime_all = df_lifetime_all.select('page_id', first('lifetimes').over(w).alias('max_lifetime'), 'consecutive_occurrences')\
                     .distinct().cache()
df_char_agg_all = df_lifetime_all.groupBy('max_lifetime')\
                     .agg(count('*').alias('nb_pages')).toPandas()
tot_pages_all = df_char_agg_all['nb_pages'].sum()
df_char_agg_all['frac_pages'] = df_char_agg_all['nb_pages'] / tot_pages_all * 100

In [None]:
df_char_agg_l['Volume'] = 'Tail'
df_char_agg['Volume'] = 'Core'
df_char_agg_all['Volume'] = 'Baseline'

In [None]:
#   title='Distribution of pages max life time in the core and the tail',
df_plot = pd.concat([df_char_agg_l, df_char_agg]).sort_values('max_lifetime')
df_plot['text'] = df_plot.apply(lambda r: str(np.round(r.frac_pages*10)/10) + ' %' if r.max_lifetime in [1, 93] else "", axis=1)
df_point = df_plot.loc[df_plot['max_lifetime'].isin([0, 93])]

fig = px.line(df_plot, x = 'max_lifetime', y='frac_pages', color='Volume', log_y=True, text='text')#, opacity=0.5)

fig.update_layout(
    xaxis_title=dict(text='Maximum lifetime', font=dict(size=25)),
    yaxis_title=dict(text='Percentage of pages', font=dict(size=25)),
    yaxis = dict( tickfont = dict(size=25)),
    xaxis = dict( tickfont = dict(size=25)),
    legend = dict(font=dict(size=25)),
    height=900,
    width=900,
    legend_title = dict(font=dict(size=25)),)
    #barmode='overlay')
fig.update_traces(line={'width': 10}, textfont=dict(size=21))

fig.show()
fig.write_html("../interm_results/volume_dynamics/distribution_maxlifetime_all_perc.html")
fig.write_image('../figures_report/volume_dynamics/distribution_maxlifetime_all_perc_fr.pdf')

In [None]:
samples_max_h = df_lifetime_h.select('max_lifetime').sample(0.1).toPandas()
samples_max_l = df_lifetime_l.select('max_lifetime').sample(0.01).toPandas()
samples_max_all = df_lifetime_all.select('max_lifetime').sample(0.01).toPandas()

In [None]:
samples_max_h.describe(percentiles=[.25, .5, .75, .9])

In [None]:
samples_max_l.describe(percentiles=[.1, .12,.25, .5, .75, .9])

In [None]:
samples_max_all.describe(percentiles=[.1, .095, .25, .5, .75, .9])

In [None]:
#   title='Distribution of pages max life time in the core and the tail',
# Overlaid bar chart of the max-lifetime distribution for the tail and the baseline (log y-axis).
fig = px.bar(pd.concat([df_char_agg_l, df_char_agg_all]).sort_values('max_lifetime'), x = 'max_lifetime', y='frac_pages', color='Volume', log_y=True, opacity=0.5)
fig.update_layout(
    xaxis_title=dict(text='Maximum lifetime', font=dict(size=20)),
    yaxis_title=dict(text='Percentage of pages', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    legend = dict(font=dict(size=20)),
    legend_title = dict(font=dict(size=20)),
    barmode='overlay')
fig.show()
# Output folders are created one level up ('../figures_report', '../interm_results') at the top
# of the notebook, so the paths must carry the same '../' prefix as the other cells.
#fig.write_html("../interm_results/volume_dynamics/distribution_maxlifetime_all_perc_2.html")
fig.write_image('../figures_report/volume_dynamics/distribution_maxlifetime_all_perc_fr_2.pdf')

In [None]:
sample_lifet_h = df_lifetime_h.select('max_lifetime').sample(0.1).toPandas()
sample_lifet_l = df_lifetime_l.select('max_lifetime').sample(0.05).toPandas()
sample_lifet_all = df_lifetime_all.select('max_lifetime').sample(0.01).toPandas()

In [None]:
sample_lifet_h['Volume'] = 'Core'
sample_lifet_l['Volume'] = 'Tail'
sample_lifet_all['Volume'] = 'Baseline'

In [None]:
fig = px.box(pd.concat([sample_lifet_all, sample_lifet_l, sample_lifet_h]), x='Volume', y='max_lifetime')
fig.show()

### For how many consecutive occurrences do articles appear in the core?

In [None]:
sample_occ_h = df_lifetime_h.select('consecutive_occurrences').sample(0.1).toPandas()
sample_occ_l = df_lifetime_l.select('consecutive_occurrences').sample(0.05).toPandas()
sample_occ_all = df_lifetime_all.select('consecutive_occurrences').sample(0.01).toPandas()

In [None]:
sample_occ_h.describe(percentiles=[.25, .5, .75, .85,.9])

In [None]:
sample_occ_l.describe(percentiles=[.25, .5, .75, .76,.77,.79,.8,.9])

In [None]:
sample_occ_all.describe(percentiles=[.25, .5, .75, .9])

In [None]:
df_consocc_agg = df_lifetime_h.groupBy('consecutive_occurrences')\
                     .agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_consocc_agg_tail = df_lifetime_l.groupBy('consecutive_occurrences')\
                     .agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_consocc_agg_all = df_lifetime_all.groupBy('consecutive_occurrences')\
                     .agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_consocc_agg['Volume'] = 'Core'
df_consocc_agg_tail['Volume'] = 'Tail'
df_consocc_agg_all['Volume'] = 'Baseline'

df_consocc_agg['Percentage'] = df_consocc_agg['nb_pages'] /  df_consocc_agg['nb_pages'].sum() * 100
df_consocc_agg_tail['Percentage'] = df_consocc_agg_tail['nb_pages'] /  df_consocc_agg_tail['nb_pages'].sum() * 100
df_consocc_agg_all['Percentage'] = df_consocc_agg_all['nb_pages'] /  df_consocc_agg_all['nb_pages'].sum() * 100

In [None]:
# title='Distribution of pages max life time in the core',
fig = px.bar(pd.concat([df_consocc_agg, df_consocc_agg_tail, df_consocc_agg_all]).sort_values('consecutive_occurrences'),
            x = 'consecutive_occurrences', y='Percentage', color='Volume')
fig.update_layout(
    xaxis_title=dict(text='Number of occurrences in the core', font=dict(size=20)),
    yaxis_title=dict(text='Fraction of pages', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    legend = dict(font=dict(size=15)),
    legend_title = dict(font=dict(size=20)),
    height=800,
    width=800,
barmode='group')
fig.show()
#fig.write_html("interm_results/volume_dynamics/distribution_occurrences_core.html")
fig.write_image("../figures_report/volume_dynamics/distribution_ccurrences_core_fr.pdf")

# What characterises the incoming and outgoing fluxes?

In [None]:
# Single window over all rows, ordered by date (no partitioning), so that `lag` reaches
# the row of the preceding date.
w = Window.partitionBy().orderBy(asc('date'))
# Gather pages that are new compared to the month before
# Steps: (1) one row per date holding the set of core page ids,
#        (2) attach the id set of the previous row as 'prev_ids',
#        (3) keep the ids present in 'current_ids' but not in 'prev_ids', one row per id.
# NOTE(review): "previous row" equals "previous month" only if every month has a row — confirm.
df_fluxes = df_high_volume.groupBy('date').agg(collect_set('page_id').alias('current_ids'))\
                          .select(lag('current_ids').over(w).alias('prev_ids'), 'date', 'current_ids')\
                          .select(explode(array_except('current_ids', 'prev_ids')).alias('page_id'), 'date').cache()

In [None]:
# Gather pages which were already present the month before
df_common_month = df_high_volume.groupBy('date').agg(collect_set('page_id').alias('current_ids'))\
                          .select(lag('current_ids').over(w).alias('prev_ids'), 'date', 'current_ids')\
                          .select(explode(array_intersect('current_ids', 'prev_ids')).alias('page_id'), 'date').cache()

In [None]:
# Gather pages' creation date
df_ages = df_high_volume.join(df_metadata.select('page_id', 'creation_date'), 'page_id')
df_ages = df_ages.select('page_id',
                           months_between('date', 'creation_date').alias('age_in_months'),
                          'creation_date',
                           'date').cache()

In [None]:
# Remove pages for which the reported creation month is wrong
pages_moves = df_ages.where(col('age_in_months') < 0).select('page_id').distinct().cache()
df_fluxes_dates = df_fluxes.join(df_ages.select('page_id', 'creation_date', 'date'), on=['date', 'page_id']).join(pages_moves, on='page_id', how='leftanti').drop('age_in_months').cache()

### What is this incoming flux size over time ?

In [None]:
df_fluxsize = df_fluxes.groupBy('date').agg(count('*').alias('nb_incoming_pages')).toPandas()

In [None]:
df_totsize= df_high_volume.groupBy('date').agg(count('*').alias('nb_tot_pages')).toPandas()

In [None]:
# With match and no page moves filtering
# Share of the monthly core made of pages that were absent the month before.
df_fluxcompar = df_fluxsize.merge(df_totsize, on='date')
df_fluxcompar['perc'] = df_fluxcompar['nb_incoming_pages'] / df_fluxcompar['nb_tot_pages'] * 100
fig = px.bar(df_fluxcompar.sort_values('date'), x='date', y='perc')
# Axis titles follow the plotted columns: x is the date, y is the percentage
# (they were swapped before).
fig.update_layout(
title='Percentage of incoming pages over the total monthly core',
xaxis_title='Date',
yaxis_title='Percentage')
fig.show()
df_fluxcompar.describe()

### What is the creation date distribution among incoming pages ?

In [None]:
df_ages_creationd = df_fluxes_dates.groupBy('date', 'creation_date').agg(count('*').alias('nb_pages')).toPandas()
df_ages_creationd_sum = df_ages_creationd.groupby('date').agg(tot_pages=('nb_pages', 'sum')).reset_index()
df_ages_creationd = df_ages_creationd.merge(df_ages_creationd_sum, on='date')
df_ages_creationd['perc'] = df_ages_creationd['nb_pages'] / df_ages_creationd['tot_pages'] * 100

In [None]:
#title='Age distribution of core incoming pages in November 2022',
fig = px.bar(df_ages_creationd.loc[df_ages_creationd.date.astype(str) == '2022-11-01'], x ='creation_date', y = 'perc', log_y=False)
fig.update_layout(
    xaxis_title=dict(text='Creation Date', font=dict(size=20)),
    yaxis_title=dict(text='Percentage', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    height=600,
    width=1000)
fig.show()
fig.write_image("../figures_report/volume_dynamics/age_distrib_nov22_fr.pdf")

In [None]:
#title='Age distribution of core incoming pages in November 2022',
fig = px.bar(df_ages_creationd.sort_values('date'), x ='creation_date', y = 'perc', log_y=False, animation_frame='date')
fig.update_layout(
    xaxis_title=dict(text='Creation Date', font=dict(size=20)),
    yaxis_title=dict(text='Percentage', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    height=600,
    width=1000)
fig.show()
fig.write_html("../figures_report/volume_dynamics/age_distrib_flux_fr.html")

In [None]:
dates_summ = df_fluxes_dates.select('date', months_between('date', 'creation_date').alias('age_in_months'))\
                            .groupBy('date').agg(add_months('date', -percentile_approx('age_in_months', 0.5)).alias('med'),\
                                                 add_months('date', -percentile_approx('age_in_months', 0.1)).alias('10%'),\
                                                  add_months('date', -percentile_approx('age_in_months', 0.25)).alias('25%'),\
                                                  add_months('date', -percentile_approx('age_in_months', 0.75)).alias('75%'),\
                                                  add_months('date', -percentile_approx('age_in_months', 0.9)).alias('90%')).toPandas()

In [None]:
dates_summ.sort_values('date').head(60)

#### Check above results with my own data

In [None]:
df_fluxes_dates_check = df_fluxes.join(df_high_volume.select('page_id', 'first_date', 'date'), on=['date', 'page_id']).cache()

In [None]:
df_dates_check = df_fluxes_dates_check.groupBy('date', 'first_date').agg(count('*').alias('nb_pages')).toPandas()
df_ages_creationd_sum = df_dates_check.groupby('date').agg(tot_pages=('nb_pages', 'sum')).reset_index()
df_dates_check = df_dates_check.merge(df_ages_creationd_sum, on='date')
df_dates_check['perc'] = df_dates_check['nb_pages'] / df_dates_check['tot_pages'] * 100

In [None]:
#title='Age distribution of core incoming pages in November 2022',
fig = px.bar(df_dates_check.sort_values('date'), x ='first_date', y = 'perc', log_y=True, animation_frame='date')
fig.update_layout(
    xaxis_title=dict(text='Creation Date', font=dict(size=20)),
    yaxis_title=dict(text='Percentage', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    height=600,
    width=1000)
fig.show()

#### Is the bump before 2010 conserved over time?

In [None]:
df_flux_before_2010 = df_fluxes_dates.where(col('creation_date') <= to_date(lit('2010-01'), 'yyyy-MM')).select(col('date').cast('string'), months_between('date', 'creation_date').alias('age_in_months')).sample(0.5).toPandas()

In [None]:
dates = [ '2015-08-01', '2015-09-01', '2015-10-01', '2015-11-01', '2015-12-01']\
        + [f'{y}-{m}-01' for y in ['2016', '2017', '2018', '2019', '2020', '2021', '2022'] for m in
                       [f'0{i}' if i < 10 else i for i in range(1, 13, 1)]]\
        + ['2023-01-01', '2023-02-01', '2023-03-01']

In [None]:
from itertools import combinations
# All unordered pairs of months, then 500 pair indices drawn at random (with replacement).
combs = np.array(list(combinations(dates, 2)))
# random.randint includes its upper bound, so randint(0, len(combs)) could return len(combs)
# and make combs[idx] raise an IndexError; randrange excludes the upper bound.
idx = [random.randrange(len(combs)) for _ in range(500)]

In [None]:
from scipy.stats import mannwhitneyu

# For each sampled pair of months, run a Mann-Whitney U test comparing the 'age_in_months'
# values of the incoming pages (created before 2010) of the two months.
# d1 / d2 collect the two dates of each pair, pvalues the test p-values, n the number of tests run.
d1, d2, pvalues, n = [], [], [], 0
for date_pair in tqdm(combs[idx]):
    d1.append(date_pair[0])
    d2.append(date_pair[1])
    n += 1
    sample_1 = df_flux_before_2010.loc[df_flux_before_2010.date == date_pair[0]]['age_in_months']
    sample_2 = df_flux_before_2010.loc[df_flux_before_2010.date == date_pair[1]]['age_in_months']
    # Both samples are randomly down-sampled to the size of the smaller one before testing.
    # NOTE(review): if one month has no rows both samples become empty — confirm that cannot happen.
    res = mannwhitneyu(sample_1.sample(n=np.min([sample_1.shape[0],sample_2.shape[0]])).values,
          sample_2.sample(n=np.min([sample_1.shape[0],sample_2.shape[0]])).values)
    pvalues.append(res.pvalue)

# 0.05 / n is the significance threshold divided by the number of tests (Bonferroni-style correction).
print(f"Level of confidence = {0.05 / n}")
df_res = pd.DataFrame({'date_1' : d1, 'date_2' : d2, 'pvalue' : pvalues})
# Pairs whose p-value is above the corrected threshold.
df_res[df_res.pvalue > 0.05/n]

### What is the proportion of newly created pages over the rest of the flux ?

In [None]:
# Number of pages newly created in incoming flux
df_creation_flux = df_dates_check.loc[df_dates_check['date'] == df_dates_check['first_date']]
df_creation_flux.describe()

In [None]:
# With my definition of creation date, ie. estimation
plt.figure()
df_creation_flux.sort_values('date').set_index('date')['perc'].plot()
plt.title('Percentage of newly created pages in incoming flux - Estimated')

In [None]:
dfs_all = dfs.join(dfs_change, dfs_change.page_ids == dfs.page_id)
df_creation_all = dfs_all.where(col('date') == col('first_date')).groupBy('date').agg(count('*').alias('nb_pages_all')).toPandas()

In [None]:
df_new_prop = df_creation_flux.merge(df_creation_all, on='date')

In [None]:
df_new_prop['perc_core'] = df_new_prop['nb_pages'] / df_new_prop['nb_pages_all'] * 100

In [None]:
plt.figure()
df_new_prop.sort_values('date').set_index('date')['perc_core'].plot()
plt.title('Percentage of newly created pages going directly to the core - Estimated')

In [None]:
# With metadata creation date
df_ages_creationd[(df_ages_creationd['date'] == df_ages_creationd['creation_date']) & ((df_ages_creationd['date'] <= pd.to_datetime('2022-11-01')))].sort_values('date').set_index('date')['perc'].plot()
plt.title('Percentage of newly created pages in the core incoming pages over time')
plt.xlabel('Date')
plt.ylabel('Percentage')

In [None]:
df_low_dates_sp = df_low_volume.join(df_metadata, on='page_id').where(col('date') == col('creation_date')).cache()
df_low_dates_sp.show()

In [None]:
df_low_dates = df_low_volume.join(df_metadata, on='page_id').where(col('date') == col('creation_date')).groupBy('date').agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_core_new = df_ages_creationd[(df_ages_creationd['date'] == df_ages_creationd['creation_date']) & ((df_ages_creationd['date'] <= pd.to_datetime('2022-11-01')))]

In [None]:
df_creation_fluxes_meta = df_low_dates.rename(columns={'nb_pages' : 'nb_pages_tail'}).merge(df_core_new, on='date')

In [None]:
df_creation_fluxes_meta['perc'] = df_creation_fluxes_meta['nb_pages'] / (df_creation_fluxes_meta['nb_pages'] + df_creation_fluxes_meta['nb_pages_tail']) * 100

In [None]:
plt.figure()
df_creation_fluxes_meta.sort_values('date').set_index('date')['perc'].plot()
plt.title('Percentage of pages going directly to the core - Metadata')

In [None]:
df_creation_fluxes_meta.loc[df_creation_fluxes_meta.date >= pd.to_datetime('2020-03')].describe()

### What percentage of the pages created the previous month go to the core the following month?

In [None]:
df_ages_creationd = df_ages_creationd[~df_ages_creationd['creation_date'].isnull()]

In [None]:
df_ages_creationd['creation_date_1month'] = df_ages_creationd['creation_date'].apply(lambda d : d + pd.DateOffset(months=1))
df_core_1month = df_ages_creationd.loc[(df_ages_creationd['date'] == df_ages_creationd['creation_date_1month']) & (df_ages_creationd['date'] <= pd.to_datetime('2022-11-01'))]

In [None]:
df_low_dates_1month = df_low_volume.join(df_metadata, on='page_id').where(col('date') == add_months(col('creation_date'), 1)).groupBy('date').agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_est = df_core_1month.merge(df_low_dates_1month.rename(columns={'nb_pages' : 'nb_pages_tail'}), on='date')
df_est['perc'] = df_est['nb_pages'] / (df_est['nb_pages'] + df_est['nb_pages_tail']) * 100
df_est.loc[df_est['date'] <= pd.to_datetime('2022-11-01')].sort_values('date').set_index('date')['perc'].plot()

### What is the proportion of incoming pages over overlap size? (It should plateau a bit before 43%)

In [None]:
# For every lag i (in months), compute over all months the mean / std of the percentage of
# core pages that were not in the core i rows earlier, and the number of months it is based on.
percs_avg, percs_std, nb = [], [], []

# One row per date with the set of core page ids. It does not depend on the lag, so it is
# computed once and cached instead of being rebuilt at each of the 93 iterations.
# NOTE(review): the original referenced `df_high_volume_mar23`, which is never defined in this
# notebook; `df_high_volume` (extracted from the full `dfs`) is used instead — confirm.
monthly_ids = df_high_volume.groupBy('date').agg(collect_set('page_id').alias('current_ids')).cache()
w = Window.partitionBy().orderBy(asc('date'))

for i in tqdm(range(1,94)):
    # Named `lag_stats` rather than `stats` so that the `scipy.stats` module imported at the
    # top of the notebook is not overwritten.
    lag_stats = monthly_ids\
                              .select(lag('current_ids', offset=i).over(w).alias('prev_ids'), 'date', 'current_ids')\
                              .select((size(array_except('current_ids', 'prev_ids')) / size('current_ids') * 100).alias('new'), 'date')\
                              .where(col('new') > 0)\
                              .select(avg('new').alias('avg'), stddev('new').alias('std'), count('*').alias('nb_pages')).collect()
    percs_avg.append(lag_stats[0]['avg'])
    percs_std.append(lag_stats[0]['std'])
    nb.append(lag_stats[0]['nb_pages'])

In [None]:
# One row per lag: mean percentage, its std, the number of months used, and a 95% error bar
# (1.96 * std / sqrt(nb)).
df_plot = pd.DataFrame({'Mean overlap percentage' : percs_avg, 'std' : percs_std, 'nb' : nb, 'Number of consecutive months': [i for i in range(1,94)]})
df_plot['error'] = df_plot['std'] * 1.96 / np.sqrt(df_plot['nb'])

fig = px.line(df_plot, y='Mean overlap percentage', x='Number of consecutive months', error_y='error')
fig.update_layout(
    xaxis_title=dict(text='Number of consecutive months', font=dict(size=20)),
    yaxis_title=dict(text='Mean overlap percentage', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    height=500,
    width=900)
fig.show()
# Output folders are created one level up at the top of the notebook, hence the '../' prefix
# (it was missing here, unlike in the other cells).
fig.write_html("../interm_results/volume_dynamics/prop_incoming.html")
fig.write_image("../figures_report/volume_dynamics/prop_incoming.pdf")

### What are the incoming flux semantics ?

In [None]:
df_topics_sp = spark.read.parquet('/scratch/descourt/metadata/topics/topic_en/topics-enwiki-20230320-parsed.parquet').select(col('page_id').cast('string').alias('page_id'), col('topics_specific_unique').alias('topic')).distinct().cache()
# 

In [None]:
df_fluxes_top = df_fluxes.where(col('date') <= to_date(lit('2022-11'), 'yyyy-MM'))
df_fluxes_top = df_fluxes_top.join(df_topics_sp.select('page_id', 'topic').distinct(), 'page_id').cache()
df_fluxes_top_agg = df_fluxes_top.groupBy('date', 'topic').agg(count('*').alias('nb_pages')).toPandas()

In [None]:
df_fluxes_top_agg_sum = df_fluxes_top_agg.groupby('date').agg(tot_pages=('nb_pages', 'sum'))

In [None]:
df_fluxes_top_agg = df_fluxes_top_agg.merge(df_fluxes_top_agg_sum, on='date')
df_fluxes_top_agg['perc'] = df_fluxes_top_agg['nb_pages'] / df_fluxes_top_agg['tot_pages'] * 100

In [None]:
# Per-topic mean and std of the monthly percentage, plus the number of months the topic
# actually appears in ('nb_months').
df_fluxes_top_agg_agg = df_fluxes_top_agg.groupby('topic').agg(avg_perc=('perc', 'mean'), std_perc=('perc', 'std'), nb_months=('perc', 'count')).reset_index()
# 95% error uses the real number of observations per topic instead of a hard-coded 93:
# the flux is restricted to dates <= 2022-11 and a topic may be absent from some months.
df_fluxes_top_agg_agg['error'] = df_fluxes_top_agg_agg['std_perc'] * 1.96 / np.sqrt(df_fluxes_top_agg_agg['nb_months'])

In [None]:
df_fluxes_top_agg_agg.sort_values('avg_perc')

In [None]:
color_map = { t: 'lightblue' if 'stem' in t else 'orange' if 'history' in t else 'magenta' if 'geography' in t else 'green' for t in df_fluxes_top_agg_agg.topic.values }

### How much interest is gathered by this flux vs the rest ?
TODO -> average and CI not reliable

In [None]:
# Attach monthly views to the incoming pages and to the pages already present the month before.
# 'rank' is not selected: df_high_volume does not carry that column after its join with dfs_change.
df_fluxes_views = df_fluxes.join(df_high_volume.select('page_id', 'date', 'tot_count_views'), on=['page_id', 'date'])
df_common_views = df_common_month.join(df_high_volume.select('page_id', 'date', 'tot_count_views'), on=['page_id', 'date'])
# Per-month mean / std of views and number of pages for each group.
# (An extra opening parenthesis in the first aggregation made this cell a SyntaxError.)
fluxes_stats = df_fluxes_views.groupBy('date').agg(avg('tot_count_views').alias('avg_views'), stddev('tot_count_views').alias('std_views'), count('*').alias('nb_pages')).toPandas()
# overall_stats is used by the plotting cell that follows, so it must be computed here.
overall_stats = df_high_volume.groupBy('date').agg(avg('tot_count_views').alias('avg_views'), stddev('tot_count_views').alias('std_views'), count('*').alias('nb_pages')).toPandas()
common_stats = df_common_views.groupBy('date').agg(avg('tot_count_views').alias('avg_views'), stddev('tot_count_views').alias('std_views'), count('*').alias('nb_pages')).toPandas()

In [None]:
# Label each statistics frame, add the 95% CI half-width of the mean, then
# plot the average views per month as grouped bars.
labelled_frames = []
for frame, label in (
    (fluxes_stats, 'Incoming pages'),
    (overall_stats, 'Entire core'),
    (common_stats, 'Stable core'),
):
    frame['Pages'] = label
    labelled_frames.append(frame)

for frame in labelled_frames:
    frame['error'] = frame['std_views']*1.96 / np.sqrt(frame['nb_pages'])

df_plot = pd.concat(labelled_frames)
fig = px.bar(df_plot.sort_values('date'), x='date', y='avg_views', error_y='error', color='Pages')
fig.update_layout(barmode='group')
fig.show()

### What are the outgoing pages like ?

In [None]:
# For each month, list the page ids present in the previous month's set but
# absent from the current one; 'date' is the month where the page is missing.
w = Window.partitionBy().orderBy(asc('date'))
monthly_ids = df_high_volume.groupBy('date').agg(collect_set('page_id').alias('current_ids'))
monthly_with_prev = monthly_ids.select(lag('current_ids').over(w).alias('prev_ids'), 'date', 'current_ids')
df_outgoing = monthly_with_prev.select(
    explode(array_except('prev_ids', 'current_ids')).alias('disappeared'),
    'date',
).cache()

In [None]:
# Display the DataFrame (columns: disappeared, date).
df_outgoing

In [None]:
# Number of disappeared pages per month, with the date shifted back one month.
# NOTE(review): the column is named 'nb_incoming_pages' although it counts
# rows of df_outgoing; the name is kept because later cells read it.
df_outgoingfluxsize = (
    df_outgoing.select(add_months('date', -1).alias('date'))
    .groupBy('date')
    .count()
    .withColumnRenamed('count', 'nb_incoming_pages')
    .toPandas()
)

In [None]:
# Percentage of each month's total page count that belongs to the outgoing flux.
df_outgoingcompar = df_outgoingfluxsize.merge(df_totsize, on='date')
df_outgoingcompar['perc'] = df_outgoingcompar['nb_incoming_pages'] / df_outgoingcompar['nb_tot_pages'] * 100
fig = px.bar(df_outgoingcompar.sort_values('date'), x='date', y='perc')
fig.update_layout(
    title='Percentage of outgoing pages over the total monthly core',
    # x carries the months and y the percentage (the titles were swapped).
    xaxis_title='Date',
    yaxis_title='Percentage')
fig.show()
fig.write_html("interm_results/volume_dynamics/core_outgoing_pages.html")

In [None]:
# Summary statistics of the numeric columns of the outgoing-flux comparison.
df_outgoingcompar.describe()

In [None]:
# Attach topics to the disappeared pages and count rows per (month, topic).
# NOTE(review): 'disappeared' is cast to bigint while df_topics_sp.page_id is
# a string column; the join relies on Spark's implicit cast - confirm.
outgoing_ids = df_outgoing.select('date', col('disappeared').cast('bigint').alias('disappeared'))
df_outgoingflux = outgoing_ids.join(df_topics_sp, df_topics_sp.page_id == outgoing_ids.disappeared)
df_outgoingflux_agg = (
    df_outgoingflux.groupBy('date', 'topic')
    .agg(count('*').alias('nb_pages'))
    .toPandas()
)

In [None]:
# Monthly totals, then each topic's share of the month's outgoing rows in percent.
df_sum = (
    df_outgoingflux_agg.groupby('date', as_index=False)['nb_pages']
    .sum()
    .rename(columns={'nb_pages': 'tot_pages'})
)
df_outgoingflux_agg = pd.merge(df_outgoingflux_agg, df_sum, on='date')
df_outgoingflux_agg['perc'] = df_outgoingflux_agg['nb_pages'].div(df_outgoingflux_agg['tot_pages']).mul(100)

In [None]:
# Stack the outgoing and incoming per-topic tables and order the rows by topic.
both_fluxes = [df_outgoingflux_agg, df_fluxes_top_agg]
df_top_fluxes = pd.concat(both_fluxes).sort_values('topic')

In [None]:
# Tag each table with its flux direction so the two can share one box plot.
df_outgoingflux_agg['Flux'] = 'Outgoing'
df_fluxes_top_agg['Flux'] = 'Incoming'

# Horizontal box plot: one box per topic (y) showing the distribution of the
# monthly percentage (x), split by flux direction.
fig = px.box(pd.concat([df_outgoingflux_agg, df_fluxes_top_agg]).sort_values('topic'), y='topic', x='perc', orientation='h', color='Flux')
fig.update_layout(
    width=1000,
    height=2100,
    # x holds the percentages and y the topics (the titles were swapped).
    xaxis_title=dict(text='Percentage', font=dict(size=20)),
    yaxis_title=dict(text='Topic', font=dict(size=20)),
    yaxis=dict(tickfont=dict(size=17)),
    xaxis=dict(tickfont=dict(size=20)),
    legend=dict(font=dict(size=20)))
fig.show()
fig.write_image('figures_report/volume_dynamics/topics_fluxes.pdf')

In [None]:
# Disappeared pages keyed by the previous month, joined with their creation
# date from df_ages; pages listed in pages_moves are excluded (left anti join).
outgoing_prev_month = df_outgoing.select(
    add_months('date', -1).alias('date'),
    col('disappeared').alias('page_id'),
)
page_ages = df_ages.select('page_id', 'creation_date', 'date')
df_outgoing_dates = outgoing_prev_month.join(page_ages, on=['date', 'page_id'])
df_outgoing_dates = (
    df_outgoing_dates.join(pages_moves, on='page_id', how='leftanti')
    .drop('age_in_months')
    .cache()
)

In [None]:
# Rows per (month, creation_date), then their share of the month's total in percent.
df_ages_out_creationd = (
    df_outgoing_dates.groupBy('date', 'creation_date')
    .agg(count('*').alias('nb_pages'))
    .toPandas()
)
df_ages_creationd_sum = (
    df_ages_out_creationd.groupby('date', as_index=False)['nb_pages']
    .sum()
    .rename(columns={'nb_pages': 'tot_pages'})
)
df_ages_out_creationd = pd.merge(df_ages_out_creationd, df_ages_creationd_sum, on='date')
df_ages_out_creationd['perc'] = df_ages_out_creationd['nb_pages'].div(df_ages_out_creationd['tot_pages']).mul(100)

In [None]:
# Rows of df_high_volume for the disappeared pages, matched on the month
# preceding the one in which they are missing.
outgoing_keys = df_outgoing.select(
    add_months('date', -1).alias('date'),
    col('disappeared').alias('page_id'),
)
df_outgoing_full = outgoing_keys.join(df_high_volume, on=['date' , 'page_id']).cache()

In [None]:
# Creation-date distribution (in percent) of the rows of df_ages_out_creationd
# dated 2022-11-01. No title is set on the figure.
is_nov22 = df_ages_out_creationd.date.astype(str) == '2022-11-01'
fig = px.bar(df_ages_out_creationd.loc[is_nov22], x ='creation_date', y = 'perc', log_y=False)
axis_font = dict(size=20)
fig.update_layout(
    xaxis_title=dict(text='Creation Date', font=axis_font),
    yaxis_title=dict(text='Percentage', font=axis_font),
    yaxis=dict(tickfont=axis_font),
    xaxis=dict(tickfont=axis_font),
    height=600,
    width=1000)
fig.show()
#fig.write_image("figures_report/volume_dynamics/age_distrib_nov22.pdf")

### What is the proportion of outgoing pages over overlap size ?

below not run, but compare incoming with outgoing for 2 consecutive months

In [None]:
# For every lag i (in months), compare each month's page set with the set i
# months earlier and record the mean / std / count of the percentage of pages
# that are in the earlier set but not in the current one.
percs_avg_out, percs_std_out, nb_out = [], [], []

# The window and the per-month id sets do not depend on the lag: define them once.
w = Window.partitionBy().orderBy(asc('date'))
monthly_ids_mar23 = df_high_volume_mar23.groupBy('date').agg(collect_set('page_id').alias('current_ids'))

for i in tqdm(range(1, 94)):
    # Named lag_row rather than `stats` so that the `scipy.stats` module
    # imported at the top of the notebook is not shadowed.
    lag_row = monthly_ids_mar23\
        .select(lag('current_ids', offset=i).over(w).alias('prev_ids'), 'date', 'current_ids')\
        .select((size(array_except('prev_ids', 'current_ids')) / size('current_ids') * 100).alias('out'), 'date')\
        .where(col('out') > 0)\
        .select(avg('out').alias('avg'), stddev('out').alias('std'), count('*').alias('nb'))\
        .collect()[0]
    percs_avg_out.append(lag_row['avg'])
    percs_std_out.append(lag_row['std'])
    nb_out.append(lag_row['nb'])

In [None]:
# Line plot of the per-lag mean with a 95% CI half-width as error bars.
df_plot = pd.DataFrame({
    'Mean overlap percentage': percs_avg_out,
    'std': percs_std_out,
    'nb': nb_out,
    'Number of consecutive months': list(range(1, 94)),
})
df_plot['error'] = df_plot['std'] * 1.96 / np.sqrt(df_plot['nb'])

fig = px.line(df_plot, y='Mean overlap percentage', x='Number of consecutive months', error_y='error')
axis_font = dict(size=20)
fig.update_layout(
    xaxis_title=dict(text='Number of consecutive months', font=axis_font),
    yaxis_title=dict(text='Mean overlap percentage', font=axis_font),
    yaxis=dict(tickfont=axis_font),
    xaxis=dict(tickfont=axis_font),
    height=900)
fig.show()
fig.write_html("interm_results/volume_dynamics/prop_outgoing.html")
fig.write_image("figures_report/volume_dynamics/prop_outgoing.pdf")

 ## Were articles coming from the tail to the core in the core before ?

In [None]:
# Set of months in which each page appears in df_high_volume.
df_inthecore = df_high_volume.groupBy('page_id').agg(collect_set('date').alias('dates_in_core')).cache()
df_fluxes = df_fluxes.join(df_inthecore, 'page_id').cache()

# For each flux row, count how many of the 89 preceding months are in that set.
# NOTE(review): range(1, 90) looks back 89 months while other cells use 93 - confirm.
prev_dates = array(*[add_months('date', -i) for i in range(1,90)]).alias('prev_dates')
nb_prev_times = size(array_intersect('prev_dates', 'dates_in_core')).alias('nb_prev_times_core')
df_fluxes_in_core = (
    df_fluxes.select('page_id', prev_dates, 'dates_in_core', 'date')
    .select('page_id', nb_prev_times, 'date')
    .cache()
)

In [None]:
# Number of flux rows per (month, number of previous months in the core).
df_from_core = (
    df_fluxes_in_core.groupBy('date', 'nb_prev_times_core')
    .count()
    .withColumnRenamed('count', 'nb_pages')
    .toPandas()
)

In [None]:
# Percentage of flux pages by number of previous months spent in the core,
# shown as a bar chart animated over the months.
df_from_core_sum = (
    df_from_core.groupby('date', as_index=False)['nb_pages']
    .sum()
    .rename(columns={'nb_pages': 'tot_pages'})
)
df_from_core = pd.merge(df_from_core, df_from_core_sum, on='date')
df_from_core['perc'] = df_from_core['nb_pages'].div(df_from_core['tot_pages']).mul(100)
fig = px.bar(df_from_core.sort_values('date'), x='nb_prev_times_core', y='perc', animation_frame='date')
fig.show()

In [None]:
# Summary statistics for the pages never seen in the core before
# (nb_prev_times_core == 0), restricted to the most recent months.
# NOTE(review): the list spans 13 months (2022-03 to 2023-03) - confirm this
# matches the "last year" the summary is meant to cover.
recent_months = [
    '2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01', '2022-07-01',
    '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01',
    '2023-01-01', '2023-02-01', '2023-03-01',
]
is_recent = df_from_core.date.astype('str').isin(recent_months)
is_new_to_core = df_from_core.nb_prev_times_core == 0
df_from_core[is_recent & is_new_to_core].describe()

In [None]:
# Half-width of a 95% normal-approximation confidence interval: std * 1.96 / sqrt(n).
# NOTE(review): 1.635571 is presumably the std copied from a describe() output
# and 12 the number of months - confirm both against the actual selection.
1.635571 * 1.96 / np.sqrt(12)

On average over the full year 2022, 15.6% of the articles in the incoming flux were new to the core, i.e. either coming from the tail or newly created (although we could assume that such articles were "born in the tail"). 
We also know that newly created pages make up 1.77% of the incoming flux (see [this section](#What-is-the-proportion-of-new-/-the-rest-?) ). Therefore 100 - 1.77 / 15.6 * 100 = 89 % of the pages new to the core come from the tail.

### How old are articles emerging from the tail to the core ?

In [None]:
# A count of 0 previous months in the core means the page was never seen in
# the core over the look-back window; keep those rows and attach creation dates.
never_in_core = df_fluxes_in_core.where('nb_prev_times_core = "0"')
creation_dates = df_metadata.select('page_id', 'creation_date')
df_comingfromthetail = never_in_core.join(
    creation_dates,
    df_metadata.page_id == df_fluxes_in_core.page_id,
)
# Number of such pages per (month, creation date), collected to pandas.
df_agesfromthetail = (
    df_comingfromthetail.groupBy('date', 'creation_date')
    .agg(count('*').alias('nb_pages'))
    .toPandas()
)

In [None]:
# Drop the pages whose creation date equals the month they enter the core,
# then compute the creation-date distribution (in percent) for each month.
# Both columns are converted to datetime64 before comparing: comparing a
# datetime64 value with a plain datetime.date object is treated as "never
# equal" by recent pandas versions, which would silently keep every row.
created_this_month = (
    pd.to_datetime(df_agesfromthetail['creation_date'])
    == pd.to_datetime(df_agesfromthetail['date'])
)
df_agesfromthetail_tail = df_agesfromthetail[~created_this_month]
df_agesfromthetail_sum = df_agesfromthetail_tail.groupby('date').agg(tot_pages=('nb_pages', 'sum')).reset_index()
df_agesfromthetail_tail = df_agesfromthetail_tail.merge(df_agesfromthetail_sum, on='date')
df_agesfromthetail_tail['perc'] = df_agesfromthetail_tail['nb_pages'] / df_agesfromthetail_tail['tot_pages'] * 100

fig = px.bar(df_agesfromthetail_tail.sort_values('date'), x='creation_date', y='perc', animation_frame='date')
fig.show()

In [None]:
# Save the current figure as a standalone HTML file.
fig.write_html("interm_results/volume_dynamics/agesfromthetail.html")

# How much interest is gathered by pages always in the core versus the rest ?

In [None]:
# High- and low-volume subsets of the page views, reduced to the same four columns.
volume_columns = ('date', 'tot_count_views', 'page', 'page_id')
df_high_volume = extract_volume(dfs, high=True).select(*volume_columns).cache()
df_low_volume = extract_volume(dfs, high=False).select(*volume_columns).cache()

In [None]:
# A page is flagged 'stable' within a dataset when it has exactly 93 rows there.
core_occurrences = df_high_volume.groupBy('page_id').agg(count('*').alias('nb_occ'))
tail_occurrences = df_low_volume.groupBy('page_id').agg(count('*').alias('nb_occ'))
all_occurrences = dfs.groupBy('page_id').agg(count('*').alias('nb_occ'))

stable_core = df_high_volume.join(core_occurrences, on=['page_id']).withColumn('stable', col('nb_occ') == 93)
stable_tail = df_low_volume.join(tail_occurrences, on=['page_id']).withColumn('stable', col('nb_occ') == 93)
stable_all = dfs.join(all_occurrences, on=['page_id']).withColumn('stable', col('nb_occ') == 93)

# Average views and 95% CI half-width per (stable flag, month).
avg_views = avg('tot_count_views').alias('Average Views')
ci_error = (stddev('tot_count_views') * 1.96 / sqrt(count('*'))).alias('error')

df_stats_core = stable_core.groupBy('stable', 'date').agg(avg_views, ci_error).toPandas()
df_stats_tail = stable_tail.groupBy('stable', 'date').agg(avg_views, ci_error).toPandas()
df_stats_all = stable_all.groupBy('stable', 'date').agg(avg_views, ci_error).toPandas()

In [None]:
# Pull a 1% random sample of the core rows into pandas.
sample_stats_core = stable_core.sample(fraction=0.01).toPandas()

In [None]:
# Sorted distinct months present in the core sample.
dates = np.unique(sample_stats_core['date'].to_numpy())

In [None]:
# Box plot of views for stable vs. non-stable core pages, every third month.
# NOTE(review): sample(156928) raises if fewer rows remain after filtering.
every_third_month = sample_stats_core.date.isin(dates[::3])
df_plot = sample_stats_core.loc[every_third_month].sample(156928)
fig = px.box(df_plot, x='date', y='tot_count_views', color='stable', log_y=True, color_discrete_map = {'True': 'red', 'False':'blue'})
title_font = dict(size=25)
fig.update_layout(
    height=700,
    width=2100,
    xaxis_title=dict(text='Date', font=title_font),
    yaxis_title=dict(text='Number of views', font=title_font),
    yaxis=dict(tickfont=title_font),
    xaxis=dict(tickfont=title_font),
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=title_font, text='Stable'))
fig.show()
fig.write_image("figures_report/volume_dynamics/avg_views_core_stability_box_red.pdf")

In [None]:
# Random samples pulled into pandas: 1% of the tail rows, 0.1% of all rows.
sample_stats_tail = stable_tail.sample(fraction=0.01).toPandas()
sample_stats_all = stable_all.sample(fraction=0.001).toPandas()

In [None]:
# Box plot of views for stable vs. non-stable tail pages, every third month.
# NOTE(review): sample(156928) raises if fewer rows remain after filtering.
every_third_month = sample_stats_tail.date.isin(dates[::3])
df_plot = sample_stats_tail.loc[every_third_month].sample(156928)
fig = px.box(df_plot, x='date', y='tot_count_views', color='stable', log_y=True,  color_discrete_map = {'True': 'red', 'False':'blue'})
title_font = dict(size=25)
fig.update_layout(
    height=700,
    width=2100,
    xaxis_title=dict(text='Date', font=title_font),
    yaxis_title=dict(text='Number of views', font=title_font),
    yaxis=dict(tickfont=title_font),
    xaxis=dict(tickfont=title_font),
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=title_font, text='Stable'))
fig.show()
fig.write_image("figures_report/volume_dynamics/avg_views_tail_stability_box_red.pdf")

In [None]:
# Box plot of views for stable vs. non-stable pages over all rows, every third month.
# NOTE(review): sample(156928) raises if fewer rows remain after filtering.
every_third_month = sample_stats_all.date.isin(dates[::3])
df_plot = sample_stats_all.loc[every_third_month].sample(156928)
fig = px.box(df_plot, x='date', y='tot_count_views', color='stable', log_y=True,  color_discrete_map = {'True': 'red', 'False':'blue'})
title_font = dict(size=25)
fig.update_layout(
    height=700,
    width=2100,
    xaxis_title=dict(text='Date', font=title_font),
    yaxis_title=dict(text='Number of views', font=title_font),
    yaxis=dict(tickfont=title_font),
    xaxis=dict(tickfont=title_font),
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=title_font, text='Stable'))
fig.show()
#fig.write_image("figures_report/volume_dynamics/avg_views_tail_stability_box_red.pdf")

In [None]:

# Views of stable (df1) and non-stable (df2) core pages in January 2020.
# The masks are built from sample_stats_core itself: the original indexed the
# core sample with masks computed on sample_stats_tail, which selects
# unrelated rows by index label. The date column is converted to datetime64 so
# the comparison also works when it holds plain datetime.date objects.
core_is_stable = sample_stats_core['stable']
core_is_jan20 = pd.to_datetime(sample_stats_core['date']) == pd.to_datetime('2020-01-01')
df1 = sample_stats_core.sample(160000).loc[core_is_stable & core_is_jan20]['tot_count_views'].values
df2 = sample_stats_core.sample(160000).loc[~core_is_stable & core_is_jan20]['tot_count_views'].values


In [None]:
from scipy.stats import mannwhitneyu

# For every month, run a Mann-Whitney U test with alternative='less' on the
# views of stable (df1) vs. non-stable (df2) pages of the tail sample.
# Each group is drawn from its own random 160000-row sample.
#pvalue_g = []
pvalue_l = []
#pvalue_diff = []

tail_is_stable = sample_stats_tail['stable']
for d in tqdm(dates):
    tail_in_month = sample_stats_tail['date'] == d
    df1 = sample_stats_tail.sample(160000).loc[tail_is_stable & tail_in_month]['tot_count_views'].values
    df2 = sample_stats_tail.sample(160000).loc[~tail_is_stable & tail_in_month]['tot_count_views'].values

    test_less = mannwhitneyu(df1, df2, alternative='less')
    pvalue_l.append(test_less.pvalue)
    #pvalue_diff.append(mannwhitneyu(df1, df2).pvalue)

In [None]:
# For every third month, Mann-Whitney U tests on the views of stable (df1)
# vs. non-stable (df2) pages of the core sample: alternative='greater' and
# the default alternative.
pvalue_g_core = []
pvalue_diff_core = []

core_is_stable = sample_stats_core['stable']
for d in tqdm(dates[::3]):
    core_in_month = sample_stats_core['date'] == d
    df1 = sample_stats_core.sample(160000).loc[core_is_stable & core_in_month]['tot_count_views'].values
    df2 = sample_stats_core.sample(160000).loc[~core_is_stable & core_in_month]['tot_count_views'].values

    test_greater = mannwhitneyu(df1, df2, alternative='greater')
    pvalue_g_core.append(test_greater.pvalue)
    test_default = mannwhitneyu(df1, df2)
    pvalue_diff_core.append(test_default.pvalue)

In [None]:
# For every third month, Mann-Whitney U tests on the views of stable (df1)
# vs. non-stable (df2) pages of the all-pages sample: alternative='greater'
# and the default alternative.
pvalue_g_all = []
pvalue_diff_all = []

all_is_stable = sample_stats_all['stable']
for d in tqdm(dates[::3]):
    all_in_month = sample_stats_all['date'] == d
    df1 = sample_stats_all.sample(160000).loc[all_is_stable & all_in_month]['tot_count_views'].values
    df2 = sample_stats_all.sample(160000).loc[~all_is_stable & all_in_month]['tot_count_views'].values

    test_greater = mannwhitneyu(df1, df2, alternative='greater')
    pvalue_g_all.append(test_greater.pvalue)
    test_default = mannwhitneyu(df1, df2)
    pvalue_diff_all.append(test_default.pvalue)