# Installing pyspark requirements

In [None]:
import os
import sys

# Java installation that pyspark should use
os.environ["JAVA_HOME"] = "/lib/jvm/java-11-openjdk-amd64"
# Put the parent directory on the import path; without it the custom
# `src.*` modules imported below fail to import
sys.path.append('../')
# Folder that receives the figures saved by this notebook
os.makedirs("../figures_report/rank_stability", exist_ok=True)

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(15,10)})
import plotly.graph_objs as go
import plotly.express as px

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [None]:
import pyspark

In [None]:
from src.ranking_helpers import *
from src.rank_turbulence_divergence import *
from src.pages_groups_extraction import*

## Initialize context 

In [None]:
# Spark configuration for a run on a single machine ("local[3]" master).
conf = (
    pyspark.SparkConf()
    .setMaster("local[3]")
    .set('spark.driver.memory', '100G')
    .set('spark.executor.memory', '100G')
    # '0' removes the size limit on results collected to the driver
    .set('spark.driver.maxResultSize', '0')
    .set('spark.executor.cores', '3')
    # scratch directory Spark uses for its temporary files
    .set('spark.local.dir', '/scratch/descourt/spark')
)

# Session (created, or reused if one already exists in this kernel)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Underlying context, with logging reduced to errors only
sc = spark.sparkContext
sc.setLogLevel('ERROR')

# Plotting functions

In [None]:
def plot_salient(df, group='date', path=None):
    """Show, for each value of ``group``, the pages contributing to the divergence.

    Two horizontal bar traces are drawn per group: red bars for the rows with
    ``div_sign < 0`` and green bars for the rows with ``div_sign > 0`` (rows
    with ``div_sign == 0`` are not drawn). A slider displays one group at a
    time, starting with the first one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``div``, ``div_sign``, ``page``, ``ranks``
        and ``month``, plus the column named by ``group``.
    group : str, default 'date'
        Column whose distinct values become the slider steps.
    path : str or None, default None
        When given, the figure is additionally written to this path as HTML.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    def _legend_name(subset):
        # Label a trace with the first month found in its rows. An empty
        # subset has no month: leave the trace unnamed instead of raising
        # IndexError on `unique()[0]`.
        months = subset['month'].unique()
        return str(months[0]) if len(months) else None

    fig = go.Figure()

    group_keys = []
    for key, grp in df.groupby(group):
        # The slider starts on its first step (active=0), so only the first
        # group's traces may be visible when the figure is first rendered.
        initially_visible = len(group_keys) == 0
        sides = (
            (grp[grp['div_sign'] < 0], 'red'),
            (grp[grp['div_sign'] > 0], 'green'),
        )
        # Both traces are always added, even if one side is empty, so that
        # group number k owns exactly traces 2k and 2k+1 (the slider below
        # relies on this layout).
        for subset, colour in sides:
            fig.add_trace(
                go.Bar(
                    x=subset['div'],
                    y=subset['page'],
                    name=_legend_name(subset),
                    orientation='h',
                    text=subset['ranks'],
                    textposition="outside",
                    marker={'color': colour},
                    visible=initially_visible,
                    hovertemplate="Divergence %{x} <br>Ranks %{text}"))
        group_keys.append(key)

    # One slider step per group: show its two traces, hide all the others
    n_traces = len(fig.data)
    steps = []
    for pos, key in enumerate(group_keys):
        visible = [False] * n_traces
        visible[2 * pos] = True
        visible[2 * pos + 1] = True
        steps.append(dict(
            method="update",
            args=[{"visible": visible},
                  {"title": f"Slider switched to {group}: " + str(key)}],
            label=str(key),
        ))

    sliders = [dict(
        active=0,
        currentvalue={"prefix": f"{group}: "},
        pad={"t": 50},
        steps=steps
    )]

    fig.update_layout(
        sliders=sliders,
        showlegend=True,
        autosize=False,
        height=800,
        width=1000,
        xaxis_title='Individual divergence contribution',
        yaxis_title='Page',
        legend_title='Dates to compare',
        title='Top 20 pages contributing to the divergence for each month'
    )

    fig.show()

    if path is not None:
        fig.write_html(path)
    
    


# Data

## Load data

`df_divs` was computed beforehand with the following command: `python make_and_plot.py --mode rtd --alpha 0.3 --memory 120`

In [None]:
# Pageviews parquet (file name: English edition, 2015-2023); every row is tagged
# with a constant `project` column equal to 'en'.
dfs = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_2015-2023.parquet").withColumn('project', lit('en'))

In [None]:
# Precomputed divergences (RTD_all.parquet); the columns used further down are
# `page_id`, `date`, `div`, `rank_1`, `rank_2` and `page`.
df_divs = spark.read.parquet('/scratch/descourt/plots/thesis/RTD_all.parquet')

In [None]:
# Divergences from RTD_alphas.parquet; grouped by an `alpha` column further down
# (presumably the same computation repeated for several alpha values — confirm).
df_divs_alpha = spark.read.parquet('/scratch/descourt/plots/thesis/RTD_alphas.parquet')

In [None]:
# Topics: page metadata used below through its `page_id` and
# `topics_specific_unique` columns (file name refers to the enwiki 2023-03-20 dump)
df_topics_sp = spark.read.parquet('/scratch/descourt/metadata/topics/topic_en/topics-enwiki-20230320-parsed.parquet')

In [None]:
# NOTE(review): `dfs_change` is not referenced anywhere else in the visible part
# of this notebook — confirm it is still needed before keeping this read.
dfs_change = spark.read.parquet("/scratch/descourt/processed_data/en/pageviews_en_articles_ev_2023-03.parquet")

## Process data

In [None]:
# `extract_volume` is a project helper (presumably from one of the `src.*` star
# imports — confirm); with high=True it presumably keeps the high-volume rows.
# `.cache()` only marks the frame for caching: it is materialised by the first action.
df_high_volume = extract_volume(dfs, high=True).cache()

In [None]:
# Counterpart of `df_high_volume`, obtained from the same helper with high=False
# (presumably the low-volume rows — confirm against `extract_volume`).
df_low_volume = extract_volume(dfs, high=False).cache()

In [None]:
# Distinct (page_id, topic) pairs; `topics_specific_unique` is exposed as `topic`
page_topics = df_topics_sp.select('page_id', col('topics_specific_unique').alias('topic')).distinct()

# Attach the topic to both divergence tables. `join` on a column name defaults to
# an inner join, so pages without a topic entry are dropped.
df_divs_augm = df_divs.join(page_topics, 'page_id').cache()
df_divs_alphas_augm = df_divs_alpha.join(page_topics, 'page_id').cache()

## Estimate the loss when matching with topics

In [None]:
# Distinct (page_id, topic) pairs; `topics_specific_unique` is exposed as `topic`
page_topics = df_topics_sp.select('page_id', col('topics_specific_unique').alias('topic')).distinct()
# Inner join (the default): high-volume rows without a topic entry are dropped
df_high_volume_augm = df_high_volume.join(page_topics, 'page_id').cache()

In [None]:
# Distinct (page_id, topic) pairs; `topics_specific_unique` is exposed as `topic`
page_topics = df_topics_sp.select('page_id', col('topics_specific_unique').alias('topic')).distinct()
# Inner join (the default): low-volume rows without a topic entry are dropped
df_low_volume_augm = df_low_volume.join(page_topics, 'page_id').cache()

In [None]:
# Distinct (page_id, topic) pairs; `topics_specific_unique` is exposed as `topic`
page_topics = df_topics_sp.select('page_id', col('topics_specific_unique').alias('topic')).distinct()
# Inner join (the default): pageview rows without a topic entry are dropped
dfs_top = dfs.join(page_topics, 'page_id').cache()

In [None]:
# Number of low-volume rows before the topic join
df_low_volume.count()

In [None]:
# Number of low-volume rows left after the topic join
df_low_volume_augm.count()

In [None]:
# Percentage of low-volume rows kept by the topic join. Computed from the live
# counts rather than from numbers pasted out of earlier cell outputs, so the
# value cannot go stale when the input data changes.
df_low_volume_augm.count() / df_low_volume.count() * 100

In [None]:
# Number of high-volume rows before the topic join
df_high_volume.count()

In [None]:
# Number of high-volume rows left after the topic join
df_high_volume_augm.count()

In [None]:
# Percentage of high-volume rows kept by the topic join. Computed from the live
# counts rather than from numbers pasted out of earlier cell outputs, so the
# value cannot go stale when the input data changes.
df_high_volume_augm.count() / df_high_volume.count() * 100

# Stable and unstable pages

In [None]:
from pandas.tseries.offsets import DateOffset
def prepare_divs_plot(df_divs_augm, group='date', m=20, ungrouped=False):
    """Collect the largest divergence contributions per group into a pandas frame.

    Parameters
    ----------
    df_divs_augm : pyspark.sql.DataFrame
        Must provide the columns ``div``, ``rank_1``, ``rank_2``, ``page``,
        ``topic``, ``date`` (a 'YYYY-MM' string) and ``alpha``, plus ``group``.
    group : str, default 'date'
        Column defining the groups within which rows are ranked.
    m : int, default 20
        Number of rows kept per group. At most 99 candidates per group are
        collected from Spark, so larger values are capped by that sample.
    ungrouped : bool, default False
        If False, keep the ``m`` rows with the largest ``div`` per group.
        If True, keep the ``m // 2`` largest and the ``m // 2`` smallest rows
        by signed divergence per group.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (added by ``reset_index``), ``div``, ``div_sign``,
        ``page``, ``ranks``, ``topic``, ``date``, ``alpha`` and ``month``.
        ``div_sign`` equals ``div`` when ``rank_2 < rank_1`` and ``-div``
        otherwise. ``month`` is always a 'YYYY-MM-DD' string: the first day of
        ``date`` for non-negative ``div_sign``, of the previous month otherwise.
    """
    # Sample: keep the 99 largest divergences of each group, then go to pandas
    w = Window.partitionBy(group).orderBy(desc('div'))
    df_div_pd = (
        df_divs_augm
        .withColumn('divr', row_number().over(w))
        .where(col('divr') < 100)
        .drop('divr')
        .toPandas()
    )

    # Largest rank on each side, fetched with a single Spark job.
    # NOTE: `max` is pyspark.sql.functions.max here (star import), not the builtin.
    max_ranks = df_divs_augm.agg(max('rank_1').alias('m_1'), max('rank_2').alias('m_2')).collect()[0]
    max_rk_1, max_rk_2 = max_ranks['m_1'], max_ranks['m_2']

    rank_1 = df_div_pd['rank_1']
    rank_2 = df_div_pd['rank_2']

    # For labelling: "<rank_1> <> <rank_2>"
    df_div_pd['ranks'] = rank_1.astype(int).astype(str) + ' <> ' + rank_2.astype(int).astype(str)

    # Note the exclusive types (rank equal to the maximum on one side) with an asterisk
    exclusive = (rank_1 == max_rk_1) | (rank_2 == max_rk_2)
    df_div_pd['page'] = df_div_pd['page'].where(~exclusive, df_div_pd['page'] + '*')

    # Signed divergence: +div when rank_2 < rank_1, -div otherwise
    df_div_pd['div_sign'] = df_div_pd['div'].where(rank_2 < rank_1, -df_div_pd['div'])

    kept_columns = ['div', 'div_sign', 'page', 'ranks', 'topic', 'date', 'alpha']
    if ungrouped:
        # Both ends of the signed ranking, m // 2 rows each, per group
        by_sign = df_div_pd.sort_values(by='div_sign', ascending=False).groupby(group)[kept_columns]
        df_plot = pd.concat([
            by_sign.head(m // 2).reset_index(),
            by_sign.tail(m // 2).reset_index(),
        ])
    else:
        # The m largest divergences per group
        df_plot = (
            df_div_pd.sort_values(by='div', ascending=False)
            .groupby(group)[kept_columns]
            .head(m)
            .reset_index()
        )

    # Labels: every value is a 'YYYY-MM-DD' string so the column has one type
    # (a datetime.date mixed with strings is rejected by plotly trace names).
    current_month = df_plot['date'] + '-01'
    previous_month = (pd.to_datetime(current_month) - DateOffset(months=1)).dt.strftime('%Y-%m-%d')
    df_plot['month'] = np.where(df_plot['div_sign'] < 0, previous_month, current_month)
    return df_plot

## For Dec 2020 - Jan 2021

Explore the effect of the $\alpha$ parameter, in particular the two limits $\alpha \to 0$ and $\alpha \to \infty$.

In [None]:
# Colour mapping passed as `color_discrete_map` to the topic-coloured bar charts below.
# NOTE(review): presumably maps each topic to a fixed colour — confirm in `set_up_mapping`.
from src.make_and_plot import set_up_mapping
color_map = set_up_mapping(grouped=False)

In [None]:
# Precomputed plotting table, stored as a gzip-compressed CSV
df_plot_raw = pd.read_csv("/scratch/descourt/plots/thesis/divs_alphas.csv.gzip", compression='gzip')

In [None]:
# Interactive view of the precomputed table, grouped by `alpha`
plot_salient(df_plot_raw, group='alpha') # one slider step per alpha value

In [None]:
# 25 largest divergences for each alpha value, with topics attached
df_plot_alphas = prepare_divs_plot(df_divs_alphas_augm, group='alpha', m=25)

In [None]:
# Horizontal bar chart of the pages selected for alpha = inf, coloured by topic
# and exported as PDF. Two alternatives from the original cell are kept as
# comments; uncomment one to use it instead of the active line next to it.

# Alternative selection: alpha = 0.
# NOTE(review): `df_plot_alpha_raw` is not defined in the visible notebook — confirm the name.
#grp = df_plot_alpha_raw.loc[df_plot_alpha_raw['alpha'] == 0.0]
grp = df_plot_alphas.loc[df_plot_alphas['alpha'] == np.inf].sort_values('div', ascending=False)

fig = px.bar(
    grp.sort_values('topic', ascending=False),
    x='div',
    y='page',
    color='topic',
    pattern_shape='month',
    orientation='h',
    text='ranks',
    color_discrete_map=color_map,
)
# Alternative figure: colour by month, top 25 rows only.
#fig = px.bar(grp.sort_values('div', ascending=False).head(25), x='div', y='page', color='month', orientation='h', text='ranks')

# Large fonts: the figure is rendered at 2500 x 1000 px
fig.update_traces(textposition='outside', textfont={'size': 25})
fig.update_layout(
    height=1000,
    width=2500,
    xaxis_title={'text': 'Divergence', 'font': {'size': 35}},
    yaxis_title={'text': 'Pages', 'font': {'size': 35}},
    legend={'title': 'Topics', 'font': {'size': 33}, 'itemwidth': 30, 'itemsizing': 'constant'},
    yaxis={'tickfont': {'size': 25}},
    xaxis={'tickfont': {'size': 35}},
)
fig.update_annotations(font_size=15)
fig.update_xaxes(range=[0, 0.04])  # TODO to be tuned
fig.show()
fig.write_image("../figures_report/rank_stability/alpha_inf.pdf")

## Over time

$\alpha = 0.3$

In [None]:
# 30 largest divergences per date. A constant `alpha` column (0.3) is added first
# because `prepare_divs_plot` selects an `alpha` column from its input.
df_plot_date= prepare_divs_plot(df_divs_augm.withColumn('alpha', lit(0.3)), m=30)

In [None]:
# Horizontal bar chart of the pages selected for date '2022-02', coloured by
# topic and exported as PDF.
grp = df_plot_date.loc[df_plot_date['date'] == '2022-02'].sort_values('div', ascending=False)

fig = px.bar(
    grp.sort_values('topic', ascending=False),
    x='div',
    y='page',
    color='topic',
    pattern_shape='month',
    orientation='h',
    text='ranks',
    color_discrete_map=color_map,
)
# Alternative figure kept from the original cell: colour by month instead of topic.
#fig = px.bar(grp, x='div', y='page', color='month', orientation='h', text='ranks')

# Large fonts: the figure is rendered at 2500 x 1000 px
fig.update_traces(textposition='outside', textfont={'size': 25})
fig.update_layout(
    height=1000,
    width=2500,
    xaxis_title={'text': 'Divergence', 'font': {'size': 30}},
    yaxis_title={'text': 'Pages', 'font': {'size': 30}},
    legend={'title': 'Topics', 'font': {'size': 26}},
    yaxis={'tickfont': {'size': 22}},
    xaxis={'tickfont': {'size': 30}},
)
fig.update_annotations(font_size=19)
fig.update_xaxes(range=[0, 27*1e-6])
fig.show()
fig.write_image("../figures_report/rank_stability/date_Ukraine.pdf")

## Core stability across time

Plot core divergence score across time. 

In [None]:
# Total divergence per date, collected to pandas.
# NOTE: `sum` is pyspark.sql.functions.sum here (brought in by the star import), not the builtin.
df_core_stab = df_divs.groupBy('date').agg(sum('div').alias('div_sum')).toPandas()

In [None]:
# Line chart of the summed divergence per date, in chronological order
fig = px.line(df_core_stab.sort_values('date'), x='date', y='div_sum')
fig.update_layout(width = 1000,
    xaxis_title=dict(text='Date', font=dict(size=20)),
    yaxis_title=dict(text='Total divergence', font=dict(size=20)),
    yaxis = dict( tickfont = dict(size=20)),
    xaxis = dict( tickfont = dict(size=20)),
    height=600)
fig.show()
# Save into the folder created in the first cell (../figures_report/rank_stability),
# like the other figures of this notebook; the path previously lacked the '../' prefix
# and pointed to a directory this notebook never creates.
fig.write_image("../figures_report/rank_stability/core_div_ev.pdf")