# Imports and data

In [None]:
import os
# Set JAVA_HOME for this process before the Spark session is created further down
# (presumably so that PySpark picks up this JVM - confirm on the target machine).
os.environ["JAVA_HOME"] = "/lib/jvm/java-11-openjdk-amd64"
import sys
from tqdm import tqdm
# Add the project folder to the module search path; otherwise importing the custom modules fails.
sys.path.append('wikipedia_core_events_semantic/')
# Make sure the output folders exist (no error if they are already there).
os.makedirs('interm_results/volume_dynamics', exist_ok=True)
os.makedirs('figures_report/volume_dynamics', exist_ok=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql.types import ArrayType, IntegerType

import plotly.graph_objs as go
import plotly.express as px

from scipy import stats
from scipy import spatial

In [None]:
from more_itertools import consecutive_groups
from itertools import chain, combinations

In [None]:
import pyspark

In [None]:
from wikipedia_core_events_semantic.ranking_helpers import *
from wikipedia_core_events_semantic.make_and_plot import*
from wikipedia_core_events_semantic.pages_groups_extraction import*
from wikipedia_core_events_semantic.data_aggregation import*

## Initialize context 

In [None]:
# Spark settings, passed to SparkConf.setAll as (key, value) pairs.
spark_settings = [
    ('spark.driver.memory', '70G'),
    ('spark.executor.memory', '70G'),
    ('spark.driver.maxResultSize', '0'),
    ('spark.executor.cores', '5'),
    ('spark.local.dir', '/scratch/descourt/spark'),
]
# Run against the local master "local[5]".
conf = pyspark.SparkConf().setMaster("local[5]").setAll(spark_settings)
# Build the session from the configuration above (or reuse an existing one)
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
# Keep a handle on the underlying context and only log errors
sc = spark.sparkContext
sc.setLogLevel('ERROR')

## Load data


In [None]:
# Read the 2015-2023 pageviews parquet and parse 'date' with the 'yyyy-MM' pattern.
dfs = (
    spark.read
    .parquet("/scratch/descourt/processed_data/en/pageviews_en_2015-2023.parquet")
    .withColumn('date', to_date(col('date'), 'yyyy-MM'))
    .cache()
)

In [None]:
# Page metadata snapshot (file name refers to 2022-11).
metadata_path = '/scratch/descourt/metadata/akhils_data/wiki_nodes_bsdk_phili_2022-11.parquet'
df_metadata = spark.read.parquet(metadata_path)

In [None]:
# Table joined to the pageviews below on page_ids == page_id.
changes_path = "/scratch/descourt/processed_data/en/pageviews_en_articles_ev_2022-11.parquet"
dfs_change = spark.read.parquet(changes_path)

In [None]:
# Keep only the rows dated up to and including 2022-11.
last_month_kept = to_date(lit('2022-11'), 'yyyy-MM')
dfs_uptonov = dfs.where(dfs.date <= last_month_kept).cache()

In [None]:
# Attach the dfs_change columns to every pageview row with a matching page id.
# NOTE: this cell overwrites its own input, so re-running it joins a second time.
same_page = dfs_change.page_ids == dfs_uptonov.page_id
dfs_uptonov = dfs_uptonov.join(dfs_change, same_page).cache()

# Find matching language editions in terms of number of articles

## Find examples where creation date > number of times in volume

In [None]:
# Number of rows recorded for each last_page_id.
df_occ = dfs_uptonov.groupBy('last_page_id').agg(count('*').alias('nb_occ'))

# Creation date from the metadata, with page_id renamed so it can be joined on last_page_id.
df_creation = df_metadata.select(
    col('page_id').alias('last_page_id'),
    to_date('creation_date', 'yyyy-MM').alias('creation_date'),
)

dfs_augm = (
    dfs_uptonov
    .join(df_occ, on='last_page_id')
    .join(df_creation, on='last_page_id')
    .cache()
)

In [None]:
# Rows whose pageview month is earlier than the page's creation date.
created_after_view = to_date(col('creation_date'), 'yyyy-MM') > col('date')
columns_to_show = ['date', 'page_id', 'last_name', 'creation_date', 'tot_count_views', 'page']
dfs_augm.where(created_after_view).select(*columns_to_show).distinct().show(60)

In [None]:
# Two earliest rows of one example page.
example_page_rows = dfs.where('page_id = "67394045"')
example_page_rows.sort(asc('date')).take(2)

# Extract pairs

The datasets below were computed with the following command: `python make_pairs.py --memory 120`. 

Fewer articles are found than reported because pages whose title contains ':' are filtered out in order to exclude non-main-namespace pages; this is not the best option, since it also drops main-namespace articles whose title contains ':'. 


## Retrieve data set, match on QID, filter out underestimated editions

I also want to exclude editions for which removing the articles containing ':' incurs too heavy a page loss. 

In [None]:
multieditions_dir = '/scratch/descourt/processed_data/multieditions'

# Aggregated pageviews of all editions, with 'date' parsed using the 'yyyy-MM' pattern.
dfs_alleds = (
    spark.read
    .parquet('/scratch/descourt/processed_data/multieditions/pageviews_agg_all.parquet')
    .withColumn('date', to_date(col('date'), 'yyyy-MM'))
)
dfs_pairs = spark.read.parquet('/scratch/descourt/processed_data/multieditions/pairs_0.95.parquet')

# Project code = the part of wiki_db before 'wiki'; keep the ids needed for the join.
project_code = split('wiki_db', 'wiki')[0].alias('project')
df_metafilt = df_metadata.select(project_code, 'page_id', 'item_id').cache()

# Inner join: only pages present in the metadata survive, and they gain item_id.
dfs_alleds = dfs_alleds.join(df_metafilt, on=['project', 'page_id']).cache()

In [None]:
# Row counts per (date, project) in the pageviews, and per project in the metadata.
views_by_month = dfs_alleds.groupBy('date', 'project')
df_nbarticles = views_by_month.agg(count('*').alias('nb_articles')).toPandas()

meta_by_project = df_metafilt.groupBy('project')
df_metaarticles = meta_by_project.agg(count('*').alias('nb_articles')).toPandas()

In [None]:
# 'nb_custom' = count from the pageviews; 'nb_articles' = count from the metadata.
df_nbcustom = df_nbarticles.rename(columns={'nb_articles': 'nb_custom'})
df_nbarticles = df_nbcustom.merge(df_metaarticles, on='project')

In [None]:
# Rows of the 2022-11 snapshot (the date is compared as text, so the column dtype does not matter).
is_nov22 = df_nbarticles.date.astype(str) == '2022-11-01'
# Take an explicit copy: adding a column to a .loc slice triggers SettingWithCopyWarning
# and may write to a temporary instead of df_filt.
df_filt = df_nbarticles.loc[is_nov22].copy()
# Despite its name, 'error' is the share of metadata articles found in the pageviews data.
df_filt['error'] = df_filt['nb_custom'] / df_filt['nb_articles']
# Keep the editions for which more than 89% of the articles are retained.
projects_to_keep = df_filt.loc[df_filt['error'] > 0.89, 'project'].values

## Gather all langs pairs info - Match on average views

In [None]:
w = Window.partitionBy('pairs', 'date')

# Summary statistics of tot_count_views for every (project, date).
views_col = 'tot_count_views'
views_summary = [
    count('*').alias('numberOfArticles'),
    avg(views_col).alias('averageViews'),
    stddev(views_col).alias('standardDevViews'),
    max(views_col).alias('maxView'),
    min(views_col).alias('minView'),
    percentile_approx(views_col, 0.5).alias('med'),
    percentile_approx(views_col, 0.25).alias('25%'),
    percentile_approx(views_col, 0.75).alias('75%'),
    percentile_approx(views_col, 0.9).alias('90%'),
    sum(views_col).alias('sumViews'),
]
df_projects_stats = dfs_alleds.groupBy('project', 'date').agg(*views_summary).cache()

In [None]:
matchlangs = spark.read.parquet('/scratch/descourt/processed_data/multieditions/pairs_0.95.parquet')

# One row per (pair, member project), enriched with that project's statistics.
pair_members = matchlangs.select('pairs', explode('pairs').alias('project'))
df_projects = pair_members.join(df_projects_stats, on='project').toPandas()

In [None]:
def compute_views_ratio(item):
    """Compare the values of two (language, value) entries.

    `item` is indexed as ``((lang1, value1), (lang2, value2))``. The result is a
    Row with both languages and `ratio`, the absolute value of
    ``log(value1 / value2)`` as a Python float (0.0 when the values are equal).
    """
    first, second = item[0], item[1]
    log_ratio = np.log(first[1] / second[1])
    return Row(lang1=first[0], lang2=second[0], ratio=float(np.abs(log_ratio)))

In [None]:
# Number of metadata rows per project, added as 'numberOfArticlesNov22'.
df_articles_nov22 = (
    df_metafilt
    .groupBy('project')
    .agg(count('*').alias('numberOfArticlesNov22'))
    .toPandas()
)
df_projects = df_projects.merge(df_articles_nov22, on='project')

In [None]:
# Save the per-edition statistics, ordered by date and then by number of articles.
df_projects_sorted = df_projects.sort_values(['date', 'numberOfArticlesNov22'])
df_projects_sorted.to_csv("/scratch/descourt/processed_data/multieditions/stats_langs.csv")

In [None]:
# df_projects is a pandas frame by now (it was built with .toPandas()), so it has no
# Spark .select / .rdd API. Take the per-project averages from the Spark frame
# df_projects_stats instead, which holds the same 'project', 'date' and 'averageViews' columns.
june22_avg_views = (
    df_projects_stats
    .where('date = "2022-06-01"')
    .select('project', 'averageViews')
    .rdd
    .map(tuple)
)
# All ordered combinations of two (project, averageViews) tuples, self-pairs included.
pairs = june22_avg_views.cartesian(june22_avg_views)

In [None]:
viewsRatio_df = pairs.map(compute_views_ratio).toDF()

# Keep pairs whose |log ratio| is non-zero (drops identical values, e.g. self-pairs)
# and at most -log(0.85).
max_log_ratio = -log(lit(0.85))
is_close_pair = (col('ratio') != 0) & (col('ratio') <= max_log_ratio)

# The sorted two-language array is the pair key; one row is kept per key.
pair_key = array_sort(array(col('lang1'), col('lang2'))).alias('pairs')
matching_lang = (
    viewsRatio_df
    .where(is_close_pair)
    .select(pair_key, 'lang1', 'lang2')
    .dropDuplicates(['pairs'])
    .cache()
)

In [None]:
# Restrict the view-matched pairs to those also present in dfs_pairs.
known_pairs = dfs_pairs.select('pairs')
matching_lang2 = matching_lang.join(known_pairs, on='pairs')

In [None]:
# Add every pair a second time with its two elements swapped.
swapped_key = array(col('pairs')[1], col('pairs')[0]).alias('pairs')
matching_lang2_swapped = matching_lang2.select(swapped_key, 'lang1', 'lang2')
dfs_pairs_filt = matching_lang2.union(matching_lang2_swapped)

In [None]:
# Collect the filtered pairs to the driver first, then report how many there are.
# (The two steps used to be separate cells in the opposite order, so a fresh
# Run All failed with a NameError on pairs_filt.)
pairs_filt = [p['pairs'] for p in dfs_pairs_filt.select('pairs').collect()]
len(pairs_filt)

# Linear regression

##  Gather all data for analysis

In [None]:
# Keep the rows flagged intail = "1" and attach their views for the same date/project/item.
# NOTE(review): df_high_finalpairs is not defined in the visible part of this notebook,
# so this cell fails on a fresh kernel unless it is created/loaded elsewhere - TODO confirm.
# NOTE(review): the next cell selects 'was_in_tail' and 'pairs'; presumably both columns
# come from df_high_finalpairs - verify against its schema.
matched_pairs = df_high_finalpairs.where('intail = "1"').join(dfs_alleds.select('date', 'project', 'item_id', 'tot_count_views'), on=['date', 'project', 'item_id']).cache()

In [None]:
# Treatment month, the month before it, and the views observed in the treatment month.
matched_pairs = matched_pairs.select(
    col('date').alias('treatment_date'),
    add_months('date', -1).alias('previous_date'),
    'project',
    'item_id',
    col('tot_count_views').alias('views_post_treat'),
    'was_in_tail',
    'pairs',
).cache()

# Views of the same item in the previous month; items without a row that month get 0.
views_previous_month = dfs_alleds.select(
    col('date').alias('previous_date'),
    'project',
    'item_id',
    col('tot_count_views').alias('views_pre_treat'),
)
matched_pairs = (
    matched_pairs
    .join(views_previous_month, on=['previous_date', 'project', 'item_id'], how='left')
    .fillna(0, subset=['views_pre_treat'])
    .toPandas()
)

In [None]:
# Save the matched pairs, ordered by previous month and item.
matched_pairs_sorted = matched_pairs.sort_values(by=['previous_date', 'item_id'])
matched_pairs_sorted.to_csv("/scratch/descourt/processed_data/multieditions/pairs_0.95.csv")

In [None]:
def _parse_pairs_cell(value):
    """Return one 'pairs' entry as a list of project codes.

    A string such as "['en', 'fr']" (the form the column takes after a round trip
    through CSV) is split on commas, and each item loses its surrounding whitespace
    and quote characters. Any other value - e.g. an array/list that was never
    serialised, or the list left by a previous run of this cell - is converted
    with list(), so the cell no longer crashes on non-string values and can be re-run.
    """
    if isinstance(value, str):
        return [token.strip()[1:-1] for token in value[1:-1].split(',')]
    return list(value)


matched_pairs['pairs'] = matched_pairs['pairs'].apply(_parse_pairs_cell)

In [None]:
# Display matched_pairs as the cell output.
matched_pairs

In [None]:
# Check for the number of pages that we have: rows per treatment date, divided by two
# (presumably because each pair contributes two rows - confirm).
# groupby().size() counts rows; DataFrame.size, used previously, is rows x columns
# and therefore inflated the result by the number of columns.
nb_pairs_per_date = matched_pairs.groupby('treatment_date').size() / 2
nb_pairs_per_date

## Analyse

In [None]:
# Rows whose treatment month is February 2022.
# The column is coerced with pd.to_datetime first: after toPandas() or a CSV reload it
# may hold datetime.date objects or strings, and comparing those directly with a
# Timestamp matches nothing.
is_feb22 = pd.to_datetime(matched_pairs['treatment_date']) == pd.to_datetime('2022-02-01')
# Optional extra filter, currently disabled: & matched_pairs.pairs.isin(pairs_filt)
matched_pairs_feb = matched_pairs.loc[is_feb22]

In [None]:
# Display the February 2022 subset as the cell output.
matched_pairs_feb

In [None]:
# Long format for the regression: one row per (item, period), with
#   views      - views in that period
#   treated    - the was_in_tail flag
#   post_treat - 0 for the month before treatment, 1 for the treatment month
df_pre = (
    matched_pairs_feb[['views_pre_treat', 'was_in_tail']]
    .rename(columns={'views_pre_treat': 'views', 'was_in_tail': 'treated'})
    .assign(post_treat=0)
)
df_post = (
    matched_pairs_feb[['views_post_treat', 'was_in_tail']]
    .rename(columns={'views_post_treat': 'views', 'was_in_tail': 'treated'})
    .assign(post_treat=1)
)

df_reg = pd.concat([df_pre, df_post])

In [None]:
from statsmodels.formula.api import ols

# 'post_treat * treated' expands to both main effects plus their interaction.
# NOTE: the fitted result is bound to the name `ols`, which hides the imported function
# until this cell (and its import) is run again.
regression_formula = 'views ~ post_treat * treated'
ols = ols(regression_formula, data=df_reg).fit()
print(ols.summary())