In [1]:
%run header_data_treatment.ipynb

In [2]:
au_inst_period = spark.read.format('parquet').load('file:\\' + scanR_path.replace("scanR", "panel_fr_res") + 'panel_non_sphericised.parquet')

In [3]:
au_y = (au_inst_period
        .select('author_id', func.explode(func.sequence(func.col('entry_year'), func.lit(2020))).alias('year'))
        .join(au_inst_period
        .groupBy('author_id', 'year')
        .agg(*[func.concat_ws(",",func.array_sort(func.collect_set(col))).alias(col + '_set') for col in 
             ["country","inst_id", "inst_type","display_name", "parent_id","parent_type", "parent_name"]],
             *[func.first(col).alias(col) for col in 
               ['entry_year', 'last_year',"all_y_in_FR", "years_abroad",'n_inst_y','main_field']],
             *[func.sum(col).alias(col) for col in ['publications','citations',
                'nr_type_article','nr_type_book','nr_type_book-series',
                'nr_language_en','nr_language_fr',
                'citations_type_article','citations_type_book','citations_type_book-series',
                'citations_language_en','citations_language_fr']]
            ), on = ['author_id', 'year'], how = 'left')
        .select('author_id', 'year', 
                *[func.col(col + '_set') for col in ["country","inst_id", "inst_type","display_name", "parent_id","parent_type", "parent_name"]],
                *[func.first(col, ignorenulls=True).over(Window.partitionBy('author_id')).alias(col)
                  for col in ['entry_year', 'last_year',"all_y_in_FR", "years_abroad",'n_inst_y','fields']],
                *[func.when(func.col(col).isNotNull(), func.col(col)).otherwise(0).alias(col)
                  for col in ['publications','citations',
                'nr_type_article','nr_type_book','nr_type_book-series',
                'nr_language_en','nr_language_fr',
                'citations_type_article','citations_type_book','citations_type_book-series',
                'citations_language_en','citations_language_fr']])
       )
au_y.cache()

DataFrame[author_id: string, year: bigint, country_set: string, inst_id_set: string, inst_type_set: string, display_name_set: string, parent_id_set: string, parent_type_set: string, parent_name_set: string, entry_year: bigint, last_year: bigint, all_y_in_FR: bigint, years_abroad: bigint, n_inst_y: int, main_field: string, publications: double, citations: double, nr_type_article: double, nr_type_book: double, nr_type_book-series: double, nr_language_en: double, nr_language_fr: double, citations_type_article: double, citations_type_book: double, citations_type_book-series: double, citations_language_en: double, citations_language_fr: double]

In [4]:
au_y.show()

+-----------+----+-----------+-----------+-------------+--------------------+-------------+---------------+--------------------+----------+---------+-----------+------------+--------+--------------------+------------+---------+---------------+------------+-------------------+--------------+--------------+----------------------+-------------------+--------------------------+---------------------+---------------------+
|  author_id|year|country_set|inst_id_set|inst_type_set|    display_name_set|parent_id_set|parent_type_set|     parent_name_set|entry_year|last_year|all_y_in_FR|years_abroad|n_inst_y|          main_field|publications|citations|nr_type_article|nr_type_book|nr_type_book-series|nr_language_en|nr_language_fr|citations_type_article|citations_type_book|citations_type_book-series|citations_language_en|citations_language_fr|
+-----------+----+-----------+-----------+-------------+--------------------+-------------+---------------+--------------------+----------+---------+---------

In [5]:
(au_y
 .withColumn('lag_y', func.lag(func.col('year')).over(Window.partitionBy('author_id').orderBy('year')))
 .withColumn('lag_inst_set', func.lag(func.col('inst_id_set')).over(Window.partitionBy('author_id').orderBy('year')))
 .groupBy((func.col('year')-func.col('lag_y')).alias("y_diff"),
          (func.col('inst_id_set') != func.col('lag_inst_set')).cast('int').alias('change_inst'))
 .count()
 .withColumn('pct_y', func.round( func.col('count')/func.sum(func.col('count')).over(Window.partitionBy('y_diff')) *100, 2))
.sort('y_diff','change_inst')).show(100)

+------+-----------+---------+-----+
|y_diff|change_inst|    count|pct_y|
+------+-----------+---------+-----+
|  NULL|       NULL|   842221|100.0|
|     0|       NULL|304325374|55.64|
|     0|          0|242622312|44.36|
|     1|       NULL| 15372860|78.75|
|     1|          0|  1133624| 5.81|
|     1|          1|  3013575|15.44|
+------+-----------+---------+-----+



In [6]:
au_y.write.mode('overwrite').parquet('file:\\' + scanR_path.replace("scanR", "panel_fr_res") + 'au_y_sphericised.parquet')

In [19]:
(au_y
 .withColumn('lag_y', func.lag(func.col('year')).over(Window.partitionBy('author_id').orderBy('year')))
 .withColumn('lag_inst_set', func.lag(func.col('inst_id_set')).over(Window.partitionBy('author_id').orderBy('year')))
 .filter(func.col('year')-func.col('lag_y')==20)
).show()

+-----------+----+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---------+-----------+------------+--------+------------------+---------+------------------+------------+-------------------+------------------+--------------+----------------------+-------------------+--------------------------+---------------------+---------------------+-----+--------------------+
|  author_id|year|country_set|         inst_id_set|       inst_type_set|    display_name_set|       parent_id_set|     parent_type_set|     parent_name_set|entry_year|last_year|all_y_in_FR|years_abroad|n_inst_y|      publications|citations|   nr_type_article|nr_type_book|nr_type_book-series|    nr_language_en|nr_language_fr|citations_type_article|citations_type_book|citations_type_book-series|citations_language_en|citations_language_fr|lag_y|        lag_inst_set|
+-----------+----+-----------+--------------------+-------------