In [10]:
# Run the shared header notebook in this kernel. Later cells use `spark`, `func`, `openalex_path` and `scanR_path`, none of which are defined in this notebook — presumably the header provides them (TODO confirm).
%run header_data_treatment.ipynb

In [11]:
# Read the two publication-level inputs from parquet.
# The 'file:\' prefix is prepended to the base paths, which are defined outside this notebook.
works_au_af = spark.read.parquet('file:\\' + openalex_path + 'works_au_af.parquet')
df_scanR = spark.read.parquet('file:\\' + scanR_path + 'publications.parquet')

In [24]:
# Read the OpenAlex authors table; a later cell uses its `id` and `display_name` columns.
df_authors = spark.read.parquet('file:\\' + openalex_path + 'authors.parquet')


In [13]:
# Ids of authors that have at least one row whose `country` is 'FR'.
fr_author_ids = (
    works_au_af
    .filter(func.col('country') == 'FR')
    .select('author_id')
    .distinct()
)

# Keep every row (whatever its country) of those authors, then collapse to one
# record per author holding the distinct titles, institution ids and fields.
works_au_af_subset = (
    works_au_af
    .join(fr_author_ids, on=['author_id'], how='inner')
    .groupBy('author_id')
    .agg(
        func.collect_set('title').alias('titles'),
        func.collect_set('inst_id').alias('institutions'),
        # `fields` is collected as a set of arrays, flattened, then de-duplicated.
        func.array_distinct(func.flatten(func.collect_set('fields'))).alias('fields'),
    )
)

# cache() is lazy; show() is the action that starts populating it.
works_au_af_subset.cache()
works_au_af_subset.show()

+-----------+--------------------+--------------------+--------------------+
|  author_id|              titles|        institutions|              fields|
+-----------+--------------------+--------------------+--------------------+
|A5000016899|[MEASUREMENT OF T...|[I4210152651, I30...|[Engineering, Phy...|
|A5000023309|[«Host-Guests Rel...|       [I4210128200]|[Biology, Physics...|
|A5000083574|[Computational de...|[I1294671590, I37...|[Computer science...|
|A5000087584|[Optimization of ...|[I48430043, I1294...|[Computer science...|
|A5000093350|[Dentists’ perspe...|         [I92834545]|[Medicine, Psycho...|
|A5000139201|[Rapport de missi...|[I251321805, I131...|[Chemistry, Compu...|
|A5000156980|[Clinafloxacin mo...|[I2799288026, I42...|[Mathematics, Med...|
|A5000157973|[Religious and no...|[I90669466, I2746...|[Art, History, Ph...|
|A5000204781|[Effect of strong...|[I66946132, I4210...|[Engineering, Env...|
|A5000209351|[Determination of...|[I4210134555, I30...|[Chemistry, Geolo...|

In [None]:
# Build one row per scanR author from the publication-level table:
#   1. explode to one row per (publication, author, affiliation);
#   2. derive the grouping key `idref` from `authors.person`, falling back to
#      the author's full name when no person identifier is present;
#   3. aggregate names, titles, affiliations and domains per key.
# NOTE(review): `explode` emits no row for a null or empty array, so authors
# without any affiliation are dropped before the aggregation. `explode_outer`
# would keep them (with an empty `institutions` set) — confirm which is intended.
authors_scanR = (
    df_scanR
    .withColumn('authors', func.explode(func.col('authors')))
    .withColumn('af', func.explode(func.col('authors.affiliations')))
    # Lower-case the person identifier and strip the literal 'idref' from it.
    .withColumn('idref', func.regexp_replace(func.lower(func.col("authors.person")), 'idref', ''))
    # No person identifier: use the full name as the key instead.
    .withColumn('idref', func.coalesce(func.col('idref'), func.col('authors.fullName')))
    .groupBy('idref')
    .agg(
        # ignorenulls=True: plain first() returns whichever value it meets first,
        # which can be NULL even though another row of the same author has the name.
        *[func.first(name_col, ignorenulls=True).alias(name_col.replace('authors.', ''))
          for name_col in ['authors.firstName', 'authors.fullName', 'authors.lastName']],
        func.collect_set('title.default').alias('titles'),
        func.collect_set('af').alias('institutions'),
        # `domains` is collected as a set of arrays, flattened, then de-duplicated.
        func.array_distinct(func.flatten(func.collect_set(func.col('domains')))).alias('fields'),
    )
)
# cache() is lazy; show() is the action that starts populating it.
authors_scanR.cache()
authors_scanR.show()

In [22]:
# Positional rename of all seven columns: the three aggregated list columns get a
# `_scanR` suffix so they do not collide with the OpenAlex `titles`, `institutions`
# and `fields` columns when both tables are joined later.
authors_scanR = authors_scanR.toDF(*[
    'idref', 'firstName', 'fullName', 'lastName',
    'titles_scanR', 'institutions_scanR', 'fields_scanR',
])

In [23]:
# Print the first rows of the scanR author table to check the renamed columns.
authors_scanR.show()

+-------------+--------------+--------------------+---------------+--------------------+--------------------+--------------------+
|        idref|     firstName|            fullName|       lastName|        titles_scanR|  institutions_scanR|        fields_scanR|
+-------------+--------------+--------------------+---------------+--------------------+--------------------+--------------------+
|- Auclair, D.|             -|       - Auclair, D.|    Auclair, D.|[Gestion des pays...|             [inrae]|[{NULL, {FORESTER...|
|   - Roman D.|             -|          - Roman D.|       Roman D.|[Génie végétal en...|     [grid.464018.f]|[{Q1369325, {Géni...|
|    026733412|François-Marie|François-Marie Bl...|        Blondel|[Diagnostic et ai...|[grid.29172.3f, t...|[{Q17006654, {ano...|
|    026760312|          Yves|       Yves Bussiere|       Bussiere|[Vulnérabilité au...|[grid.418084.1, h...|[{Q142, {français...|
|    026814986|     Jean-Marc|Jean-Marc de Leer...|  de Leersnyder|[Les câbles sous

In [26]:
# Exploratory match between OpenAlex and scanR authors on the author's name.
# NOTE(review): a name is not a unique key. If several authors share the same
# name on either side, the join multiplies rows, so the counts printed below are
# numbers of joined rows, not numbers of distinct authors.

# OpenAlex author id (URL prefix removed) -> display name.
# The pattern is a regular expression: the dot is escaped and the prefix is
# anchored so that only the literal leading 'https://openalex.org/' is removed.
openalex_names = df_authors.select(
    func.regexp_replace(func.col('id'), r'^https://openalex\.org/', '').alias('author_id'),
    func.col('display_name').alias('fullName'),
)

test_merge = (
    works_au_af_subset
    .join(openalex_names, on='author_id', how='inner')
    .withColumn('in_oa', func.lit(1))
    # Full outer join: rows found on one side only keep NULL in the other side's flag.
    .join(authors_scanR.withColumn('in_scanR', func.lit(1)), on='fullName', how='full')
)
# Row counts per (in_oa, in_scanR) combination.
test_merge.groupBy('in_oa', 'in_scanR').count().show()

+-----+--------+-------+
|in_oa|in_scanR|  count|
+-----+--------+-------+
| NULL|       1|1448801|
|    1|    NULL| 771446|
|    1|       1| 799712|
+-----+--------+-------+



In [27]:
# Rows that come from scanR only: the OpenAlex flag was never set by the full join.
missing_in_oa = func.col('in_oa').isNull()
test_merge.where(missing_in_oa).show()

+------------------+---------+------+------------+------+-----+------------------+---------+----------------+--------------------+--------------------+--------------------+--------+
|          fullName|author_id|titles|institutions|fields|in_oa|             idref|firstName|        lastName|        titles_scanR|  institutions_scanR|        fields_scanR|in_scanR|
+------------------+---------+------+------------+------+-----+------------------+---------+----------------+--------------------+--------------------+--------------------+--------+
|     - Auclair, D.|     NULL|  NULL|        NULL|  NULL| NULL|     - Auclair, D.|        -|     Auclair, D.|[Gestion des pays...|             [inrae]|[{NULL, {FORESTER...|       1|
|        - Roman D.|     NULL|  NULL|        NULL|  NULL| NULL|        - Roman D.|        -|        Roman D.|[Génie végétal en...|     [grid.464018.f]|[{Q1369325, {Géni...|       1|
|        42 Factory|     NULL|  NULL|        NULL|  NULL| NULL|        42 Factory|     NUL

In [28]:
# Spot-check one name to inspect the joined row(s) side by side.
is_target_author = func.col('fullName') == 'Mathias Wargon'
test_merge.where(is_target_author).show()

+--------------+-----------+--------------------+--------------------+--------------------+-----+---------+---------+--------+--------------------+--------------------+--------------------+--------+
|      fullName|  author_id|              titles|        institutions|              fields|in_oa|    idref|firstName|lastName|        titles_scanR|  institutions_scanR|        fields_scanR|in_scanR|
+--------------+-----------+--------------------+--------------------+--------------------+-----+---------+---------+--------+--------------------+--------------------+--------------------+--------+
|Mathias Wargon|A5054102803|[Caractéristiques...|[I4210129146, I42...|[Art, Political s...|    1|109258312|  Mathias|  Wargon|[The analysis of ...|[180036048, 19751...|[{Q220570, {Pulmo...|       1|
+--------------+-----------+--------------------+--------------------+--------------------+-----+---------+---------+--------+--------------------+--------------------+--------------------+--------+



In [None]:
# Mark the merged frame for caching (lazy: it is populated by the actions that follow),
# then spot-check another name.
test_merge.cache()
test_merge.where(func.col('fullName') == 'Odile Moreau').show()