# Alternative method for merging OpenAlex and scanR author profiles

In this notebook, we attempt to link OpenAlex authors to their scanR profile using the fact that openAlex harvested theses.fr, the website hosting dissertations for French PhD students.
Therefore, we first link the theses that are in OpenAlex to their scanR entry, and simply use the fact that each dissertation is single-authored and entered on the theses.fr website manually in order to have a less ambiguous identifiers translation between the two databases. 

In [1]:
import os
base_path = os.path.dirname(os.path.abspath(""))
base_path = base_path + "/headers/"
print(base_path)
header_path = base_path + 'header_data_treatment.ipynb'
app_name = "merge_authors_scanr_2"
%run  $header_path $app_name=app_name

C:\Users\common\projet_3_lru\sorting_univ\script/headers/
merge_authors_scanr_2


# Load data

In [2]:
df_scanR = spark.read.format('parquet').load('file:\\' + scanR_path + '/publications.parquet')
merge_path = main_path_openalex.replace("data_extracted",'merge')
df_works_theses = (spark.read.format('parquet').load('file:\\' + main_path_openalex + 'works.parquet')
                    #.filter(func.col('type') == 'dissertation')
                    .select(func.col('primary_location.landing_page_url').alias('link'), 'id', 'authorships')
                    .filter(func.col('link').contains('theses.fr'))
           )

In [3]:
df_scanR.printSchema()

root
 |-- affiliations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- acronym: struct (nullable = true)
 |    |    |    |-- default: string (nullable = true)
 |    |    |    |-- en: string (nullable = true)
 |    |    |    |-- fr: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- id_name: string (nullable = true)
 |    |    |-- isFrench: boolean (nullable = true)
 |    |    |-- kind: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- label: struct (nullable = true)
 |    |    |    |-- default: string (nullable = true)
 |    |    |    |-- en: string (nullable = true)
 |    |    |    |-- fr: string (nullable = true)
 |    |    |-- level: string (nullable = true)
 |    |    |-- mainAddress: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |    |-- city: string (nullable = true)
 |    |    |    |-- country: string (nullable = true

In [4]:
theses_scanR = (df_scanR
                .filter(func.col('landingPage').contains("theses.fr"))
                .select('landingPage', 'authors', 'id')
               )
theses_scanR.count()

443738

In [6]:
df_works_theses.cache()
df_works_theses.show()

+--------------------+--------------------+--------------------+
|                link|                  id|         authorships|
+--------------------+--------------------+--------------------+
|http://theses.fr/...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|https://www.these...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|http://www.theses...|https://openalex....|[{first, {https:/...|
|https://www.these...|https://openalex....|[{first, {https:/...|
|http://www.theses...|htt

In [7]:
df_works_theses.count()

598665

In [8]:
matched_theses = (theses_scanR.toDF('link', 'authors_scanR', 'scanR_id')
                  .join(df_works_theses.toDF('link','work_id', 'authors_oa'), on = 'link', how= 'full'))
matched_theses.write.mode('overwrite').parquet('file:\\' + merge_path + 'matched_theses.parquet')

In [4]:
merge_path = main_path_openalex.replace("data_extracted",'merge')
matched_theses = spark.read.parquet('file:\\' + merge_path + 'matched_theses.parquet')

In [13]:
matched_theses.groupBy(func.col('scanr_id').isNotNull().alias('in_scanr'), func.col('work_id').isNotNull().alias('in_oa')).count().show()

+--------+-----+------+
|in_scanr|in_oa| count|
+--------+-----+------+
|    true|false|442320|
|    true| true|  2968|
|   false| true|595698|
+--------+-----+------+



In [5]:
already_matched = (matched_theses
                   .filter( (func.col('scanr_id').isNotNull())
                           & (func.col('work_id').isNotNull()))
                  )
unmatched_oa = (matched_theses
                .filter( (func.col('scanr_id').isNull())
                           & (func.col('work_id').isNotNull()))
                .select('link','authors_oa', 'work_id')
                .withColumn('theses_id', func.regexp_extract(func.col('link'), r'[A-z0-9]*$', 0))

               )
unmatched_scanR = (matched_theses
                   .filter( (func.col('scanr_id').isNotNull())
                           & (func.col('work_id').isNull()))
                   .select('link', 'authors_scanR', 'scanR_id')
                   .withColumn('theses_id',func.regexp_extract(func.col('link'), r'[A-z0-9]*$', 0))
                  )
unmatched_scanR.cache()
unmatched_oa.cache()
unmatched_scanR.show()


+--------------------+--------------------+---------------+------------+
|                link|       authors_scanR|       scanR_id|   theses_id|
+--------------------+--------------------+---------------+------------+
|https://theses.fr...|[{[{http://catalo...|nnt1985aix22001|1985AIX22001|
|https://theses.fr...|[{[{http://catalo...|nnt1985aix30022|1985AIX30022|
|https://theses.fr...|[{[{http://catalo...|nnt1985besa1020|1985BESA1020|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10502|1985BOR10502|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10520|1985BOR10520|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10531|1985BOR10531|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10546|1985BOR10546|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10554|1985BOR10554|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10556|1985BOR10556|
|https://theses.fr...|[{[{http://catalo...|nnt1985bor10579|1985BOR10579|
|https://theses.fr...|[{[{http://catalo...|nnt1985b

In [33]:
unmatched_oa.show(truncate = False)

+-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------+------------+
|link                         |authors_oa                                                                                                                                                          |work_id                         |theses_id   |
+-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------+------------+
|http://theses.fr/1985AIX1A011|[{first, {https://openalex.org/A5077420463, Chantal Charnet, NULL}, [], [], NULL, true, Chantal Charnet, [], NULL}]                                                 |https://openalex.org/W3015410635|1985AIX1A011|
|http://theses.fr/1985AIX1A0

In [6]:
rematch = (unmatched_oa
            .join(unmatched_scanR, on = ['theses_id'], how = 'full')
          )
rematch.cache()
rematch.groupBy(func.col('scanr_id').isNotNull().alias('in_scanr'), func.col('work_id').isNotNull().alias('in_oa')).count().show()


+--------+-----+------+
|in_scanr|in_oa| count|
+--------+-----+------+
|    true|false|187796|
|    true| true|527327|
|   false| true| 68373|
+--------+-----+------+



In [35]:
rematch.filter( (func.col('scanr_id').isNotNull()) & (func.col('work_id').isNull())).show()

+------------+----+----------+-------+--------------------+--------------------+---------------+
|   theses_id|link|authors_oa|work_id|                link|       authors_scanR|       scanR_id|
+------------+----+----------+-------+--------------------+--------------------+---------------+
|        0013|NULL|      NULL|   NULL|https://theses.fr...|[{[{NULL, NULL, N...|nnt2012enst0013|
|          07|NULL|      NULL|   NULL|https://theses.fr...|[{[{NULL, NULL, N...|nnt2012ecdl0007|
|1985AIX1A001|NULL|      NULL|   NULL|https://theses.fr...|[{[{http://catalo...|nnt1985aix1a001|
|1985AIX1A005|NULL|      NULL|   NULL|https://theses.fr...|[{[{http://catalo...|nnt1985aix1a005|
|1985AIX1A025|NULL|      NULL|   NULL|https://theses.fr...|[{[{http://catalo...|nnt1985aix1a025|
|1985BOR10505|NULL|      NULL|   NULL|https://theses.fr...|[{[{http://catalo...|nnt1985bor10505|
|1985BOR10555|NULL|      NULL|   NULL|https://theses.fr...|[{[{http://catalo...|nnt1985bor10555|
|1985BOR10560|NULL|      NULL|

In [8]:
rematched_theses = (already_matched.drop('link')
                    .unionAll(rematch.select(*[col for col in already_matched.columns if col != 'link']))
                   )
rematched_theses.printSchema()

root
 |-- authors_scanR: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- affiliations: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- BNF: string (nullable = true)
 |    |    |    |    |-- ISNI: string (nullable = true)
 |    |    |    |    |-- acronym: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- address: string (nullable = true)
 |    |    |    |    |-- address-line: string (nullable = true)
 |    |    |    |    |-- addresses: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- alias_idref: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- aliases: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- city: string (nullable 

In [9]:
rematched_theses.write.mode('overwrite').parquet('file:\\' + merge_path + 'matched_theses_step2.parquet')

In [11]:
spark.catalog.clearCache()

In [12]:
matched_theses = spark.read.parquet('file:\\' + merge_path + 'matched_theses_step2.parquet')

In [13]:
matched_theses.groupBy(func.size(func.col('authors_oa'))).count().show()

+----------------+------+
|size(authors_oa)| count|
+----------------+------+
|              -1|187796|
|               1|586984|
|               6|    37|
|               3|   269|
|               5|    60|
|               4|   118|
|               7|    30|
|               2| 11127|
|               0|    17|
|               8|    18|
|               9|     6|
|              15|     2|
+----------------+------+



In [14]:
matched_theses.filter(func.size(func.col('authors_oa'))==2).show(truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [40]:
matched_authors_single_matched = (matched_theses
                                  .filter( (func.col('scanR_id').isNotNull())
                                            & (func.col('work_id').isNotNull()))
                                   .withColumn('authors_oa', func.explode(func.col('authors_oa')))
                                  .withColumn('authors_scanR', func.explode(func.col('authors_scanR')))
                                  .filter(func.col('authors_scanR.role') == 'author')
                                  .select('scanR_id','work_id','authors_oa.author.display_name', 'authors_scanR.fullName',
                                         func.regexp_replace(func.col('authors_oa.author.id'),'https://openalex.org/','').alias('author_id'),
                                          func.col('authors_scanR.person').alias('idref') ,
                                          'authors_oa','authors_scanR')
                                 )
matched_authors_single_matched.cache()
matched_authors_single_matched.show()

+---------------+--------------------+--------------------+--------------------+-----------+--------------+--------------------+--------------------+
|       scanR_id|             work_id|        display_name|            fullName|  author_id|         idref|          authors_oa|       authors_scanR|
+---------------+--------------------+--------------------+--------------------+-----------+--------------+--------------------+--------------------+
|nnt1927pa081231|https://openalex....|Armand-Guy Nsatou...|Armand-Guy Nsatou...|A5064591993|          NULL|{first, {https://...|{[{http://catalog...|
|nnt1927pa081231|https://openalex....|Armand-Guy Nsatou...|Armand-Guy Nsatou...|A5064591993|          NULL|{first, {https://...|{[{http://catalog...|
|nnt1984pa112254|https://openalex....|      J.P. Wieleczko|Jean-Pierre Wiele...|A5016316519|idref069691207|{first, {https://...|{[{http://catalog...|
|nnt1984pa112254|https://openalex....|      J.P. Wieleczko|Jean-Pierre Wiele...|A5016316519|idref069

In [41]:
import unidecode
def unidecode_debug(x):
    try:
        y = unidecode.unidecode(x)
        return(y)
    except:
        return(x)

udf_unidecode = func.udf(unidecode_debug, StringType())

In [42]:
matched_authors_single_matched = (matched_authors_single_matched
                                  .withColumn('tokens_oa', func.split(udf_unidecode(func.lower(func.col('display_name'))), ' '))
                                  .withColumn('tokens_scanR', func.split(udf_unidecode(func.lower(func.col('fullName'))), ' '))
                                 )
matched_authors_single_matched.groupBy(func.arrays_overlap(func.col('tokens_oa'), func.col('tokens_scanR'))).count().show()

+---------------------------------------+------+
|arrays_overlap(tokens_oa, tokens_scanR)| count|
+---------------------------------------+------+
|                                   true|534932|
|                                  false|  7059|
+---------------------------------------+------+



In [43]:
matched_authors_single_matched = (matched_authors_single_matched
                                  .filter(func.arrays_overlap(func.col('tokens_oa'), func.col('tokens_scanR')))
                                 )

In [44]:
merged_authors_theses =(matched_authors_single_matched
                        .select('author_id', 'idref')
                        .distinct()
                       )
merged_authors_theses.cache()
merged_authors_theses.groupBy('author_id').count().groupBy(func.col('count').alias('n_idref')).count().show()
merged_authors_theses.groupBy('idref').count().groupBy(func.col('count').alias('n_author_id')).count().show()

+-------+------+
|n_idref| count|
+-------+------+
|      7|    18|
|      6|    23|
|      5|    53|
|      1|258325|
|      3|   608|
|      8|     9|
|      2|  4192|
|      4|   154|
|     10|     3|
|     12|     1|
|     14|     1|
+-------+------+

+-----------+------+
|n_author_id| count|
+-----------+------+
|          5|     2|
|          1|207339|
|      33486|     1|
|          3|   579|
|          2| 13555|
|          4|    31|
+-----------+------+



In [45]:
merged_authors_theses_1match = (merged_authors_theses
                         .withColumn('n_oa_id', func.size(func.collect_set(func.col('author_id')).over(Window.partitionBy('idref'))))
                         .withColumn('n_scanr_id', func.size(func.collect_set(func.col('idref')).over(Window.partitionBy('author_id'))))
                         .filter( (func.col('n_oa_id') ==1) & (func.col('n_scanr_id') == 1))
                          .drop('n_oa_id', 'n_scanr_id'))

In [46]:
merged_authors_theses_1match.write.mode('overwrite').parquet('file:\\' + merge_path + 'matched_authors_theses.parquet')

# Assessment for authors

In [47]:
merged_authors = spark.read.parquet('file:\\' + merge_path + 'merge_oa_scanr_authors.parquet')

In [48]:
merged_authors_theses_1match = spark.read.parquet('file:\\' + merge_path + 'matched_authors_theses.parquet')

In [50]:
merged_authors.withColumnRenamed('id', 'idref').join(merged_authors_theses_1match, on = ['idref','author_id'], how = 'inner').count()

8129

In [51]:
merged_authors.count()

53410

In [52]:
merged_authors_theses_1match.count()

201470

In [53]:
(merged_authors_theses_1match
 .join(merged_authors
                                  .select(func.col('id').alias('idref'),
                                          func.col('author_id').alias('first_match_oa_id')),
                                  on = ['idref'], how = 'inner')
 .groupBy(func.col('author_id') == func.col('first_match_oa_id')).count()).show()

+-------------------------------+-----+
|(author_id = first_match_oa_id)|count|
+-------------------------------+-----+
|                           true| 8129|
|                          false| 5459|
+-------------------------------+-----+



In [55]:
(merged_authors_theses_1match
 .join(merged_authors
                                  .select(func.col('id').alias('idref'),
                                          func.col('author_id').alias('first_match_oa_id')),
                                  on = ['idref'], how = 'inner')
 .filter(func.col('author_id') != func.col('first_match_oa_id'))).show()

+--------------+-----------+-----------------+
|         idref|  author_id|first_match_oa_id|
+--------------+-----------+-----------------+
|idref057152462|A5000178941|      A5041823308|
|idref177333952|A5000290173|      A5081221370|
|idref061688835|A5000398468|      A5007024543|
|idref067213642|A5000418022|      A5086115154|
|idref197901298|A5000560070|      A5080059539|
|idref149645538|A5000597116|      A5087852222|
|idref113593821|A5000599774|      A5001674341|
|idref077482727|A5000655960|      A5040333150|
|idref086072587|A5000725239|      A5089792876|
|idref220421161|A5000753222|      A5000652009|
|idref081955057|A5000832841|      A5079493518|
|idref059378883|A5000845033|      A5101402516|
|idref060567376|A5000853268|      A5089666505|
|idref23692009X|A5000886843|      A5053259416|
|idref154242136|A5000903082|      A5038501408|
|idref110781201|A5000918542|      A5074160592|
|idref235059021|A5000947728|      A5060634050|
|idref145096300|A5000970896|      A5013005996|
|idref1765999

In [56]:
(merged_authors_theses_1match
 .join(merged_authors
                                  .select(func.col('id').alias('first_match_idref'),
                                          func.col('author_id').alias('author_id')),
                                  on = ['author_id'], how = 'inner')
 .filter(func.col('idref') != func.col('first_match_idref'))).show()

+-----------+--------------+-----------------+
|  author_id|         idref|first_match_idref|
+-----------+--------------+-----------------+
|A5000601948|idref033096449|   idref110722973|
|A5000601948|idref033096449|   idref075736950|
|A5000646953|idref079029434|   idref265372852|
|A5001258652|idref031023118|   idref195810287|
|A5001495290|idref033076189|   idref171370899|
|A5004146540|idref033713189|   idref157731502|
|A5004148978|idref035829265|   idref101526679|
|A5005224396|idref026871408|   idref075544393|
|A5005294557|idref117479748|   idref060283246|
|A5005839602|idref078176859|   idref192591460|
|A5007428887|idref077616650|   idref273453696|
|A5008118303|idref112935605|   idref260223360|
|A5009701365|idref068981317|   idref244222517|
|A5011350481|idref115583629|   idref189494972|
|A5011830167|idref25584493X|   idref094609624|
|A5012601539|idref072325534|   idref275144208|
|A5013301555|idref121779823|   idref256463352|
|A5013460943|idref06155555X|   idref167005960|
|A5014054255|

# Final match

In [66]:
merged_authors_final = (merged_authors.withColumnRenamed('id', 'idref').withColumn('names_match', func.lit(1))
                        .join(merged_authors_theses_1match.withColumn('theses_match', func.lit(1)), on = ['idref','author_id'], how = 'full')
                        .withColumn('n_idref', func.size(func.collect_set(func.col('idref')).over(Window.partitionBy('author_id'))))
                        .fillna(0, subset = ['theses_match','names_match'])
                        .filter( (func.col('n_idref') == 1) | (func.col('theses_match')==1))
                        .select('idref', 'author_id')
                       )

In [67]:
merged_authors_final.select('idref').distinct().count()

239655

In [68]:
merged_authors_final.select('author_id').distinct().count()

244946

In [69]:
merged_authors_final.write.mode('overwrite').parquet('file:\\' + merge_path + 'matched_authors_2steps.parquet')