In [1]:
import re
import os
import gzip
import shutil
import pandas as pd
import time
from itertools import cycle, islice
from transformers import pipeline, AutoModel, AutoModelForSequenceClassification,AutoTokenizer
import findspark
findspark.init("C:\\Spark\\spark-3.5.0-bin-hadoop3")

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.window import Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,BooleanType
# Local Spark session (4 worker threads) used by every cell below.
# Hive support (.enableHiveSupport()) is left disabled.
spark = (
    SparkSession.builder
    .config("spark.sql.debug.maxToStringFields", 1000)
    # 160 MiB per input partition, expressed in bytes with a "b" suffix.
    .config("spark.sql.files.maxPartitionBytes", f"{160 * 1024 * 1024}b")
    .config("spark.executor.memory", "20g")
    .config("spark.driver.memory", "20g")
    .config('spark.executor.cores', 4)
    .master("local[4]")
    .appName('open_works')
    .getOrCreate()
)

# Base directories: project folder, scanR dump, extracted OpenAlex snapshot.
main_path = 'C:\\Users\\common\\projet_3_lru\\'
scanR_path = 'D:\\scanR\\'
openalex_path = 'D:\\openalex-snapshot\\data_extracted\\'


In [2]:
# Load the scanR publications table from the local parquet file under scanR_path.
df_scanR = spark.read.format('parquet').load('file:\\' + scanR_path + 'publications.parquet')

In [3]:
# Print the column schema of the scanR publications DataFrame.
df_scanR.printSchema()

root
 |-- affiliations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- affiliations: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- fullName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)
 |    |    |-- person: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- rolePatent: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- role: string (nullable = true)
 |    |    |-- typeParticipant: string (nullable = true)
 |-- authorsCount: long (nullable = true)
 |-- doiUrl: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- 

In [4]:
import unidecode
def unidecode_debug(x):
    """Transliterate ``x`` with ``unidecode.unidecode``, best effort.

    Parameters
    ----------
    x : str or None
        Text to transliterate.

    Returns
    -------
    str or None
        The transliterated text. ``None`` is passed through, and any value
        that cannot be transliterated is returned unchanged.
    """
    if x is None:
        # Null column values have nothing to transliterate.
        return None
    try:
        return unidecode.unidecode(x)
    except Exception:
        # Deliberate best effort: keep the original value rather than fail the
        # whole Spark job, but no longer swallow KeyboardInterrupt/SystemExit
        # as the previous bare `except:` did.
        return x

# Spark UDF wrapper around unidecode_debug, declared as returning a string column.
udf_unidecode = func.udf(unidecode_debug, StringType())

In [3]:
import nltk
from nltk.corpus import stopwords
# Show which languages NLTK provides stopwords for, then map each capitalised
# language name (e.g. 'French') to its list of stopwords.
print(stopwords.fileids())
stopwords_dict = {
    language_id.capitalize(): stopwords.words(language_id)
    for language_id in stopwords.fileids()
}

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [4]:
# One row per NLTK language: (capitalised language name, list of stopwords).
# These statements were pasted three times in the original cell; building the
# DataFrame once gives the same result.
values = list(stopwords_dict.items())
columns = ['language', 'stopwords']
stopwords_df = spark.createDataFrame(values, columns)
# Lookup table from two-letter language code to English language name.
# Note that both 'zh' and 'zh-cn' map to 'Chinese', and that several names carry
# qualifiers (e.g. 'Catalan; Valencian', 'Greek, Modern', 'Romanian, Moldavan').
iso_639_choices ={
'ab': 'Abkhaz', 'aa': 'Afar', 'af': 'Afrikaans', 'ak': 'Akan', 'sq': 'Albanian', 'am': 'Amharic', 'ar': 'Arabic',
'an': 'Aragonese', 'hy': 'Armenian', 'as': 'Assamese', 'av': 'Avaric', 'ae': 'Avestan', 'ay': 'Aymara', 'az': 'Azerbaijani',
'bm': 'Bambara', 'ba': 'Bashkir', 'eu': 'Basque', 'be': 'Belarusian', 'bn': 'Bengali', 'bh': 'Bihari', 'bi': 'Bislama',
'bs': 'Bosnian', 'br': 'Breton', 'bg': 'Bulgarian', 'my': 'Burmese', 'ca': 'Catalan; Valencian', 'ch': 'Chamorro', 'ce': 'Chechen',
'ny': 'Chichewa; Chewa; Nyanja', 'zh': 'Chinese', 'cv': 'Chuvash', 'kw': 'Cornish', 'co': 'Corsican', 'cr': 'Cree', 'hr': 'Croatian',
'cs': 'Czech', 'da': 'Danish', 'dv': 'Divehi; Maldivian;', 'nl': 'Dutch', 'dz': 'Dzongkha', 'en': 'English', 'eo': 'Esperanto',
'et': 'Estonian', 'ee': 'Ewe', 'fo': 'Faroese', 'fj': 'Fijian', 'fi': 'Finnish', 'fr': 'French', 'ff': 'Fula', 'gl': 'Galician',
'ka': 'Georgian', 'de': 'German', 'el': 'Greek, Modern', 'gn': 'Guaraní', 'gu': 'Gujarati', 'ht': 'Haitian','ha': 'Hausa',
'he': 'Hebrew','hz': 'Herero','hi': 'Hindi','ho': 'Hiri Motu','hu': 'Hungarian','ia': 'Interlingua','id': 'Indonesian',
'ie': 'Interlingue','ga': 'Irish','ig': 'Igbo','ik': 'Inupiaq','io': 'Ido','is': 'Icelandic','it': 'Italian','iu': 'Inuktitut',
'ja': 'Japanese','jv': 'Javanese', 'kl': 'Kalaallisut', 'kn': 'Kannada', 'kr': 'Kanuri', 'ks': 'Kashmiri', 
'kk': 'Kazakh', 'km': 'Khmer', 'ki': 'Kikuyu, Gikuyu', 'rw': 'Kinyarwanda', 'ky': 'Kirghiz, Kyrgyz', 'kv': 'Komi', 
'kg': 'Kongo', 'ko': 'Korean', 'ku': 'Kurdish', 'kj': 'Kwanyama, Kuanyama', 'la': 'Latin', 'lb': 'Luxembourgish', 
'lg': 'Luganda', 'li': 'Limburgish', 'ln': 'Lingala', 'lo': 'Lao', 'lt': 'Lithuanian', 'lu': 'Luba-Katanga', 'lv': 'Latvian',
'gv': 'Manx', 'mk': 'Macedonian', 'mg': 'Malagasy', 'ms': 'Malay', 'ml': 'Malayalam', 'mt': 'Maltese', 'mi': 'Māori', 
'mr': 'Marathi','mh': 'Marshallese','mn': 'Mongolian','na': 'Nauru','nv': 'Navajo','nb': 'Norwegian Bokmål',
'nd': 'North Ndebele','ne': 'Nepali','ng': 'Ndonga','nn': 'Norwegian Nynorsk','no': 'Norwegian','ii': 'Nuosu',
'nr': 'South Ndebele','oc': 'Occitan','oj': 'Ojibwe, Ojibwa','cu': 'Old Church Slavonic','om': 'Oromo','or': 'Oriya',
'os': 'Ossetian, Ossetic','pa': 'Panjabi, Punjabi','pi': 'Pāli','fa': 'Persian','pl': 'Polish','ps': 'Pashto, Pushto',
'pt': 'Portuguese','qu': 'Quechua','rm': 'Romansh','rn': 'Kirundi', 'ro': 'Romanian, Moldavan', 'ru': 'Russian', 
'sa': 'Sanskrit (Saṁskṛta)', 'sc': 'Sardinian', 'sd': 'Sindhi', 'se': 'Northern Sami', 'sm': 'Samoan', 'sg': 'Sango',
'sr': 'Serbian', 'gd': 'Scottish Gaelic', 'sn': 'Shona', 'si': 'Sinhala, Sinhalese', 'sk': 'Slovak', 'sl': 'Slovene',
'so': 'Somali', 'st': 'Southern Sotho', 'es': 'Spanish', 'su': 'Sundanese', 'sw': 'Swahili', 'ss': 'Swati', 'sv': 'Swedish', 
'ta': 'Tamil', 'te': 'Telugu', 'tg': 'Tajik', 'th': 'Thai', 'ti': 'Tigrinya','bo': 'Tibetan','tk': 'Turkmen','tl': 'Tagalog','tn': 'Tswana','to': 'Tonga','tr': 'Turkish','ts': 'Tsonga','tt': 'Tatar','tw': 'Twi','ty': 'Tahitian','ug': 'Uighur, Uyghur','uk': 'Ukrainian','ur': 'Urdu','uz': 'Uzbek','ve': 'Venda','vi': 'Vietnamese','vo': 'Volapük','wa': 'Walloon','cy': 'Welsh','wo': 'Wolof','fy': 'Western Frisian','xh': 'Xhosa','yi': 'Yiddish','yo': 'Yoruba','za': 'Zhuang, Chuang',
'zu': 'Zulu', "zh-cn" : "Chinese"
}
        

# (two-letter code, language name) pairs as a Spark DataFrame.
values = list(iso_639_choices.items())
columns = ["language_iso2", 'language']
iso_df = spark.createDataFrame(values, columns)
iso_df.show()

# Re-key the stopword lists by language code instead of language name.
# The original statement never closed its outer parenthesis, which made the
# whole cell a syntax error; the closing ")" is restored here.
# Caveats of the name-based left join:
#   - names that differ from NLTK's ('Catalan; Valencian', 'Greek, Modern',
#     'Romanian, Moldavan') or are absent ('Hinglish') end up with a null code;
#   - 'Chinese' matches both 'zh' and 'zh-cn', so it yields two rows.
stopwords_df = (stopwords_df.join(iso_df, on='language', how='left')
                .select('stopwords', func.col('language_iso2').alias('language')))
def rm_stopwords(x, y):
    """Remove stopwords from a space-separated string.

    Parameters
    ----------
    x : str or None
        Text to clean; it is split on single spaces.
    y : iterable of str or None
        Stopwords to drop from ``x``.

    Returns
    -------
    str or None
        ``x`` without the words found in ``y``, re-joined with single spaces.
        ``None`` when ``x`` is ``None``. When ``y`` is ``None`` (no stopword
        list available for the row) ``x`` is returned unchanged; the previous
        version returned ``None`` here, which silently discarded the text.
    """
    if x is None:
        return None
    if y is None:
        # Nothing to strip: keep the text instead of nulling it out.
        return x
    try:
        # Set membership avoids rescanning the stopword list for every word.
        stop_set = set(y)
        return ' '.join(word for word in x.split(' ') if word not in stop_set)
    except Exception:
        # Best effort: on unexpected input types, hand back the original text.
        return x
# Spark UDF wrapper around rm_stopwords (no explicit return type is passed to func.udf).
udf_rm_stopwords = func.udf(rm_stopwords)

NameError: name 'spark' is not defined

In [7]:
openalex_path = 'D:\\openalex-snapshot\\data_extracted\\'

# Distinct OpenAlex works whose 'country' column equals 'FR', reduced to the
# columns used for matching (publication_year is exposed as 'year').
works_au_af_subset = (
    spark.read.format('parquet')
    .load('file:\\' + openalex_path + 'works_au_af.parquet')
    .where(func.col('country') == 'FR')
    .select(
        'work_id',
        'title',
        'citations',
        func.col('publication_year').alias('year'),
        'language',
    )
    .distinct()
)

In [8]:
# Count scanR publications by which of title.default / title.fr / title.en are null.
df_scanR.groupBy(func.col('title.default').isNull(), func.col('title.fr').isNull(), func.col('title.en').isNull()).count().show()

+-----------------------+------------------+------------------+-------+
|(title.default IS NULL)|(title.fr IS NULL)|(title.en IS NULL)|  count|
+-----------------------+------------------+------------------+-------+
|                  false|              true|              true| 108322|
|                  false|              true|             false|2321193|
|                  false|             false|              true|1523743|
|                  false|             false|             false| 206728|
+-----------------------+------------------+------------------+-------+



In [9]:
# scanR side of the identifier match: for each 2019 publication, emit one row
# per identifier (external ids, the scanR id itself, and the cleaned title).
barebones_scanR = (
    df_scanR
    .filter(func.col('year') == 2019)
    .filter(func.col('productionType') == 'publication')
    .select(
        func.col('title.default').alias('title'), 'id', 'externalIds',
        # stopwords_df is keyed by two-letter language code once it has been
        # joined with iso_df, so titles are tagged 'en'/'fr'. The previous
        # 'English'/'French' labels could never match that key, leaving every
        # row without stopwords. A default title equal to the English title is
        # treated as English; everything else (including a null comparison)
        # falls through to French, so no separate null fallback is needed.
        func.when(func.col('title.default') == func.col('title.en'), 'en')
        .otherwise('fr').alias('language'))
    .join(stopwords_df, on=['language'], how='left')
    .withColumn('cleaned_title', udf_rm_stopwords(func.lower(func.col('title')), func.col('stopwords')))
    .withColumn('cleaned_title', udf_unidecode(func.col('cleaned_title')))
    # Strip special characters (intended: keep ASCII letters, digits, spaces, hyphens).
    .withColumn('cleaned_title', func.regexp_replace(func.col('cleaned_title'), r'[[^A-Za-z0-9 -]]+', ''))
    # Delete double spaces, applied twice. NOTE(review): this removes the pair
    # entirely rather than collapsing it to one space; left untouched because the
    # OpenAlex identifiers are presumably normalised the same way - confirm
    # before changing.
    .withColumn('cleaned_title', func.regexp_replace(func.col('cleaned_title'), '  ', ''))
    .withColumn('cleaned_title', func.regexp_replace(func.col('cleaned_title'), '  ', ''))
    # Helper columns are no longer needed (the original dropped a non-existent
    # 'languages' column, which was a no-op).
    .drop('language', 'stopwords')
    .withColumn(
        'identifiers',
        # flatten() over a null externalIds.id would yield null, hence the guard.
        func.when(
            func.col('externalIds.id').isNotNull(),
            func.array_compact(func.flatten(func.array(
                func.col('externalIds.id'),
                func.array([func.col('id'), func.col('cleaned_title')])))))
        .otherwise(func.array([func.col('id'), func.col('cleaned_title')])))
    .select('id', func.explode(func.col('identifiers')).alias('identifiers'), func.lit(1).alias('in_scanR'))
)
barebones_scanR.show()


+--------------------+--------------------+--------+
|                  id|         identifiers|in_scanR|
+--------------------+--------------------+--------+
|     halhal-04024361|        hal-04024361|       1|
|     halhal-04024361|     halhal-04024361|       1|
|     halhal-04024361|study authorial i...|       1|
|doi10.1093/oso/97...|10.1093/oso/97801...|       1|
|doi10.1093/oso/97...|10.1093/oso/97801...|       1|
|doi10.1093/oso/97...|doi10.1093/oso/97...|       1|
|doi10.1093/oso/97...|oxford studies ag...|       1|
|     halhal-04020278|        hal-04020278|       1|
|     halhal-04020278|     halhal-04020278|       1|
|     halhal-04020278|paris savant capi...|       1|
|doi10.1007/978-3-...|10.1007/978-3-030...|       1|
|doi10.1007/978-3-...|10.1007/978-3-030...|       1|
|doi10.1007/978-3-...|doi10.1007/978-3-...|       1|
|doi10.1007/978-3-...|textbook oncofert...|       1|
|doi10.1515/978303...|10.1515/978303561...|       1|
|doi10.1515/978303...|10.1515/978303561...|   

In [10]:
# Directory holding the merge inputs/outputs.
merge_path = 'D:\\openalex-snapshot\\merge\\'

# Pre-computed OpenAlex identifier table, loaded from oa_identifiers.parquet.
works_minimal_clean = spark.read.format('parquet').load('file:\\' + merge_path + 'oa_identifiers.parquet')


In [11]:
# Preview the first rows of the OpenAlex identifier table.
works_minimal_clean.show()

+-----------+--------------------+---------+
|    work_id|         identifiers|countries|
+-----------+--------------------+---------+
|W1000001460|http://www.cqvip....|         |
|W1000001460|Yan Zi Chun QiuZh...|         |
|W1000004386|http://www.cqvip....|         |
|W1000004386|Sha Jing Di Ji Ch...|         |
|W1000012492|https://www.cabdi...|         |
|W1000012492|leaf diseases hea...|         |
| W100001460|https://doi.org/1...|       US|
| W100001460|https://doi.org/1...|       US|
| W100001460|age-related micro...|       US|
|W1000015727|http://www.cqvip....|         |
|W1000015727|Min Guo Shi Qi Fe...|         |
|W1000017715|http://www.cqvip....|         |
|W1000017715|Yuan Qu Shu Fa Re...|         |
|W1000022074|http://www.cqvip....|         |
|W1000022074|San Tai He Yi Zhe...|         |
|W1000036519|http://www.cqvip....|         |
|W1000036519|Zhua Hao Fu Pin G...|         |
|W1000047560|https://libguides...|         |
|W1000047560|research guides e...|         |
|W10000507

In [12]:
# Full outer join of OpenAlex and scanR on the shared 'identifiers' value.
# in_oa / in_scanR are 1 on the side that contributed the row and null otherwise.
merge = (
    works_minimal_clean
    .withColumn('in_oa', func.lit(1))
    .join(barebones_scanR, on='identifiers', how='full')
    .select(['id', 'work_id', 'in_scanR', 'in_oa'])
    .distinct()
)

In [18]:
# Persist the merge result, overwriting any previous merge_oa_scanr.parquet.
# NOTE(review): `unambiguous_merge` is only defined in a later cell of this
# notebook, so on a fresh kernel this statement fails when run top to bottom.
unambiguous_merge.write.mode('overwrite').parquet('file:\\' + merge_path + 'merge_oa_scanr.parquet')

In [6]:
merge_path = 'D:\\openalex-snapshot\\merge\\'

# Reload the persisted merge from disk and preview it.
# NOTE(review): this rebinds `merge`, replacing the DataFrame built by the join cell.
merge = spark.read.format('parquet').load('file:\\' + merge_path + 'merge_oa_scanr.parquet')
merge.show()

+--------------------+-----------+--------+-----+
|                  id|    work_id|in_scanR|in_oa|
+--------------------+-----------+--------+-----+
|     halhal-03754658|W4292443170|       1|    1|
|doi10.3917/rfp.83...|W2969964743|       1|    1|
|doi10.1016/j.kine...|W4211255436|       1|    1|
|doi10.1016/j.kine...|W4232209428|       1|    1|
|doi10.1016/j.kine...|W4250988968|       1|    1|
|doi10.3389/fmicb....|W2952999755|       1|    1|
|doi10.1007/978-1-...|W4240127323|       1|    1|
|doi10.1007/978-1-...|W4245348838|       1|    1|
|doi10.1097/qai.00...|W2969281750|       1|    1|
|doi10.1177/245574...|W4211015669|       1|    1|
|doi10.1177/245574...|W4229582053|       1|    1|
|doi10.1177/245574...|W4243003195|       1|    1|
|doi10.1177/245574...|W4243283818|       1|    1|
|doi10.1093/bioinf...|W2898371836|       1|    1|
|doi10.1007/978-3-...|W2944054768|       1|    1|
|     halhal-03403170|W4296568381|       1|    1|
|doi10.5040/978147...|W4237660652|       1|    1|


In [13]:
# Keep only one-to-one matches: pairs present on both sides where the OpenAlex
# work maps to a single scanR id and the scanR id maps to a single work.
unambiguous_merge = (
    merge
    .where((func.col('in_scanR') + func.col('in_oa')) == 2)
    # Number of distinct scanR ids attached to each OpenAlex work.
    .withColumn(
        'n_id',
        func.size(func.collect_set('id').over(Window.partitionBy('work_id'))),
    )
    # Number of distinct OpenAlex works attached to each scanR id.
    .withColumn(
        'n_w_id',
        func.size(func.collect_set('work_id').over(Window.partitionBy('id'))),
    )
    .where((func.col('n_id') == 1) & (func.col('n_w_id') == 1))
)
unambiguous_merge.count()

201792

In [14]:
# Mark the one-to-one matches for caching, then preview them.
unambiguous_merge.cache()
unambiguous_merge.show()

+--------------------+-----------+--------+-----+----+------+
|                  id|    work_id|in_scanR|in_oa|n_id|n_w_id|
+--------------------+-----------+--------+-----+----+------+
|doi10.1001/jama.2...|W2912774204|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2967318538|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2916224422|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2977332845|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2982116083|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2944362313|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2978029701|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2945351692|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2945034488|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2922550132|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2948500349|       1|    1|   1|     1|
|doi10.1001/jama.2...|W2956119632|       1|    1|   1|     1|
|doi10.1001/jamaca...|W2904950765|       1|    1|   1|     1|
|doi10.1

In [16]:
# Pull 20 matched rows to the driver to read the untruncated ids.
unambiguous_merge.limit(20).collect()

[Row(id='doi10.1001/jama.2019.0071', work_id='W2912774204', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.10551', work_id='W2967318538', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.1113', work_id='W2916224422', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.14608', work_id='W2977332845', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.14969', work_id='W2982116083', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.1534', work_id='W2944362313', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.16181', work_id='W2978029701', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.2064', work_id='W2945351692', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.2072', work_id='W2945034488', in_scanR=1, in_oa=1, n_id=1, n_w_id=1),
 Row(id='doi10.1001/jama.2019.2942', work_id='W2922550132', in_scanR=1, in_oa=1, n_id=1, n_w_id=

In [20]:
# Spot check: fetch the default title of a single scanR publication by id.
df_scanR.filter(func.col('id').isin(['doi10.1001/jamacardio.2019.0014'])).select('title.default').collect()

[Row(default='Baseline Characteristics and Risk Profiles of Participants in the ISCHEMIA Randomized Clinical Trial')]