In [1]:
import re
import os
import gzip
import shutil
import pandas as pd
import time
from itertools import cycle, islice
from transformers import pipeline, AutoModel, AutoModelForSequenceClassification,AutoTokenizer
import findspark
findspark.init("C:\\Spark\\spark-3.5.0-bin-hadoop3")

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.window import Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,BooleanType,NoneType
# Build (or reuse) the Spark session for this notebook: master "local[4]",
# application name 'open_works'.
spark = (
    SparkSession.builder
    .config("spark.sql.debug.maxToStringFields", 1000)
    # 160 MiB expressed in bytes, with the "b" unit suffix.
    .config("spark.sql.files.maxPartitionBytes", f"{160 * 1024 * 1024}b")
    .config("spark.executor.memory", "20g")
    .config("spark.driver.memory", "20g")
    .config('spark.executor.cores', 4)
    .master("local[4]")
    .appName('open_works')
    .getOrCreate()
)

# Base directories; each ends with a path separator so file names can be appended directly.
main_path = 'C:\\Users\\common\\projet_3_lru\\'
scanR_path = 'D:\\scanR\\'
openalex_path = 'D:\\openalex-snapshot\\data_extracted\\'


In [2]:
# Read the scanR publications table from its Parquet file.
scanR_publications_uri = 'file:\\' + scanR_path + 'publications.parquet'
df_scanR = spark.read.parquet(scanR_publications_uri)

In [32]:
# Display the column names of the scanR publications DataFrame.
df_scanR.columns

['affiliations',
 'authors',
 'authorsCount',
 'doiUrl',
 'domains',
 'externalIds',
 'grantedDate',
 'id',
 'inpadocFamily',
 'inventionKind',
 'isInternational',
 'isOa',
 'isOeb',
 'keywords',
 'oaEvidence',
 'patents',
 'productionType',
 'projects',
 'publicationDate',
 'source',
 'submissionDate',
 'summary',
 'title',
 'type',
 'year']

In [22]:
import unidecode
def unidecode_debug(x):
    """Transliterate ``x`` with ``unidecode``, falling back to ``x`` itself.

    Best-effort helper meant to be wrapped as a Spark UDF: any value that
    cannot be transliterated is returned unchanged instead of failing.

    Args:
        x: The value to transliterate (normally a string, possibly None).

    Returns:
        The transliterated string, or ``x`` unchanged when it is None or
        when ``unidecode`` raises an error for it.
    """
    if x is None:
        # Nothing to transliterate; skip the call rather than rely on an exception.
        return x
    try:
        return unidecode.unidecode(x)
    except Exception:
        # Deliberately lenient, but unlike a bare `except:` this does not
        # swallow KeyboardInterrupt / SystemExit.
        return x

# Spark UDF wrapper around unidecode_debug, declared as returning a string column.
udf_unidecode = func.udf(unidecode_debug, returnType=StringType())

In [45]:
openalex_path = 'D:\\openalex-snapshot\\data_extracted\\'

# OpenAlex rows whose country is 'FR', reduced to the distinct combinations of
# work id, title, citations, publication year (renamed to 'year') and language.
works_au_af_subset = (
    spark.read.parquet('file:\\' + openalex_path + 'works_au_af.parquet')
    .where(func.col('country') == 'FR')
    .select(
        'work_id',
        'title',
        'citations',
        func.col('publication_year').alias('year'),
        'language',
    )
    .distinct()
)

In [52]:
# scanR side: lower-cased default titles of the 2019 publications, flagged with in_scanr = 1.
# (Accent stripping through udf_unidecode stays disabled.)
scanr_2019_titles = (
    df_scanR
    .where((func.col('year') == 2019) & (func.col('productionType') == 'publication'))
    .select(func.col('title.default').alias('title'), 'id', func.lit(1).alias('in_scanr'))
    .withColumn('title', func.lower(func.col('title')))
)

# OpenAlex side: lower-cased titles of the 2019 works, flagged with in_oa = 1.
openalex_2019_titles = (
    works_au_af_subset
    .where(func.col('year') == 2019)
    .drop('year')
    .withColumn('title', func.lower(func.col('title')))
    .withColumn('in_oa', func.lit(1))
)

# Full outer join on the lower-cased title: a NULL flag marks a title missing from that source.
test_merge = scanr_2019_titles.join(openalex_2019_titles, on=['title'], how='full')
test_merge.cache()

# Number of rows per presence pattern (OpenAlex only / scanR only / both).
test_merge.groupBy('in_oa', 'in_scanr').count().show()


+-----+--------+------+
|in_oa|in_scanr| count|
+-----+--------+------+
|    1|    NULL| 41924|
| NULL|       1|128721|
|    1|       1|241960|
+-----+--------+------+



In [63]:
# Collapse the language column into three buckets: 'en', 'fr' and 'other'.
language_bucket = (
    func.when(func.col('language') == 'en', 'en')
    .when(func.col('language') == 'fr', 'fr')
    .otherwise('other')
    .alias('language')
)

# Citation statistics computed for every (in_oa, in_scanr, language bucket) group.
citation_stats = [
    func.count('work_id').alias('count'),
    func.sum('citations').alias('citations'),
    func.mean('citations').alias('avg_cit'),
    func.percentile_approx('citations', 0.5).alias('med_cit'),
    func.percentile_approx('citations', 0.75).alias('3_quart'),
    func.max('citations').alias('max'),
]

test_merge.groupBy('in_oa', 'in_scanr', language_bucket).agg(*citation_stats).show()


+-----+--------+--------+------+---------+-------------------+-------+-------+----+
|in_oa|in_scanr|language| count|citations|            avg_cit|med_cit|3_quart| max|
+-----+--------+--------+------+---------+-------------------+-------+-------+----+
|    1|    NULL|   other|  2195|      516|0.23507972665148064|      0|      0|  24|
| NULL|       1|   other|     0|     NULL|               NULL|   NULL|   NULL|NULL|
|    1|       1|      en|149478|  1913256|  12.79958254726448|      2|     30|9119|
|    1|    NULL|      en| 25873|    71554|  2.765585745758126|      0|      5|1846|
|    1|    NULL|      fr| 13856|      792|0.05715935334872979|      0|      0|  63|
|    1|       1|      fr| 55472|    22914| 0.4130732621863282|      0|      0| 230|
|    1|       1|   other| 37010|    11951|0.32291272629019185|      0|      1| 269|
+-----+--------+--------+------+---------+-------------------+-------+-------+----+



In [56]:
# Rows whose language is 'fr' and that have no scanR match.
is_french = func.col('language') == 'fr'
missing_in_scanr = func.col('in_scanr').isNull()
test_merge.where(is_french & missing_in_scanr).show()

+--------------------+----+--------+-----------+---------+--------+-----+
|               title|  id|in_scanr|    work_id|citations|language|in_oa|
+--------------------+----+--------+-----------+---------+--------+-----+
|                NULL|NULL|    NULL|W3153289204|        0|      fr|    1|
|                NULL|NULL|    NULL|W2945098689|        0|      fr|    1|
|                NULL|NULL|    NULL|W3046631608|        0|      fr|    1|
|                NULL|NULL|    NULL|W3213035235|        0|      fr|    1|
|                NULL|NULL|    NULL|W3045959133|        1|      fr|    1|
|                NULL|NULL|    NULL|W4251115215|        0|      fr|    1|
|                NULL|NULL|    NULL|W3106607111|        0|      fr|    1|
|" erreurs et inve...|NULL|    NULL|W4313730129|        0|      fr|    1|
|" on a ri comme a...|NULL|    NULL|W4288573239|        0|      fr|    1|
|" on a ri comme a...|NULL|    NULL|W4288573310|        0|      fr|    1|
|" un retour aux s...|NULL|    NULL|W4

In [36]:
# scanR side: lower-cased default titles of the 2019 publications, flagged with in_scanr = 1.
# (Accent stripping through udf_unidecode stays disabled.)
scanr_2019_titles = (
    df_scanR
    .where((func.col('year') == 2019) & (func.col('productionType') == 'publication'))
    .select(func.col('title.default').alias('title'), 'id', func.lit(1).alias('in_scanr'))
    .withColumn('title', func.lower(func.col('title')))
)

# OpenAlex side: lower-cased titles of the 2019 works, flagged with in_oa = 1.
openalex_2019_titles = (
    works_au_af_subset
    .where(func.col('year') == 2019)
    .drop('year')
    .withColumn('title', func.lower(func.col('title')))
    .withColumn('in_oa', func.lit(1))
)

# Full outer join on the lower-cased title: a NULL flag marks a title missing from that source.
test_merge = scanr_2019_titles.join(openalex_2019_titles, on=['title'], how='full')

# Number of rows per presence pattern (OpenAlex only / scanR only / both).
test_merge.groupBy('in_oa', 'in_scanr').count().show()


+-----+--------+------+
|in_oa|in_scanr| count|
+-----+--------+------+
|    1|    NULL| 41924|
| NULL|       1|128721|
|    1|       1|241960|
+-----+--------+------+



In [38]:
# Preview the merged rows that have no scanR match.
missing_in_scanr = func.col('in_scanr').isNull()
test_merge.where(missing_in_scanr).show()

+-----+----+--------+-----------+---------+-----+
|title|  id|in_scanr|    work_id|citations|in_oa|
+-----+----+--------+-----------+---------+-----+
| NULL|NULL|    NULL|W3118980988|        0|    1|
| NULL|NULL|    NULL|W2588093740|        9|    1|
| NULL|NULL|    NULL|W4232756027|        0|    1|
| NULL|NULL|    NULL|W4245439636|        0|    1|
| NULL|NULL|    NULL|W4241699515|        0|    1|
| NULL|NULL|    NULL|W3122775396|        3|    1|
| NULL|NULL|    NULL|W4248855243|        0|    1|
| NULL|NULL|    NULL|W4256290495|        0|    1|
| NULL|NULL|    NULL|W3083674946|        0|    1|
| NULL|NULL|    NULL|W4235757433|        1|    1|
| NULL|NULL|    NULL|W4230365602|        0|    1|
| NULL|NULL|    NULL|W4229643486|        0|    1|
| NULL|NULL|    NULL|W4230599589|        0|    1|
| NULL|NULL|    NULL|W4243087944|        0|    1|
| NULL|NULL|    NULL|W4293191137|        0|    1|
| NULL|NULL|    NULL|W3035799676|        0|    1|
| NULL|NULL|    NULL|W2992696898|        0|    1|


In [41]:
# Among rows without a scanR match, count how many have a NULL title versus a real one.
title_is_null = func.col('title').isNull()
test_merge.where(func.col('in_scanr').isNull()).groupBy(title_is_null).count().show()

+---------------+-----+
|(title IS NULL)|count|
+---------------+-----+
|           true|   66|
|          false|41858|
+---------------+-----+



In [44]:
# Pull 20 rows that exist in scanR but have no OpenAlex match, for manual inspection.
only_in_scanr = test_merge.where(func.col('in_oa').isNull())
only_in_scanr.limit(20).collect()

[Row(title='"accounting treatment of transactions with partners with invalid vat code"', id='doi10.31926/but.es.2019.12.61.1.17', in_scanr=1, work_id=None, citations=None, in_oa=None),
 Row(title='"aquí y ahora": la noción de contrato social en el lesbianismo materialista de monique wittig', id='doi10.5209/infe.60722', in_scanr=1, work_id=None, citations=None, in_oa=None),
 Row(title='"blockchain research in information systems: current trends and an inclusive future research agenda"', id='doi10.17705/1jais.00571', in_scanr=1, work_id=None, citations=None, in_oa=None),
 Row(title='"chantons noël"(à propos de la ballade xi de l’adolescence clémentine)', id='doi10.4000/babel.5669', in_scanr=1, work_id=None, citations=None, in_oa=None),
 Row(title='"cosa guardiamo quando guardiamo un evento sportivo"? il ruolo dell’ultimo uomo nel recente panorama del giornalismo sportivo in italia', id='doi10.4000/italies.7739', in_scanr=1, work_id=None, citations=None, in_oa=None),
 Row(title='"distribu

In [48]:
# One row per (external identifier, publication id), then count the rows per identifier type.
exploded_external_ids = df_scanR.select(
    func.explode(func.col('externalIds')).alias('externalIds'),
    'id',
)
exploded_external_ids.groupBy('externalIds.type').count().show()

+-----+-------+
| type|  count|
+-----+-------+
|  doi|2322332|
|  hal|2479391|
|  nnt| 444248|
|scanr|2322332|
| pmid| 551349|
+-----+-------+

