In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf


conf = SparkConf()\
    .setMaster('local[*]')\
    .setAppName('Json data with DF')\
    .setExecutorEnv('spark.driver.memory','2g')\
    .setExecutorEnv('spark.executor.memory','4g')

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
##### data source: https://www.kaggle.com/datasets/Cornell-University/arxiv

In [3]:
df = spark.read.json('arxiv-metadata-oai-snapshot.json')

In [4]:
df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



In [5]:
df.rdd.getNumPartitions()

25

### 1. Create a new Schema

In [6]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType

manual_schema = StructType([
    StructField('authors', StringType(), True),
    StructField('categories', StringType(), True),
    StructField('license', StringType(), True),
    StructField('comments', StringType(), True),
    StructField('abstract', StringType(), True),
    StructField('versions', ArrayType(StringType()), True)
])

print(manual_schema)

StructType(List(StructField(authors,StringType,true),StructField(categories,StringType,true),StructField(license,StringType,true),StructField(comments,StringType,true),StructField(abstract,StringType,true),StructField(versions,ArrayType(StringType,true),true)))


### 2. Binding data to a manual schema

In [8]:
df = spark.read.json('arxiv-metadata-oai-snapshot.json',
                    schema = manual_schema)

In [10]:
df.printSchema()

root
 |-- authors: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- license: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [9]:
df.show(10)

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            comments|            abstract|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|                null|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|                null| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|          math.CO|                null|            11 pages|  We show that a ...|[{"version":"v1",...|
|Wael Abu-Shammala...|  math.CA math.FA|                null|                null|  In this paper w...|[{"version":"v1",...|


### 3. Missing values for 'comments' ans 'license' attributes

In [10]:
# drop
df = df.dropna(subset=['comments'])

In [11]:
# replace
df = df.fillna(value='unknown', subset=['license'])

In [12]:
df.show(10)

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            comments|            abstract|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|             unknown|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|             unknown| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|          math.CO|             unknown|            11 pages|  We show that a ...|[{"version":"v1",...|
|Y. H. Pong and C....|cond-mat.mes-hall|             unknown|6 pages, 4 figure...|  We study the tw...|[{"version":"v1",...|


### 4. Get the author names who published a paper in a math category

In [14]:
import pyspark.sql.functions as sf
df.filter(sf.col('categories').contains('math')).count()

374379

In [17]:
df.createOrReplaceTempView('d_archive')

query = '''
    select authors from d_archive
    where categories like '%math%'
'''
spark.sql(query).show(10)
print(spark.sql(query).count())

+--------------------+
|             authors|
+--------------------+
|Ileana Streinu an...|
|        David Callan|
|  Sergei Ovchinnikov|
|Clifton Cunningha...|
|        Koichi Fujii|
|         Norio Konno|
|Simon J.A. Malham...|
|Robert P. C. de M...|
|  P\'eter E. Frenkel|
|          Mihai Popa|
+--------------------+
only showing top 10 rows

374379


In [18]:
df.filter(df.categories.contains('math')).count()


374379

In [19]:
df.filter("categories like '%math%'").count()


374379

In [25]:
df.filter("categories rlike 'math'").count()


374379

In [22]:
df.filter("categories rlike 'math|physics'").count()


374379

### 5. Get licenses with 5 or more letters in the abstract

In [32]:
query2 = '''
    select abstract from d_archive
    where abstract regexp '\(([A-Z][^!@#/+-_<>.,$%())]{5,})\)'
'''
spark.sql(query2).show(5)

+--------------------+
|            abstract|
+--------------------+
|  A fully differe...|
|  We describe a n...|
|  The evolution o...|
|  We show that a ...|
|  We study the tw...|
+--------------------+
only showing top 5 rows



### 6. Extract statistics of the number of pages for unknown licenses

In [33]:
import re

def get_page(line):
    search = re.findall('\d+ pages', line)
    if search:
        return int(search[0].split(' ')[0])
    else: 
        return 0

get_page('123 pages')


123

In [34]:
spark.udf.register('get_page_number', get_page)

<function __main__.get_page(line)>

In [38]:
query3 = '''
    select avg(get_page_number(comments)) as avg,
           min(get_page_number(comments)) as min,
           max(get_page_number(comments)) as max,
           std(get_page_number(comments)) as std,
           count(1) as cnt 
    from   d_archive
    where  license = 'unknown'
    and    get_page_number(comments) != 0
'''

spark.sql(query3).show()

+------------------+---+---+------------------+------+
|               avg|min|max|               std|   cnt|
+------------------+---+---+------------------+------+
|15.991180538236561|  1| 99|17.168944606277094|352856|
+------------------+---+---+------------------+------+

