In [87]:
import findspark
findspark.init()

In [89]:
import pyspark
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

import json

In [90]:
conf = SparkConf()\
    .setMaster('local[*]')\
    .setAppName('Json analysis with rdd')\
    .setExecutorEnv('spark.driver.memory','4g')\
    .setExecutorEnv('spark.executor.memory','4g')

spark = SparkSession.builder.config(conf=conf).getOrCreate()

sc = spark.sparkContext

In [1]:
##### data source: https://www.kaggle.com/datasets/Cornell-University/arxiv

In [91]:
rdd_json = sc.textFile('arxiv-metadata-oai-snapshot.json', 100)

In [92]:
rdd = rdd_json.map(lambda x: json.loads(x))

In [93]:
from pyspark import StorageLevel

# setting storage level to MEMORY_AND_DISK
rdd.persist(StorageLevel.MEMORY_AND_DISK)

PythonRDD[62] at RDD at PythonRDD.scala:49

In [94]:
rdd.take(1)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [95]:
print(f'Number of partitions: {rdd.getNumPartitions()}')
print(f'Default paralellism:  {sc.defaultParallelism}')

Number of partitions: 100
Default paralellism:  16


### 1. Count Elements

In [24]:
rdd.count()

2011231

### 2. Get the first 2 records

In [96]:
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

### 3. Get all attributes

In [26]:
attributes = rdd.flatMap(lambda x: x.keys()).distinct().collect()
print(attributes)

['authors', 'comments', 'title', 'id', 'journal-ref', 'versions', 'submitter', 'categories', 'update_date', 'authors_parsed', 'report-no', 'license', 'abstract', 'doi']


### 4. Get the name of the license

In [21]:
licenses = rdd.map(lambda x: x['license']).distinct().collect()
print(licenses)

[None, 'http://creativecommons.org/licenses/publicdomain/', 'http://creativecommons.org/licenses/by-nc-nd/4.0/', 'http://creativecommons.org/licenses/by-nc-sa/4.0/', 'http://creativecommons.org/licenses/by-nc-sa/3.0/', 'http://creativecommons.org/licenses/by/3.0/', 'http://creativecommons.org/licenses/by/4.0/', 'http://creativecommons.org/publicdomain/zero/1.0/', 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'http://creativecommons.org/licenses/by-sa/4.0/']


### 5. The shortest and longest titles

In [23]:
shortest_title = rdd.map(lambda x: x['title']).reduce(lambda x,y: x if x < y else y)
longest_title =  rdd.map(lambda x: x['title']).reduce(lambda x,y: x if x > y else y)

print('Shortest title :', shortest_title)
print('Longest title :', longest_title)

Shortest title : !-Graphs with Trivial Overlap are Context-Free
Longest title : Weyl formula for the negative dissipative eigenvalues of Maxwell's
  equations


### 6. Find abbrevations with 5 or more letters in the abstract

In [54]:
import re

def get_abbrivations(line):
    result = re.search(r"\(([A-Z][^!@#/+-_<>.,$%())]{5,})\)", line)
    if result:
        return result.group(1)

abbv_rdd = rdd.filter(lambda x: get_abbrivations(x['abstract']))
abbv_rdd.count()

In [44]:
abbv_rdd.take(1)

[{'id': '0704.0261',
  'submitter': 'Scott Randall',
  'authors': 'Scott W. Randall, Maxim Markevitch, Douglas Clowe, Anthony H.\n  Gonzalez, and Marusa Bradac',
  'title': 'Constraints on the Self-Interaction Cross-Section of Dark Matter from\n  Numerical Simulations of the Merging Galaxy Cluster 1E 0657-5',
  'comments': None,
  'journal-ref': None,
  'doi': '10.1086/587859',
  'report-no': None,
  'categories': 'astro-ph',
  'license': None,
  'abstract': "  (Abridged) We compare recent results from X-ray, strong lensing, weak\nlensing, and optical observations with numerical simulations of the merging\ngalaxy cluster 1E0657-56. X-ray observations reveal a bullet-like subcluster\nwith a prominent bow shock, while lensing results show that the positions of\nthe total mass peaks are consistent with the centroids of the collisionless\ngalaxies (and inconsistent with the X-ray brightness peaks). Previous studies,\nbased on older observational datasets, have placed upper limits on the\ns

### 7. Get the number of archive reports per months (update_date)

In [60]:
from datetime import datetime

def extract_date(DateIn):
    return datetime.strptime(DateIn, '%Y-%m-%d').month

extract_date('2004-05-23')

5

In [61]:
rdd.map(lambda x: (extract_date(x['update_date']), 1))\
    .reduceByKey(lambda x, y: x + y)\
    .collect()

[(1, 134247),
 (2, 116948),
 (3, 126458),
 (4, 117126),
 (5, 296587),
 (6, 191746),
 (7, 122649),
 (8, 138469),
 (9, 138978),
 (10, 197755),
 (11, 297963),
 (12, 132305)]

In [62]:
rdd.map(lambda x: (extract_date(x['update_date']), 1))\
    .reduceByKey(lambda x, y: x + y)\
    .sortBy(lambda x: x[1])\
    .collect()

[(2, 116948),
 (4, 117126),
 (7, 122649),
 (3, 126458),
 (12, 132305),
 (1, 134247),
 (8, 138469),
 (9, 138978),
 (6, 191746),
 (10, 197755),
 (5, 296587),
 (11, 297963)]

### 8. Get average number of pages 

In [97]:
import re

def get_page(line):
    search = re.findall('\d+ pages', line)
    if search:
        return int(search[0].split(' ')[0])
    else: 
        return 0

get_page('123 pages')

123

In [103]:
rdd_avg = rdd.map(lambda x: get_page(x['comments'] if x['comments'] is not None else 'None'))

# remove 0's
rdd_avg = rdd_avg.filter(lambda x: x!=0)
counter = rdd_avg.count()
summ = rdd_avg.reduce(lambda x,y: x+y)

print(f'Count: {counter} \nSum: {summ} \nAverage: {round(summ / counter)}')

Count: 1184075 
Sum: 21139516 
Average: 18


In [104]:
max_page = rdd_avg.reduce(lambda x,y: x if x>y else y)
min_page = rdd_avg.reduce(lambda x,y: x if x<y else y)

print(f'Max page: {max_page} \nMin page: {min_page}')


Max page: 11232 
Min page: 1
