In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally, using all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Print the session's repr to confirm that it was created.
print(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/15 20:43:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<pyspark.sql.session.SparkSession object at 0x10c912a60>


In [3]:
# Load the arXiv metadata snapshot without an explicit schema, so Spark infers
# the column names and types from the JSON records.
# (`spark.read.json(path)` is the shorthand for the same read.)
df = (
    spark.read
    .format("json")
    .load("data/arxiv-metadata-oai-snapshot.json")
)

# Show the inferred schema as a tree.
df.printSchema()

                                                                                

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



In [4]:
# Count the partitions of the RDD backing `df`; as the cell's last expression,
# the value is displayed as the cell output.
df.rdd.getNumPartitions()

25

In [7]:
# Preview the first 10 rows of the DataFrame as a text table.
df.show(10)

+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------------------+
|            abstract|             authors|      authors_parsed|       categories|            comments|                 doi|       id|         journal-ref|             license|           report-no|         submitter|               title|update_date|            versions|
+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------------------+
|  A fully differe...|C. Bal\'azs, E. L...|[[Balázs, C., ], ...|           hep-ph|37 pages, 15 figu...|10.1103/PhysRevD....|0704.0001|Phys.Rev.D76:0130...|                null|    ANL-HEP

## Create a New Schema

In [10]:
from pyspark.sql.types import *

# Explicit schema: keep six of the source columns, all nullable.
# The five text columns share the same definition, so they are generated from
# their names. `versions` is declared as an array of strings; the inferred
# schema reports it as an array of structs, so with this declaration each
# entry is read back as JSON text.
schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ('authors', 'categories', 'license', 'comments', 'abstract')
    ]
    + [StructField('versions', ArrayType(StringType()), True)]
)

print(schema)

StructType([StructField('authors', StringType(), True), StructField('categories', StringType(), True), StructField('license', StringType(), True), StructField('comments', StringType(), True), StructField('abstract', StringType(), True), StructField('versions', ArrayType(StringType(), True), True)])


In [12]:
# Re-read the snapshot, this time applying the explicit `schema` so that only
# the declared columns are loaded.
df = (
    spark.read
    .format('json')
    .schema(schema)
    .load('data/arxiv-metadata-oai-snapshot.json')
)

# Preview the first 10 rows.
df.show(10)

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            comments|            abstract|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|                null|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|                null| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|          math.CO|                null|            11 pages|  We show that a ...|[{"version":"v1",...|
|Wael Abu-Shammala...|  math.CA math.FA|                null|                null|  In this paper w...|[{"version":"v1",...|


## Handle missing values: drop rows without `comments`, fill missing `license`

In [13]:
# Drop the rows that have no `comments`, then replace missing `license`
# values with the placeholder 'unknown'.
df = (
    df
    .dropna(subset=['comments'])
    .fillna(value='unknown', subset=['license'])
)

df.show(10)

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            comments|            abstract|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|             unknown|37 pages, 15 figu...|  A fully differe...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|To appear in Grap...|  We describe a n...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|             unknown| 23 pages, 3 figures|  The evolution o...|[{"version":"v1",...|
|        David Callan|          math.CO|             unknown|            11 pages|  We show that a ...|[{"version":"v1",...|
|Y. H. Pong and C....|cond-mat.mes-hall|             unknown|6 pages, 4 figure...|  We study the tw...|[{"version":"v1",...|


In [None]:
from pyspark.sql.functions import asc, col

# Count rows per exact `categories` string, keep the strings that start with
# "math" and whose count is not 1, then list them by ascending count.
(
    df
    .groupBy('categories')
    .count()
    .filter((col('count') != 1) & (col('categories').like('math%')))
    .sort(asc('count'))
    .show()
)



+--------------------+-----+
|          categories|count|
+--------------------+-----+
|math.AG math-ph m...|    2|
|       math.OA gr-qc|    2|
|math.DS math.CA q...|    2|
|math.CO cs.CG mat...|    2|
|     math.GM math.AT|    2|
|math.NA math-ph m...|    2|
|math.FA math.LO m...|    2|
|math.DG math.GR m...|    2|
|math.CA cs.IT mat...|    2|
|math.ST astro-ph ...|    2|
|math.OC physics.c...|    2|
|math.MG math-ph m...|    2|
|math.AG math-ph m...|    2|
|math-ph math.GN m...|    2|
|math.QA math.AG m...|    2|
|math-ph math.DS m...|    2|
|math.NT cs.CR cs....|    2|
|math.CO cond-mat....|    2|
|math.RA hep-th ma...|    2|
|math.PR math.OC q...|    2|
+--------------------+-----+
only showing top 20 rows



                                                                                

23/10/19 10:35:43 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 28636421 ms exceeds timeout 120000 ms
23/10/19 10:35:43 WARN SparkContext: Killing executors is not supported by current scheduler.
23/10/19 10:35:43 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage