In [10]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook (getOrCreate returns an already
# running session if there is one), using the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL delivery 3")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [20]:
# Every dataset is loaded from JSON. The path handed to spark.read.json may
# name either a single text file or a directory of text files.
# Each path is declared right next to the DataFrame that is read from it.

publications_file = "publications.json"
publicationsDF = spark.read.json(publications_file)

venues_file = "venues.json"
venuesDF = spark.read.json(venues_file)

fos_file = "fos.json"
fosDF = spark.read.json(fos_file)

rel_dw_file = "rel_dw.json"
rel_dwDF = spark.read.json(rel_dw_file)

authors_file = "authors.json"
authorsDF = spark.read.json(authors_file)

In [12]:
# Print the schema Spark inferred for the publications DataFrame as a tree:
# one entry per column with its type and nullability.
publicationsDF.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- pages: long (nullable = true)
 |-- publisher: string (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- venue: long (nullable = true)



In [13]:
# Preview the publications; show() truncates long cell values by default.
publicationsDF.show()

+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+----------+----------+
|            abstract|             authors|  id|pages|           publisher|          references|               title|      type|     venue|
+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+----------+----------+
|Abstract of Preli...|[2312688602, 2482...|1091|    6|Springer, Berlin,...|[1091, 6762, 1674...|Preliminary Desig...|Conference|1127419992|
|Abstract of Furth...|        [2718958994]|1388|   13|   PoliPrint, Milano|  [8763, 1674, 1688]|Further Results o...|   Journal|  73158690|
|Abstract of A met...|[2103626414, 2117...|1674|   11|Eurographics Asso...|        [5781, 1091]|A methodology for...|Conference|2754954274|
|Abstract of Compa...|[2300589394, 2308...|1688|    3|Springer, Berlin,...|[1388, 5411, 8373...|Comparison of GAR...|Conference|1136274694|
|Abstract of COMPA..

In [14]:
# Show just the `authors` column; truncate=False prints each array in full
# instead of cutting long cells short.
publicationsDF.select("authors").show(truncate=False)

+------------------------------------------------------------------------------------------------+
|authors                                                                                         |
+------------------------------------------------------------------------------------------------+
|[2312688602, 2482909946, 2128134587, 2101782692, 2114054191, 1989208940, 2134989941, 2307479915]|
|[2718958994]                                                                                    |
|[2103626414, 2117665592]                                                                        |
|[2300589394, 2308774408, 2126056503, 2425818370]                                                |
|[2125293936, 2101693188, 2159120860, 2146570697]                                                |
|[1237859792, 220887178]                                                                         |
|[2022192081, 2023460672, 2174205032]                                                            |
|[21422490

In [24]:
from pyspark.sql.functions import col

In [19]:
# Fields of study, with the default display settings (long cells truncated).
fosDF.show()

# Publications with full, untruncated cell contents.
publicationsDF.show(truncate=False)

+--------------------+--------------------+
|                  id|                name|
+--------------------+--------------------+
|ac6663816c9635e15...|Telecommunication...|
|284fcfb183d191953...|    Computer science|
|d17475f16d76e4052...|            Mind map|
|c2a5462d06dd702e2...|Human–computer in...|
|2f56b4f336dc97edf...|          Multimedia|
|ff369ad079366681e...|  Empirical research|
|28e169980e17fc27c...|       Comprehension|
|2e74da7ce756356a0...|Communications pr...|
|4cdbd2bafa8193091...|               Graph|
|27ce971356df02c63...|Discrete mathematics|
|6c2f06ae9649fffd1...|       Combinatorics|
|05df30932021c3376...|      Direct product|
|540b21ecdb276f508...|         Mathematics|
|f34b29e2dd11d27c2...|              Statue|
|e3df226c8bed88438...| Engineering drawing|
|7c0d914a5aa9dc8f2...|Virtual reconstru...|
|ce09e3d6818263940...|       Visualization|
|1e1b9006b2ad5f189...|          Polychrome|
|9d0996a44c6d51cf2...|Artificial intell...|
|b2a57f84041a796df...|Autoregres

In [32]:
# WHERE + JOIN: restrict the fields of study to "Artificial intelligence",
# then inner-join through rel_dw (fos_id / pub_id) to the publications.
# NOTE: the result carries two columns named `id` (one from fosDF, one from
# publicationsDF), so referring to `id` by name on the joined frame would be
# ambiguous.
(
    fosDF
    .filter(col("name") == "Artificial intelligence")
    .join(rel_dwDF, fosDF.id == rel_dwDF.fos_id, "inner")
    .join(publicationsDF, rel_dwDF.pub_id == publicationsDF.id, "inner")
    .show(truncate=True)
)


+--------------------+--------------------+--------------------+------+-------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+----------+----------+
|                  id|                name|              fos_id|pub_id| weight|            abstract|             authors|  id|pages|           publisher|          references|               title|      type|     venue|
+--------------------+--------------------+--------------------+------+-------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+----------+----------+
|9d0996a44c6d51cf2...|Artificial intell...|9d0996a44c6d51cf2...|  1674|0.40496|Abstract of A met...|[2103626414, 2117...|1674|   11|Eurographics Asso...|        [5781, 1091]|A methodology for...|Conference|2754954274|
|9d0996a44c6d51cf2...|Artificial intell...|9d0996a44c6d51cf2...|  1688|    0.0|Abstract of Compa...|[2300589394, 2308...|1688|  