In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a Spark session running on the
# "local" master, registered under this notebook's application name.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL delivery 3")
    .getOrCreate()
)

# Keep a handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Every dataset below is a JSON source addressed by a path; the path may name
# either a single text file or a directory that stores text files.
# Each path constant is declared right next to the DataFrame read from it.

# Publications
publications_file = "publications.json"
publicationsDF = spark.read.json(publications_file)

# Venues
venues_file = "venues.json"
venuesDF = spark.read.json(venues_file)

# Fields of study
fos_file = "fos.json"
fosDF = spark.read.json(fos_file)

# Relation read from rel_dw.json (joined later on its fos_id / pub_id columns)
rel_dw_file = "rel_dw.json"
rel_dwDF = spark.read.json(rel_dw_file)

# Authors
authors_file = "authors.json"
authorsDF = spark.read.json(authors_file)

In [None]:
# The inferred schema can be visualized using the printSchema() method
# publicationsDF.printSchema()

In [None]:
# publicationsDF.show(truncate=True)

In [6]:
from pyspark.sql.functions import col

In [10]:
# WHERE, JOIN
# Titles of the publications linked, through rel_dwDF, to the field of study
# named "Artificial intelligence".
(
    fosDF
    .filter(col("name") == "Artificial intelligence")
    # field of study -> relation rows
    .join(rel_dwDF, fosDF.id == rel_dwDF.fos_id, "inner")
    # relation rows -> publications
    .join(publicationsDF, rel_dwDF.pub_id == publicationsDF.id, "inner")
    .select("title")
    .show(truncate=False)
)


+--------------------------------------------------------------------------------------------------+
|title                                                                                             |
+--------------------------------------------------------------------------------------------------+
|A methodology for the physically accurate visualisation of roman polychrome statuary              |
|Comparison of GARCH, Neural Network and Support Vector Machine in Financial Time Series Prediction|
|COMPARING GNG3D AND QUADRIC ERROR METRICS METHODS TO SIMPLIFY 3D MESHES                           |
|Vectorial fast correlation attacks.                                                               |
|Improved Secret Image Sharing Method By Encoding Shared Values With Authentication Bits           |
|Identifying Psychological Theme Words from Emotion Annotated Interviews                           |
|A COMPUTATIONAL SALIENCY MODEL INTEGRATING SACCADE PROGRAMMING                            

In [51]:
# WHERE, LIMIT, LIKE
from pyspark.sql.functions import expr

# At most five authors whose affiliation contains "Politecnico".
# The author id is renamed to "authorId" so that the unqualified name used in
# the join expression below cannot be confused with publicationsDF's own "id".
politecnico_authors = (
    authorsDF
    .withColumnRenamed("id", "authorId")
    .filter(col("affiliation").like("%Politecnico%"))
    .limit(5)
)

# Match each selected author with every publication whose "authors" array
# contains that author's id, then display title, author name and affiliation.
(
    politecnico_authors
    .join(publicationsDF, expr("array_contains(authors, authorId)"))
    .select(
        col("title").alias("publicationTitle"),
        col("name").alias("authorName"),
        "affiliation",
    )
    .show(truncate=False)
)


+-----------------------------------------------------------------------+---------------------+---------------------+
|publicationTitle                                                       |authorName           |affiliation          |
+-----------------------------------------------------------------------+---------------------+---------------------+
|COMPARING GNG3D AND QUADRIC ERROR METRICS METHODS TO SIMPLIFY 3D MESHES|José-Francisco Vicent|Politecnico di Bari  |
|Vectorial fast correlation attacks.                                    |Jovan Dj. Golic      |Politecnico di Bari  |
|Cleaneval: a Competition for Cleaning Web Pages                        |Marco Baroni         |Politecnico di Milano|
|Face Detection, Recognition in an Image Sequence Using Eigenedginess.  |B. Yegnanarayana     |Politecnico di Bari  |
|Speech recognition based on the integration of FSVQ and neural network.|Li-Qun Xu            |Politecnico di Bari  |
+-----------------------------------------------------------------------+---------------------+---------------------+

In [None]:
# Count the publications of these authors to check how many papers each of
# them has published.
# expr is imported here so the cell does not depend on an earlier cell's import.
from pyspark.sql.functions import expr

# The same selection as before: at most five authors whose affiliation
# contains "Politecnico", with "id" renamed to "authorId" for the join below.
politecnico_authors = (
    authorsDF
    .withColumnRenamed("id", "authorId")
    .filter(col("affiliation").like("%Politecnico%"))
    .limit(5)
)
politecnico_authors.show()

# One row per selected author with the number of publications whose "authors"
# array contains the author's id. This is an inner join, so an author without
# any matching publication does not appear in the result at all.
(
    politecnico_authors
    .join(publicationsDF, expr("array_contains(authors, authorId)"))
    .groupBy("authorId", "name")
    .count()
    .withColumnRenamed("count", "publicationCount")
    .show(truncate=False)
)