In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations

# Entry point to the PySpark application: build a session, or reuse the one
# already running in this kernel. master("local") keeps Spark on this machine.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [3]:
# Load the ARTICLE collection; the file is read as multi-line JSON.
df_article = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/article-db.json")
)

# Show the inferred schema so the nested author/keyword arrays are visible.
df_article.printSchema()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- citations: long (nullable = true)
 |-- cite: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ee: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journal: string (nullable = true)
 |-- key: string (nullable = true)
 |-- keyword: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- note: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pages: long (nullable = true)
 |-- publisher: string (nullable = true)
 |-- title: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- year: long (nullable = true)



1. The following query finds the authors who wrote about a given field (e.g. "data mining") and orders them by the total number of citations they received for those publications.

In [4]:
# Keep only the articles whose keyword array contains "data mining", then
# explode the author array so each (article, author) pair becomes its own row.
exploded_df_article = (
    df_article
    .filter(array_contains(col("keyword"), "data mining"))
    .select(
        col("key"),
        explode(col("author")).alias("author"),
        col("keyword"),
        col("citations"),
    )
)
exploded_df_article.printSchema()

# Citation statistics per author, highest total first.
# "author" is a struct (name, orcid), so rows are grouped on both fields.
# sum / avg / count / max are the pyspark.sql.functions versions imported at
# the top of the notebook, not the Python builtins.
(
    exploded_df_article
    .groupBy("author")
    .agg(
        sum("citations").alias("Sum of Citations"),
        avg("citations").alias("Average Citations"),
        count("citations").alias("Number of Paper"),
        max("citations").alias("Max Citations"),
    )
    .orderBy(col("Sum of Citations").desc())
    .show(truncate=False)
)


root
 |-- key: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- orcid: string (nullable = true)
 |-- keyword: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- citations: long (nullable = true)

+----------------------------------------------+----------------+-----------------+---------------+-------------+
|author                                        |Sum of Citations|Average Citations|Number of Paper|Max Citations|
+----------------------------------------------+----------------+-----------------+---------------+-------------+
|{Clement T. Yu, Clement T. Yu}                |231             |77.0             |3              |138          |
|{Kenneth C. Sevcik, Kenneth C. Sevcik}        |229             |114.5            |2              |126          |
|{K. Lam, K. Lam}                              |207             |103.5            |2              |138          |
|{Michael Ley, Michael Ley}  

In [9]:
# Load the WWW collection (multi-line JSON) and expose its "note" column
# under the name "university".
# NOTE(review): this rebinds df_article, which earlier held the ARTICLE data.
# Cells that expect the ARTICLE frame (e.g. the keyword filter) will see this
# frame instead if re-run afterwards — consider a distinct name such as df_www
# once the downstream cells have been checked.
df_article = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/www-db.json")
    .withColumnRenamed("note", "university")
)

df_article.printSchema()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- key: string (nullable = true)
 |-- university: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)

