In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last
from pyspark.sql.functions import array_union               # Union of 2 array without duplicates
from pyspark.sql.functions import coalesce                  # First non-null value among columns
from pyspark.sql.functions import lit, array


# Entry point to the PySpark application: build the session with a local
# master, or reuse the one that already exists (getOrCreate).
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [2]:
# IMPORT
# Load the four JSON datasets (book, article, incollection, www) used by the queries.
# DATA_DIR is a relative path, so it is resolved against the current working
# directory of the kernel; change it here (once) if the datasets live elsewhere.
DATA_DIR = "Datasets (json)"


def read_json_dataset(name):
    """Read ``<DATA_DIR>/<name>-db.json`` into a DataFrame.

    Args:
        name: Dataset prefix, e.g. ``"book"`` for ``book-db.json``.

    Returns:
        The DataFrame returned by ``spark.read.json`` with the ``multiline``
        option set to ``"true"``.
    """
    return spark.read.option("multiline", "true").json(f"{DATA_DIR}/{name}-db.json")


df_book = read_json_dataset("book")
df_article = read_json_dataset("article")
df_incollection = read_json_dataset("incollection")
df_www = read_json_dataset("www")

AnalysisException: Path does not exist: file:/c:/Users/chris/OneDrive/Documenti/GitHub/SMBUD-project-1/Spark/Query/Datasets (json)/book-db.json

### 1. Most cited AUTHORS in a given field
The following query finds the authors who wrote about a given field (e.g. "data mining"),
together with their website (if one exists), ordered by the total number of citations they
received for those publications.

In [69]:
# Query parameters: the keyword that identifies the field and how many authors to keep.
FIELD_KEYWORD = "data mining"
TOP_N_AUTHORS = 5

# One row per (article, author), restricted to articles tagged with the chosen keyword.
exploded_df_article = (
    df_article
    .filter(array_contains(col("keyword"), FIELD_KEYWORD))
    .select(
        df_article.key,
        explode(df_article.author).alias("author"),
        df_article.keyword,
        df_article.citations,
    )
)

# One row per (www entry, author), keeping the url column of that entry.
exploded_df_www = df_www.select(
    df_www.key,
    explode(df_www.author).alias("author"),
    df_www.url,
)

# Citation statistics per author, keeping only the most cited ones.
# NOTE: sum/avg/count/max are the pyspark.sql.functions aggregates imported at
# the top of the notebook, not the Python builtins.
df_top_author = (
    exploded_df_article
    .groupBy("author")
    .agg(
        sum("citations").alias("Sum of Citations"),
        avg("citations").alias("Average Citations"),
        count("citations").alias("Number of Paper"),
        max("citations").alias("Max Citations"),
    )
    .sort(col("Sum of Citations").desc())
    .limit(TOP_N_AUTHORS)
)

# Author -> website lookup. Columns are renamed so they cannot be confused with
# the columns of df_top_author in the join below.
df_website = exploded_df_www.select(
    col("author").alias("author_w"),
    col("url").alias("website"),
)

# Left join: authors without a www entry are kept, with a null website.
# A join does not guarantee that the row order of its inputs is preserved, so
# the ranking is applied again on the joined result.
result = (
    df_top_author
    .join(df_website, df_top_author.author == df_website.author_w, "left")
    .select(
        col("author"),
        col("website"),
        col("Sum of Citations"),
        col("Average Citations"),
        col("Number of Paper"),
        col("Max Citations"),
    )
    .sort(col("Sum of Citations").desc())
)

result.printSchema()
result.show()

root
 |-- author: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- orcid: string (nullable = true)
 |-- website: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Sum of Citations: long (nullable = true)
 |-- Average Citations: double (nullable = true)
 |-- Number of Paper: long (nullable = false)
 |-- Max Citations: long (nullable = true)

+--------------------+--------------------+----------------+-----------------+---------------+-------------+
|              author|             website|Sum of Citations|Average Citations|Number of Paper|Max Citations|
+--------------------+--------------------+----------------+-----------------+---------------+-------------+
|{Clement T. Yu, C...|[http://www.cs.ui...|             231|             77.0|              3|          138|
|{Kenneth C. Sevci...|[http://www.cs.ut...|             229|            114.5|              2|          126|
|    {K. Lam, K. Lam}|                null|             20

### 2. Add a KEYWORD to a set of publications
Here we can see a specific example: the keyword "machine learning" is bound to the books
in the "Intelligent Systems Reference Library" series, from volume 85 to volume 100.

In [None]:
# Parameters of the update: which books are affected and which keyword is attached.
SERIES_TITLE = "Intelligent Systems Reference Library"
FIRST_VOLUME = 85
LAST_VOLUME = 100
NEW_KEYWORD = "machine learning"

# Books of the selected series within the volume range (bounds inclusive), plus
# a helper column "add" holding the one-element array that will be merged in.
result = (
    df_book
    .filter(df_book.series.title == SERIES_TITLE)
    .filter(df_book.volume.between(FIRST_VOLUME, LAST_VOLUME))
    .select("key", "keyword", "volume")
    .withColumn("add", array(lit(NEW_KEYWORD)))
)

# Before: current keywords next to the array about to be added.
result.show(truncate=False)

# array_union yields null when either input is null, so a book with a null
# keyword column would end up with null instead of the new keyword; coalesce
# falls back to the one-element array in that case.
result_with_keyword = (
    result
    .withColumn("keyword", coalesce(array_union(result.keyword, result.add), result.add))
    .drop("add")
)

# After: keyword column with the new keyword merged in (no duplicates).
result_with_keyword.show(truncate=False)
