In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import lit, array
from pyspark.sql.functions import when

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("MyFirstSparkApplication") \
      .getOrCreate()

In [41]:
#IMPORT

df_book = spark.read.option("multiline","true")  \
      .json("Datasets/book-db.json")

df_article = spark.read.option("multiline","true")  \
      .json("Datasets/article-db.json")

df_incollection = spark.read.option("multiline","true")  \
      .json("Datasets/incollection-db.json")

df_www = spark.read.option("multiline","true")  \
      .json("Datasets/www-db.json")


In [6]:
df_article.show()
df_book.show()
df_incollection.show()
df_www.show()


+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----+---------+--------------------+------+----+
|              author|citations|                cite|                  ee|             journal|                 key|             keyword|note|pages|publisher|               title|volume|year|
+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----+---------+--------------------+------+----+
|[{Dennis E. Shash...|       81|[journals/is/Jens...|[http://sites.com...|IEEE Data Eng. Bull.|journals/debu/Sha...|[java, graphs, de...|null|   37|     null|Tuning Time Serie...|    22|1999|
|[{Nick Roussopoul...|     null|[conf/sigmod/Blak...|[http://sites.com...|IEEE Data Eng. Bull.|journals/debu/Rou...|[debugging, graph...|null|   27|     null|The ADMS Project:...|    18|1995|
|[{Giansalvatore M...|     null|[www/org

# **1. Top Journals for heterogeneity of topics**
This query is useful to obtain the most heterogeneous journals, i.e. the ones whose articles cover a good number of topics. 
In this case, we want the top 10 journals that have published articles related to at least 25 different keywords.

In [4]:
result = df_article \
    .select(col("journal"), explode("keyword")) \
    .withColumnRenamed("col", "keyword") \
    .groupBy("journal") \
    .agg(countDistinct("journal", "keyword")) \
    .withColumnRenamed("count(journal, keyword)", "number of keywords") \
    .filter(col("number of keywords") > 25) \
    .sort(col("number of keywords").desc())
    
result.limit(10).show(truncate = False)


+----------------------------+------------------+
|journal                     |number of keywords|
+----------------------------+------------------+
|SIGMOD Rec.                 |45                |
|IEEE Trans. Knowl. Data Eng.|45                |
|IEEE Data Eng. Bull.        |45                |
|ACM SIGMOD Digit. Rev.      |45                |
|IWBS Report                 |45                |
|ACM Trans. Database Syst.   |45                |
|VLDB J.                     |45                |
|LILOG-Report                |44                |
|ACM Comput. Surv.           |44                |
|Commun. ACM                 |28                |
+----------------------------+------------------+



# **2. Find books not about a certain topic and related to favourite Publishers**
This query is useful if you want to buy a book which has been published by one of your favourite publishers but that does not treat a certain topic, for example because you have already studied it or because you just simply don't like it.

In [22]:
favourite_publishers = ["CRC Press", "World Scientific", "Routledge", "SIAM", "Atlantis Press"]
hated_topic = "data processing"
df_book_favPubl = df_book.filter(col("publisher").isin(favourite_publishers))
result = df_book_favPubl.filter(array_contains(df_book_favPubl.keyword, hated_topic) == False) \
    .select(col("title"), col("publisher"), col("year"), col("isbn")) \
    .limit(5) \
    .show(truncate=False)

+-------------------------------------------+----------------+----+--------------------------------------+
|title                                      |publisher       |year|isbn                                  |
+-------------------------------------------+----------------+----+--------------------------------------+
|Introduction to Text Visualization         |Atlantis Press  |2016|[978-94-6239-185-7, 978-94-6239-186-4]|
|Cognitive Design for Artificial Minds      |Routledge       |2021|[9781138207950]                       |
|Evaluating Gas Network Capacities          |SIAM            |2015|[978-1-611-97368-6]                   |
|Adaptive Cloud Enterprise Architecture     |World Scientific|2015|[978-981-4632-12-6, 978-981-4632-14-0]|
|Information Theory Tools for Visualization.|CRC Press       |2016|[9781315369228]                       |
+-------------------------------------------+----------------+----+--------------------------------------+



# **3. Update the available URL for a certain author**
This query could be useful in a scenario where an author decides that the link to his webpage (specified by him to the dataframe admin) should be the only link available among the various URLs associated to him in the dataframe, thus deleting all the other URLs uploaded in the past.
Here is shown another possible method to save the keys related to an author, different from the one proposed in the command "Delete PUBLICATIONS of an author".

In [42]:
target_key = df_www  \
    .select(explode(df_www.author), df_www.key)   \
    .withColumnRenamed("col", "author") \
    .filter(col("author.name") == "Elena Ferrari") \
    .select(col("key")) \
    .collect()

target_key = [k[0] for k in target_key]

# show before update 
df_www.filter(df_www.key.isin(target_key)) \
      .show(truncate=False)

df_www = df_www \
    .withColumn("url",
        when(
            df_www.key.isin(target_key), array(lit("http://www.dicom.uninsubria.it/~elena.ferrari/"))
        ) \
        .otherwise(df_www.url)) 

#show after update
df_www.filter(df_www.key.isin(target_key)) \
      .show(truncate=False)
    

+--------------------------------+------------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|author                          |key                     |note                                   |url                                                                                                                                                                                                                                                                             