In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last

# Build (or reuse, if one already exists) the SparkSession that acts as the
# entry point to this PySpark application; the master is set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [8]:
# Load the four JSON datasets into DataFrames.
# Every read enables the "multiline" option, so a single JSON record is
# allowed to span several lines of the input file.

# Books
df_book = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/book-db.json")
)

# Journal articles
df_article = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/article-db.json")
)

# Contributions to collections
df_incollection = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/incollection-db.json")
)

# Web pages
df_www = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/www-db.json")
)

<h3>1. Find SERIES about relational databases </h3>
Series that contain at least 2 books about relational databases, ordered by the year of their most recent publication (newest first; ties are broken by the number of books, largest first).

In [83]:
# Series holding at least two books tagged "relational databases",
# listed from the most recently published series downwards.
result = (
    df_book
    # Keep books that belong to a series and carry the target keyword.
    .filter(
        col("series").isNotNull()
        & array_contains(col("keyword"), "relational databases")
    )
    # One group per series: number of matching books and latest year.
    .groupBy("series")
    .agg(
        count("key").alias("count"),
        max("year").alias("Most recent pub"),
    )
    # Discard series with fewer than two matching books.
    .filter(col("count") >= 2)
    # Newest first; ties go to the series with more books.
    .sort(col("Most recent pub").desc(), col("count").desc())
    # Flatten the series struct into readable output columns.
    .select(
        col("series.title").alias("Series title"),
        col("series.href").alias("Series key"),
        col("count").alias("Number of books"),
        col("Most recent pub"),
    )
)

result.show()

+--------------------+--------------------+---------------+---------------+
|        Series title|          Series key|Number of books|Most recent pub|
+--------------------+--------------------+---------------+---------------+
|Studies in Comput...|db/series/sci/ind...|              5|           2022|
|Springer Briefs i...|db/series/sbcs/in...|              3|           2022|
|Intelligent Syste...|db/series/isrl/in...|              2|           2020|
|Lecture Notes in ...|db/series/lncs/in...|              2|           2020|
|Studies in Fuzzin...|db/series/sfsc/in...|              2|           2015|
+--------------------+--------------------+---------------+---------------+



In [63]:
# Most used keywords in books.
# explode() emits one row per (book key, keyword) pair; the exploded column
# keeps its default name "col", which is what the grouping below refers to.
exploded_kw = (
    df_book
    .filter(col("keyword").isNotNull())
    .select(df_book.key, explode("keyword"))
)

# Count occurrences of each keyword and print them, most frequent first,
# without truncating long keyword strings.
(
    exploded_kw
    .groupBy("col")
    .count()
    .sort(col("count").desc())
    .show(truncate=False)
)

+-----------------------+-----+
|col                    |count|
+-----------------------+-----+
|q-learning             |62   |
|relational databases   |61   |
|graph database         |52   |
|language processing    |51   |
|A*                     |50   |
|mongodb                |50   |
|computer science       |49   |
|minimum path           |48   |
|database design        |48   |
|software               |48   |
|data mining            |47   |
|computer architectures |46   |
|machine learning       |45   |
|markov decision process|45   |
|multiagent systems     |45   |
|c#                     |44   |
|neo4j                  |44   |
|graphs                 |43   |
|data processing        |43   |
|c89                    |42   |
+-----------------------+-----+
only showing top 20 rows

