In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last

# Entry point to the PySpark application: every DataFrame below is read
# through this session. getOrCreate() hands back an already-running session
# when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [8]:
# IMPORT
# Load the four JSON datasets into DataFrames. Paths are relative to the
# notebook's working directory.


def read_multiline_json(path):
    """Read the JSON file at `path` with the "multiline" reader option set to "true"."""
    return spark.read.option("multiline", "true").json(path)


df_book = read_multiline_json("Datasets (json)/book-db.json")
df_article = read_multiline_json("Datasets (json)/article-db.json")
df_incollection = read_multiline_json("Datasets (json)/incollection-db.json")
df_www = read_multiline_json("Datasets (json)/www-db.json")

<h3>1. Find SERIES about relational databases </h3>
Series that contain books about relational databases and whose most recent such book was published after 2018, ordered by the year of the most recent publication (ties broken by the number of books, both descending)

In [122]:
# Step 1: keep only books that belong to a series and are tagged with the
# "relational databases" keyword.
relational_db_books = df_book.filter(
    col("series").isNotNull()
    & array_contains(col("keyword"), "relational databases")
)

# Step 2: per series, count the matching books and find the latest year.
# `count` and `max` are the pyspark.sql.functions aggregates imported above,
# not the Python built-ins.
series_stats = (
    relational_db_books
    .groupBy("series")
    .agg(
        count("key").alias("count"),
        max("year").alias("Most recent pub"),
    )
    # Only series whose latest matching book is newer than 2018.
    .filter(col("Most recent pub") > 2018)
    # Newest series first; ties broken by the number of matching books.
    .sort(col("Most recent pub").desc(), col("count").desc())
)

# Step 3: flatten the `series` struct into presentation columns.
result = series_stats.select(
    col("series.title").alias("Series title"),
    col("series.href").alias("Series key"),
    col("count").alias("Number of books"),
    col("Most recent pub"),
)

result.show()

+--------------------+--------------------+---------------+---------------+
|        Series title|          Series key|Number of books|Most recent pub|
+--------------------+--------------------+---------------+---------------+
|Studies in Comput...|db/series/sci/ind...|              5|           2022|
|Springer Briefs i...|db/series/sbcs/in...|              3|           2022|
|Synthesis Lecture...|db/series/synthes...|              1|           2021|
|History of Computing|db/series/hoc/ind...|              1|           2021|
|Synthesis Lecture...|db/series/synthes...|              1|           2021|
|Lecture Notes in ...|db/series/lncs/in...|              2|           2020|
|Intelligent Syste...|db/series/isrl/in...|              2|           2020|
|Texts in Computer...|db/series/txcs/in...|              1|           2020|
|Monographs in The...|db/series/eatcs/i...|              1|           2020|
+--------------------+--------------------+---------------+---------------+



<h3> 2. Top KEYWORDS used with another keyword </h3>

In [92]:
# Books and articles are reduced to the same two columns so they can be
# stacked into one "publications" DataFrame.
pub_book = df_book.select("key", "keyword")
pub_article = df_article.select("key", "keyword")

# union() matches columns by position; both sides are (key, keyword).
df_publications = pub_book.union(pub_article)

# For every publication tagged "data mining", emit one row per keyword,
# drop the "data mining" rows themselves, and count how often each of the
# remaining keywords appears.
result = (
    df_publications
    .filter(array_contains(col("keyword"), "data mining"))
    .select(col("key"), explode("keyword").alias("keyword"))
    .filter(col("keyword") != "data mining")
    .groupBy("keyword")
    .count()
    .sort(col("count").desc())
)

result.show(truncate=False)

+----------------------------+-----+
|keyword                     |count|
+----------------------------+-----+
|minimum path                |20   |
|database design             |20   |
|dijkstra                    |20   |
|software                    |20   |
|c++                         |19   |
|interfaces                  |17   |
|computer science engineering|17   |
|q-learning                  |17   |
|A*                          |17   |
|key-value databases         |16   |
|java                        |16   |
|neo4j                       |15   |
|machine learning            |15   |
|c89                         |15   |
|graph model                 |15   |
|big data                    |14   |
|javascript                  |14   |
|graphs                      |14   |
|ai                          |13   |
|markov decision process     |13   |
+----------------------------+-----+
only showing top 20 rows



<h3> 3. Top BOOKS about a specific topic </h3>
Find books containing at least 2 incollections about deep learning or machine learning

In [120]:
# Incollections tagged with either of the two topics of interest.
about_deep_learning = array_contains(col("keyword"), "deep learning")
about_machine_learning = array_contains(col("keyword"), "machine learning")

incollection_kw = (
    df_incollection
    .filter(about_deep_learning | about_machine_learning)
    .select("key", "crossref", "citations")
)

# Columns identifying a book. They are referenced through df_book because
# incollection_kw also has a column named "key".
book_columns = [df_book.key, df_book.title, df_book.year]

# An incollection points at its containing book through `crossref`.
# Per book: sum the citations of the matching incollections and count them,
# keep books with at least 2 matches, then order by year and citations.
result = (
    df_book
    .join(incollection_kw, df_book.key == incollection_kw.crossref)
    .select(*book_columns, incollection_kw.key, incollection_kw.citations)
    .groupBy(*book_columns)
    .agg(
        sum(incollection_kw.citations).alias("citations sum"),
        count(incollection_kw.key).alias("count"),
    )
    .filter(col("count") >= 2)
    .sort(col("year").desc(), col("citations sum").desc())
    .select("title", "year", "citations sum", "count")
)

result.show()

+--------------------+----+-------------+-----+
|               title|year|citations sum|count|
+--------------------+----+-------------+-----+
|Software Sustaina...|2021|          142|    2|
|Towards Interoper...|2020|          136|    2|
|Integrating Resea...|2020|           57|    2|
|Encyclopedia of E...|2020|         null|    2|
|Encyclopedia of D...|2018|          700|   15|
|Applications of B...|2018|           32|    2|
|Developing Suppor...|2018|         null|    2|
|Software for Exas...|2016|         null|    2|
+--------------------+----+-------------+-----+



In [106]:
# Most used keywords across all incollections: one row per
# (incollection, keyword) pair, then a frequency count per keyword.
exploded_kw = (
    df_incollection
    .filter(col("keyword").isNotNull())
    .select(col("key"), explode("keyword"))
)

# explode() was given no alias, so its output column carries the default
# name "col"; that is the column grouped on here.
keyword_counts = exploded_kw.groupBy("col").count()
keyword_counts.sort(col("count").desc()).show(truncate=False)

+----------------------------+-----+
|col                         |count|
+----------------------------+-----+
|cloud computing             |132  |
|debugging                   |127  |
|deep learning               |127  |
|graph model                 |126  |
|javascript                  |126  |
|key-value databases         |125  |
|html                        |124  |
|ai                          |124  |
|software engineering        |123  |
|interfaces                  |123  |
|data processing             |123  |
|path finding                |122  |
|python                      |121  |
|software                    |120  |
|A*                          |120  |
|unstructured data           |119  |
|computer science engineering|119  |
|c#                          |119  |
|mongodb                     |118  |
|graph search                |116  |
+----------------------------+-----+
only showing top 20 rows

