In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last
from pyspark.sql.functions import countDistinct

# Entry point to the PySpark application: build the session, or reuse the one
# that already exists, and expose it to the following cells as `spark`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [2]:
# IMPORT: load the four JSON datasets into Spark DataFrames.

# Directory (relative to the notebook's working directory) holding the datasets.
DATASETS_DIR = "Datasets"


def read_multiline_json(file_name):
    """Read one JSON file from ``DATASETS_DIR`` into a Spark DataFrame.

    The ``multiline`` reader option is enabled so that a single JSON record
    may span several lines of the file.

    Args:
        file_name: File name relative to ``DATASETS_DIR``,
            e.g. ``"book-db.json"``.

    Returns:
        The DataFrame returned by ``spark.read.json`` for that file.
    """
    return (
        spark.read
        .option("multiline", "true")
        .json(f"{DATASETS_DIR}/{file_name}")
    )


df_book = read_multiline_json("book-db.json")
df_article = read_multiline_json("article-db.json")
df_incollection = read_multiline_json("incollection-db.json")
df_www = read_multiline_json("www-db.json")


In [None]:
# Optional sanity check: uncomment a line to preview the corresponding DataFrame.
#df_article.show()
#df_book.show()
#df_incollection.show()
#df_www.show()


# **1. Top Journals for heterogeneity of topics**
This query finds the most heterogeneous journals, i.e. the ones whose articles cover the widest range of topics. Heterogeneity is measured as the number of distinct keywords attached to the articles a journal has published.
Here we want the top 10 journals, ranked by that number, among those that have published articles related to at least 25 different keywords.

In [29]:
# Minimum number of distinct keywords a journal needs in order to be listed
# (the requirement is "at least 25 different keywords").
MIN_DISTINCT_KEYWORDS = 25

result = (
    df_article
    # One row per (journal, keyword) pair: explode() emits a separate row for
    # each element of the `keyword` column.
    .select(col("journal"), explode("keyword").alias("keyword"))
    .groupBy("journal")
    # Name the aggregate explicitly instead of renaming Spark's auto-generated
    # column afterwards: withColumnRenamed() is a silent no-op when the
    # generated name differs, which would make the filter below fail.
    # `journal` is constant inside each group, so this counts the distinct
    # keywords of that journal.
    .agg(countDistinct("journal", "keyword").alias("number of keywords"))
    # ">=" keeps journals with exactly MIN_DISTINCT_KEYWORDS keywords
    # ("at least"), which a strict ">" would drop.
    .filter(col("number of keywords") >= MIN_DISTINCT_KEYWORDS)
    .sort(col("number of keywords").desc())
)

# Display the ten journals with the most distinct keywords, without
# truncating long journal names.
result.limit(10).show(truncate=False)


+----------------------------+------------------+
|journal                     |number of keywords|
+----------------------------+------------------+
|SIGMOD Rec.                 |45                |
|IEEE Trans. Knowl. Data Eng.|45                |
|IEEE Data Eng. Bull.        |45                |
|ACM SIGMOD Digit. Rev.      |45                |
|IWBS Report                 |45                |
|ACM Trans. Database Syst.   |45                |
|VLDB J.                     |45                |
|LILOG-Report                |44                |
|ACM Comput. Surv.           |44                |
|Commun. ACM                 |28                |
+----------------------------+------------------+

