In [318]:
# Import the basic spark library

from pyspark.sql import SparkSession

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("schemaBook") \
      .getOrCreate()
# master contains the URL of your remote spark instance or 'local'

In [319]:
#schema book
df_book = spark.read.option("multiline","true").json("book-db-manual.json")
df_book.printSchema()
#df_book.show()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- citations: long (nullable = true)
 |-- crossref: string (nullable = true)
 |-- editor: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- ee: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- isbn: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- key: string (nullable = true)
 |-- keyword: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pages: long (nullable = true)
 |-- publisher: string (nullable = true)
 |-- series: struct (nullable = true)
 |    |-- href: string (nullable = true)
 |    |-- title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- 

In [320]:
#schema article
df_article = spark.read.option("multiline","true").json("article-db-manual_conrefs.json")
df_article.printSchema()
#df_article.show()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- citations: long (nullable = true)
 |-- cite: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ee: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journal: string (nullable = true)
 |-- key: string (nullable = true)
 |-- keyword: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- note: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pages: long (nullable = true)
 |-- publisher: string (nullable = true)
 |-- title: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- year: long (nullable = true)



In [321]:
#schema www
df_www = spark.read.option("multiline","true").json("www-db-manual.json")
df_www.printSchema()
#df_www.show()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- key: string (nullable = true)
 |-- note: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)



0.1.4 VERSION 2: Best PROFESSORS according to number of citations at a given university

The following query can be used to find the authors affiliated with a specific university
(e.g.’University of Maryland, College Park, USA’) ordering them by the average number of citations(at least 5) received by their publications(books, articles) from any article published in the last years(since 1990).

In [322]:
#import
from pyspark.sql.functions import col, first                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import avg, count                # Multiple Aggregations



In [323]:
df_book = df_book  \
    .select(df_book.key, df_book.author, df_book.year)
df_article = df_article  \
    .select(df_article.key, df_article.author, df_article.cite, df_article.year )

In [324]:
#authors affiliated with a specific university
df_www_authors=df_www.filter(array_contains(df_www.note, "University of Maryland, College Park, USA"))\
    .select(explode(df_www.author)).withColumnRenamed("col", "author")
#df_www_authors.printSchema()
#df_www_authors.show(truncate=False)

In [325]:
#union of all the publications
df_pub= df_article.unionByName(df_book, allowMissingColumns=True).select("key", "author")
#df_pub.printSchema()
#df_pub.show(truncate=False)

In [326]:
#filter articles published since 1990
df_article=df_article.filter(col("year")>1990).select(explode("cite")).withColumnRenamed("col", "cite")
#df_article.printSchema()
#df_article.show(truncate=False)

In [327]:
#count number of citations received by each publication from articles published since 1990
df_pub=df_article.join(df_pub, df_article.cite==df_pub.key, "left")\
    .groupBy("key").agg(count("cite").alias("CountOfCitations"),first("author").alias("author"))\
    .select(col("key"), col("CountOfCitations"), explode(col("author")))\
    .withColumnRenamed("col", "author")

In [328]:
result = df_www_authors.join(df_pub, df_www_authors.author == df_pub.author, "left")\
    .groupBy(df_pub.author).agg(avg("CountOfCitations").alias("Avg Cit"))\
    .filter(col("Avg Cit")>="5")\
    .select(col("author.name"), col("Avg Cit")).sort(col("Avg Cit").desc()).show(truncate=False)

+--------------+-------+
|name          |Avg Cit|
+--------------+-------+
|Alan Sussman  |7.0    |
|Louiqa Raschid|6.0    |
|Joel H. Saltz |6.0    |
|Dana S. Nau   |5.0    |
+--------------+-------+

