In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col                       # Filtering using the col() function
from pyspark.sql.functions import array_contains            # Filtering on array columns
from pyspark.sql.functions import explode                   # Explode Arrays in Individual Rows
from pyspark.sql.functions import sum, avg, count, max      # Multiple Aggregations
from pyspark.sql.functions import first, last	

# Build (or reuse, if one is already running) the SparkSession that acts as
# the entry point to the PySpark application, using the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [50]:
# Load the ARTICLE collection from a multiline JSON file (a record may span
# several lines, hence the "multiline" reader option).
df_article = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/article-db.json")
)

# Show the schema Spark inferred for the articles.
df_article.printSchema()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- citations: long (nullable = true)
 |-- cite: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ee: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journal: string (nullable = true)
 |-- key: string (nullable = true)
 |-- keyword: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- note: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pages: long (nullable = true)
 |-- publisher: string (nullable = true)
 |-- title: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- year: long (nullable = true)



In [52]:
# Load the WWW collection from a multiline JSON file and expose its "note"
# column under the name "university".
df_www = (
    spark.read
    .option("multiline", "true")
    .json("Datasets (json)/www-db.json")
    .withColumnRenamed("note", "university")
)

# Show the inferred schema and a preview of the rows.
df_www.printSchema()
df_www.show()

root
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |-- key: string (nullable = true)
 |-- university: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+--------------------+
|              author|                 key|          university|                 url|
+--------------------+--------------------+--------------------+--------------------+
|[{Alejandro Rodri...|  homepages/189/9749|[Western Norway U...|[https://orcid.or...|
|[{Kikuo Fujimura,...|homepages/f/Kikuo...|[Ohio State Unive...|[http://www.cse.o...|
|[{Pierfrancesco F...|homepages/f/Pierf...|                null|[http://garga.iet...|
|[{Farshad Fotouhi...|homepages/f/Farsh...|[Wayne State Univ...|[http://www.cs.wa...|
|[{Ada W

### 1. Most cited AUTHORS in a given field
The following query finds the authors who wrote about a given field (e.g. "data mining"),
together with their website (if one exists), ordered by the sum of citations they
received for those publications.

In [69]:
# Query parameters: the keyword that identifies the field of interest and the
# number of most-cited authors to report.
TARGET_KEYWORD = "data mining"
TOP_N_AUTHORS = 5

# One row per (article, author) pair, restricted to the articles whose
# "keyword" array contains the target keyword.
exploded_df_article = (
    df_article
    .filter(array_contains(col("keyword"), TARGET_KEYWORD))
    .select(
        df_article.key,
        explode(df_article.author).alias("author"),
        df_article.keyword,
        df_article.citations,
    )
)

# One row per (www entry, author) pair. Not used by the query below; kept in
# case other cells of the notebook reference it.
exploded_df_www = df_www.select(
    df_www.key,
    explode(df_www.author).alias("author"),
    df_www.url,
)

# Citation statistics per author, keeping only the TOP_N_AUTHORS authors with
# the highest total. NOTE: sum/avg/count/max are the pyspark.sql.functions
# aggregates imported at the top of the notebook, not the Python builtins.
df_top_author = (
    exploded_df_article
    .groupBy("author")
    .agg(
        sum("citations").alias("Sum of Citations"),
        avg("citations").alias("Average Citations"),
        count("citations").alias("Number of Paper"),
        max("citations").alias("Max Citations"),
    )
    .sort(col("Sum of Citations").desc())
    .limit(TOP_N_AUTHORS)
)

# Author -> website lookup built from the WWW collection (one row per author
# of each www entry).
df_website = df_www.select(
    explode(df_www.author).alias("author_w"),
    df_www.url.alias("website"),
)

# Attach the website (null when the author has no www entry) to each top
# author. A join does not guarantee that the row order of its inputs is
# preserved, so the ranking is re-applied after the join.
result = (
    df_top_author
    .join(df_website, df_top_author.author == df_website.author_w, "left")
    .select(
        col("author"),
        col("website"),
        col("Sum of Citations"),
        col("Average Citations"),
        col("Number of Paper"),
        col("Max Citations"),
    )
    .sort(col("Sum of Citations").desc())
)

result.printSchema()
result.show()

root
 |-- author: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- orcid: string (nullable = true)
 |-- website: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Sum of Citations: long (nullable = true)
 |-- Average Citations: double (nullable = true)
 |-- Number of Paper: long (nullable = false)
 |-- Max Citations: long (nullable = true)

+--------------------+--------------------+----------------+-----------------+---------------+-------------+
|              author|             website|Sum of Citations|Average Citations|Number of Paper|Max Citations|
+--------------------+--------------------+----------------+-----------------+---------------+-------------+
|{Clement T. Yu, C...|[http://www.cs.ui...|             231|             77.0|              3|          138|
|{Kenneth C. Sevci...|[http://www.cs.ut...|             229|            114.5|              2|          126|
|    {K. Lam, K. Lam}|                null|             20