# Big Data Systems Architecture - Spark Assignment 

In [179]:
#import of libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [180]:
#We create a spark session in order to create an application to exlore the data
spark = SparkSession.builder.appName('Data_Exploration').getOrCreate()

In [181]:
#We load the dataset in a dataframe. We read the file with the .json() function
books = spark.read.json("books_5000.json")
type(books)

pyspark.sql.dataframe.DataFrame

In [182]:
#We use the count function to count the number of the books
print('The number of total books in the dataset is',books.count())

The number of total books in the dataset is 4999


In [183]:
#we run a SQL-like query to find the number of books with the index is_ebook as true
namesDF = spark.sql("SELECT * FROM books WHERE is_ebook = True ");
namesDF.count()

749

In [184]:
#We keep only the 'average_rating' column of each record - book that does not have a null value from an SQL query.
rating_summary = spark.sql("SELECT average_rating from books WHERE average_rating IS NOT NULL").summary()
rating_summary.show()

+-------+-------------------+
|summary|     average_rating|
+-------+-------------------+
|  count|               4999|
|   mean| 3.9112042408481678|
| stddev|0.43444489528688784|
|    min|               1.00|
|    25%|               3.66|
|    50%|               3.98|
|    75%|               4.23|
|    max|               5.00|
+-------+-------------------+



In [185]:
#We keep only the 'format' column of each record - book with the explode and split functions. Then we group by the values of each column and we count the frequency of each one.
formatCounts = books.select(explode(split(books.format, "\s+")) \
.alias("format")) \
.groupBy("format") \
.count()
#with the collect function we print each unique format value along with its frequency in the dataset
formatCounts.collect()

[Row(format='Market', count=64),
 Row(format='Mass', count=64),
 Row(format='Library', count=2),
 Row(format='Paperback', count=2698),
 Row(format='paperback', count=2),
 Row(format='Klappenbroschur', count=1),
 Row(format='with', count=2),
 Row(format='Nook', count=1),
 Row(format='Illustrated', count=2),
 Row(format='Hardcover', count=830),
 Row(format='Issue', count=2),
 Row(format='dust', count=2),
 Row(format='Album', count=2),
 Row(format='Book', count=16),
 Row(format='Webtoon', count=2),
 Row(format='Unknown', count=7),
 Row(format='Comics', count=2),
 Row(format='Bolsillo', count=2),
 Row(format='con', count=3),
 Row(format='hardcover', count=1),
 Row(format='Digital', count=1),
 Row(format='Rustica', count=1),
 Row(format='comic', count=2),
 Row(format='Comic', count=32),
 Row(format='Manga', count=2),
 Row(format='comics', count=1),
 Row(format='Audible', count=1),
 Row(format='Edition', count=41),
 Row(format='Kindle', count=41),
 Row(format='Audio', count=4),
 Row(format='

In [170]:
#We create a global view of the books dataframe, so it can be accessed from any application in different sessions
books.createOrReplaceGlobalTempView("books");