# Práctica Good Reads

In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession
        .builder
        .appName("GoodReads")
        .getOrCreate())

In [2]:
read_file = "C:/Users/xabier.jimenez/Downloads/Goodreads Dataset Project/archive/book*"
df = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(read_file))

In [3]:
from pyspark.sql import functions as F
from os import listdir
from os.path import isfile, join

mypath="C:/Users/xabier.jimenez/Downloads/Goodreads Dataset Project/archive/"
onlyfiles = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]

df = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .option("quote", "\"")
        .option("escape","\"")
        .option("multiLine","true")
        .load(onlyfiles[0]))

onlyfiles=onlyfiles[1:23]

for i in onlyfiles:
    df_aux = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .option("quote", "\"")
        .option("escape","\"")
        .option("multiLine","true").load(i))
    df_aux=df_aux.select('Id','Name','RatingDist1','pagesNumber','RatingDist4','RatingDistTotal','PublishMonth','PublishDay','Publisher','CountsOfReview','PublishYear','Language','Authors','Rating','RatingDist2','RatingDist5','ISBN','RatingDist3')
    df=df.union(df_aux)

In [4]:
(df.write.format("parquet")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/parquet/df_parquet_goodreads"))

file = """/tmp/data/parquet/df_parquet_goodreads"""
df_parquet = spark.read.option("mergeSchema","true").format("parquet").load(file)

In [5]:
print(df_parquet.count())
df_parquet.show(5)

1850310
+-------+--------------------+-----------+-----------+-----------+---------------+------------+----------+--------------------+--------------+-----------+--------+------------------+------+-----------+-----------+----------+-----------+
|     Id|                Name|RatingDist1|pagesNumber|RatingDist4|RatingDistTotal|PublishMonth|PublishDay|           Publisher|CountsOfReview|PublishYear|Language|           Authors|Rating|RatingDist2|RatingDist5|      ISBN|RatingDist3|
+-------+--------------------+-----------+-----------+-----------+---------------+------------+----------+--------------------+--------------+-----------+--------+------------------+------+-----------+-----------+----------+-----------+
|2000004|Geschichte der Pr...|        1:0|        296|       4:24|       total:62|           8|         1|          Kohlhammer|             0|       1998|    null|Joseph Blenkinsopp|  3.66|        2:3|       5:10|3170117742|       3:25|
|2000006|Josué et l'allian...|        1:0|  

## 1.- Rating promedio de todos los libros

In [6]:
from pyspark.sql.functions import avg

sol1= df.select(avg(F.col("rating"))).show()

+-----------------+
|      avg(rating)|
+-----------------+
|2.894242694467414|
+-----------------+



## 2.- Rating promedio de los libros por autor

In [7]:
sol2= df_parquet.groupBy("authors").agg(avg(F.col("rating"))).show()

+--------------------+------------------+
|             authors|       avg(rating)|
+--------------------+------------------+
|    Vera Albuquerque|               4.0|
|       Thierry Lentz|            2.7775|
|       Georges Nania|               0.0|
|        Fred Allison|               0.0|
|    Frances Bellerby|               3.0|
|    Nathaniel Harris|3.3552631578947367|
|       David   Baird| 3.030769230769231|
|      Alison Daniels|              1.53|
|         Ken England|               3.0|
|         Bill Bright|3.3900000000000006|
|        Mary O'Brien|              2.11|
|        John Farndon| 3.167954545454545|
|   Edgar M. Bronfman|             3.165|
|     Louis Althusser| 3.892727272727273|
|Maria Julia Bertomeu|               0.0|
|     Mario Benedetti|3.9457407407407405|
|  The New York Times|2.9677848101265822|
|    Albert J. Schütz|2.6366666666666667|
|      Eloise Jelinek|               0.0|
|      Elizabeth Chan|              3.67|
+--------------------+------------

## 3.- Rating promedio de los libros por Publisher

In [8]:
sol3 = df_parquet.groupBy("publisher").agg(avg(F.col("rating"))).show()

+--------------------+------------------+
|           publisher|       avg(rating)|
+--------------------+------------------+
|           IVP Books|3.7813559322033896|
|    Ycp Publications|              3.95|
|John Benjamins Pu...|1.5338690861957232|
|                 DAW| 3.743121852970796|
|Regina Press Malh...| 3.054615384615385|
| Prospect Books (UK)|2.9373076923076926|
|            Capstone| 2.639642857142857|
|        Lorenz Books|2.9970674157303376|
|       The New Press|3.7651210428305397|
|     Militzke Verlag|               0.0|
|         Cleis Press|   3.7778073089701|
|Arcadia Publishin...| 3.143251088534107|
|      Celestial Arts|3.3254133858267725|
|Chicago Review Press| 3.399660441426147|
|     Dance Books Ltd|  2.64030303030303|
|        Chosen Books|3.7785123966942162|
| Research Press (IL)|2.2773214285714287|
|Civilized Publica...|4.1499999999999995|
| Orange Frazer Press|2.7602631578947374|
|   R.W. Secord Press|              3.67|
+--------------------+------------

## 4.- Número promedio de páginas de todos los libros

In [9]:
sol4 = df_parquet.select(avg(F.col("pagesNumber"))).show()

+------------------+
|  avg(pagesNumber)|
+------------------+
|276.55165080445977|
+------------------+



## 5.- Número promedio de páginas de todos los libros por autor

In [10]:
sol5 = df_parquet.groupBy("authors").agg(avg(F.col("pagesNumber"))).show()

+--------------------+------------------+
|             authors|  avg(pagesNumber)|
+--------------------+------------------+
|    Vera Albuquerque|             472.0|
|       Thierry Lentz|             332.0|
|       Georges Nania|             916.0|
|        Fred Allison|             452.0|
|    Frances Bellerby|             184.0|
|    Nathaniel Harris|112.89473684210526|
|       David   Baird| 383.9230769230769|
|      Alison Daniels|             160.0|
|         Ken England|             409.5|
|         Bill Bright| 271.4107142857143|
|        Mary O'Brien|215.33333333333334|
|        John Farndon| 138.0681818181818|
|   Edgar M. Bronfman|             226.5|
|     Louis Althusser| 306.3333333333333|
|Maria Julia Bertomeu|             199.0|
|     Mario Benedetti| 239.4814814814815|
|  The New York Times| 234.0759493670886|
|    Albert J. Schütz| 306.6666666666667|
|      Eloise Jelinek|             490.0|
|      Elizabeth Chan|              64.0|
+--------------------+------------

## 6.- Número promedio de páginas de todos los libros por Publisher

In [11]:
sol6 = df_parquet.groupBy("publisher").agg(avg(F.col("pagesNumber"))).show()

+--------------------+------------------+
|           publisher|  avg(pagesNumber)|
+--------------------+------------------+
|           IVP Books| 192.8502824858757|
|    Ycp Publications|             280.0|
|John Benjamins Pu...| 325.4698639014906|
|                 DAW| 355.0060422960725|
|Regina Press Malh...| 75.28205128205128|
| Prospect Books (UK)| 281.1923076923077|
|            Capstone|154.03571428571428|
|        Lorenz Books|203.50786516853933|
|       The New Press| 290.0148975791434|
|     Militzke Verlag|             236.0|
|         Cleis Press|222.02325581395348|
|Arcadia Publishin...| 129.7010159651669|
|      Celestial Arts|194.91929133858267|
|Chicago Review Press| 259.4193548387097|
|     Dance Books Ltd|             209.0|
|        Chosen Books|219.05371900826447|
| Research Press (IL)|255.21428571428572|
|Civilized Publica...|200.33333333333334|
| Orange Frazer Press|237.64473684210526|
|   R.W. Secord Press|             471.0|
+--------------------+------------

## 7.- Número promedio de libros publicados por autor

In [12]:
sol7 = df_parquet.groupBy("authors").count().show()

+--------------------+-----+
|             authors|count|
+--------------------+-----+
|    Vera Albuquerque|    1|
|       Thierry Lentz|    8|
|       Georges Nania|    1|
|        Fred Allison|    1|
|    Frances Bellerby|    1|
|    Nathaniel Harris|   19|
|       David   Baird|   13|
|      Alison Daniels|    2|
|         Ken England|    2|
|         Bill Bright|   56|
|        Mary O'Brien|    3|
|        John Farndon|   44|
|   Edgar M. Bronfman|    4|
|     Louis Althusser|   33|
|Maria Julia Bertomeu|    1|
|     Mario Benedetti|   54|
|  The New York Times|  158|
|    Albert J. Schütz|    3|
|      Eloise Jelinek|    1|
|      Elizabeth Chan|    1|
+--------------------+-----+
only showing top 20 rows



## 8.- Ordenar los libros de mayor a menor (Top 15) por número de ratings dados por usuarios (excluir aquellos valores sin rating)

In [13]:
sol8 = df_parquet.groupBy("name").count().orderBy(F.col("count").desc()).show(15)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|     Collected Poems|   24|
|      Selected Poems|   24|
|           Jane Eyre|   24|
|        Little Women|   23|
|              Hamlet|   23|
|             Othello|   22|
|   The Secret Garden|   22|
|A Midsummer Night...|   21|
|     Robinson Crusoe|   21|
|          The Prince|   21|
|   Collected Stories|   21|
|                Emma|   21|
|     Treasure Island|   21|
|       Madame Bovary|   21|
|  The Scarlet Letter|   21|
+--------------------+-----+
only showing top 15 rows



## 9.- Obtener Top 5 de ratings más frecuentes otorgados por usuarios

In [14]:
sol9 = df_parquet.groupBy("rating").count().orderBy(F.col("count").desc()).show(5)

+------+------+
|rating| count|
+------+------+
|   0.0|451783|
|   4.0|151979|
|   3.0| 87288|
|   5.0| 79827|
|   3.5| 45222|
+------+------+
only showing top 5 rows

