# Dataframe feladatok

In [1]:
from pyspark.sql import  *
from pyspark.sql.functions import *

# Obtain the notebook's SparkSession: reuses an already-active session if there is one,
# otherwise builds a new one with the builder's default configuration.
spark = SparkSession.builder.getOrCreate()

## 3. feladat

In [2]:
# Load books.csv: the first row supplies the column names and Spark infers the column types.
book_df = spark.read.options(header=True, inferSchema=True).csv('books.csv')

# NOTE(review): the schema printed by this cell reports every column, including rowID and
# itemID, as string - presumably some rows contain non-numeric values; verify the CSV parses
# cleanly (quoting / embedded newlines).
book_df.printSchema()

root
 |-- rowID: string (nullable = true)
 |-- itemID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publisher: string (nullable = true)


In [3]:
# Register book_df as the temporary view BOOKS so the spark.sql queries below can reference it;
# an existing view with the same name is replaced.
book_df.createOrReplaceTempView('BOOKS')

In [4]:
# Task 3: the three authors with the highest number of distinct itemIDs in BOOKS,
# ignoring rows whose author is NULL.
top_authors_sql = """
    SELECT author, COUNT(DISTINCT itemID) as cnt
    FROM BOOKS
    WHERE author IS NOT NULL
    GROUP BY author
    ORDER BY cnt DESC
    LIMIT 3
    """
top_authors_df = spark.sql(top_authors_sql)
top_authors_df.show()

+--------------------+----+
|              author| cnt|
+--------------------+----+
|     Garcia Santiago|1479|
|Shelley Admont, K...| 228|
|       James Manning| 180|
+--------------------+----+


## 4. feladat

In [5]:
# Task 4: authors (NULL authors excluded) that appear with more than 35 distinct publishers.
# No ORDER BY is given, so the row order of the output is not guaranteed.
multi_publisher_sql = """
    SELECT author, count(distinct publisher) cnt
    FROM BOOKS
    WHERE author IS NOT NULL
    GROUP BY author
    HAVING cnt > 35
    """
multi_publisher_df = spark.sql(multi_publisher_sql)
multi_publisher_df.show()

+--------------------+---+
|              author|cnt|
+--------------------+---+
|       Lewis Carroll| 43|
|         Jules Verne| 37|
|         H. G. Wells| 52|
|Frances Hodgson B...| 38|
+--------------------+---+


## 5. feladat

In [6]:
# Load orders.csv: the first row supplies the column names and Spark infers the column types.
order_df = spark.read.options(header=True, inferSchema=True).csv('orders.csv')

# Print the inferred schema to confirm the column names and types before querying.
order_df.printSchema()

root
 |-- rowID: integer (nullable = true)
 |-- sessionID: integer (nullable = true)
 |-- itemID: integer (nullable = true)
 |-- click: integer (nullable = true)
 |-- basket: integer (nullable = true)
 |-- order: integer (nullable = true)


In [7]:
# Register order_df as the temporary view ORDERS so the spark.sql queries below can reference it;
# an existing view with the same name is replaced.
order_df.createOrReplaceTempView('ORDERS')

In [8]:
# Task 5: average number of clicks per session.
# The inner query sums `click` for each sessionID; the outer query averages those per-session sums.
avg_session_clicks_sql = """
    SELECT AVG(sessionClick)
    FROM (
    SELECT sum(click) as sessionClick
    FROM ORDERS
    GROUP BY sessionID)
    """
avg_session_clicks_df = spark.sql(avg_session_clicks_sql)
avg_session_clicks_df.show()

+-----------------+
|avg(sessionClick)|
+-----------------+
|1.655570384913763|
+-----------------+


## 6. feladat

In [12]:
# Task 6: the single (author, title) pair with the highest total of the `order` column,
# summed over all ORDERS rows joined to BOOKS on itemID.
#
# The column is back-quoted because ORDER is a reserved SQL keyword: the unquoted form
# ORDERS.ORDER only parses while Spark's reserved-keyword enforcement is disabled, and it
# reads confusingly next to the ORDER BY clause. Quoting does not change the result.
#
# NOTE(review): ORDERS.itemID is inferred as integer while BOOKS.itemID is inferred as string,
# so this join relies on Spark's implicit cast - confirm no intended rows are dropped.
# NOTE(review): rows are grouped by (title, author), not by itemID, so distinct items sharing
# a title and author are counted together - confirm this is the intended reading of the task.
spark.sql(
    """
    SELECT author, title, SUM(ORDERS.`order`) AS cnt
    FROM ORDERS
    JOIN BOOKS ON ORDERS.itemID = BOOKS.itemID
    GROUP BY title, author
    ORDER BY cnt DESC
    LIMIT 1
    """
).show()

+------------------+--------------------+---+
|            author|               title|cnt|
+------------------+--------------------+---+
|Andreas Steinhöfel|Rico, Oskar und d...|170|
+------------------+--------------------+---+
