# Dataframe feladatok

In [1]:
from pyspark.sql import  *
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

In [4]:
menu_df = spark.read.option('header', True).option('inferschema', True).csv('menu.csv')
menu_df.show(5)

+--------+--------------------+------------+--------+-----------------+---------+-------------------------+-------------+-----------------------------+---------+-----------+---------------------------+------+----------------------+-------------+-----------------------------+-------------+-----------------------------+------+-------+-------------------------+-------------------------+-----------------------+--------------------+
|Category|                Item|Serving Size|Calories|Calories from Fat|Total Fat|Total Fat (% Daily Value)|Saturated Fat|Saturated Fat (% Daily Value)|Trans Fat|Cholesterol|Cholesterol (% Daily Value)|Sodium|Sodium (% Daily Value)|Carbohydrates|Carbohydrates (% Daily Value)|Dietary Fiber|Dietary Fiber (% Daily Value)|Sugars|Protein|Vitamin A (% Daily Value)|Vitamin C (% Daily Value)|Calcium (% Daily Value)|Iron (% Daily Value)|
+--------+--------------------+------------+--------+-----------------+---------+-------------------------+-------------+---------------

In [5]:
menu_df.schema

StructType([StructField('Category', IntegerType(), True), StructField('Item', StringType(), True), StructField('Serving Size', StringType(), True), StructField('Calories', IntegerType(), True), StructField('Calories from Fat', IntegerType(), True), StructField('Total Fat', DoubleType(), True), StructField('Total Fat (% Daily Value)', IntegerType(), True), StructField('Saturated Fat', DoubleType(), True), StructField('Saturated Fat (% Daily Value)', IntegerType(), True), StructField('Trans Fat', DoubleType(), True), StructField('Cholesterol', IntegerType(), True), StructField('Cholesterol (% Daily Value)', IntegerType(), True), StructField('Sodium', IntegerType(), True), StructField('Sodium (% Daily Value)', IntegerType(), True), StructField('Carbohydrates', IntegerType(), True), StructField('Carbohydrates (% Daily Value)', IntegerType(), True), StructField('Dietary Fiber', IntegerType(), True), StructField('Dietary Fiber (% Daily Value)', IntegerType(), True), StructField('Sugars', Int

In [10]:
menu_df.createTempView('menu')

# 3. feladat
Melyek azok az ételek, amelyek meghaladják az ajánlott napi zsír bevitelt? (`Total Fat (% Daily Value)`)

Elvárt oszlopok: `[Item]`

In [36]:
d3 = menu_df\
    .where(menu_df['Total Fat (% Daily Value)'] > 100)\
    .select(menu_df['Item'])
      
d3.show(5)

+--------------------+
|                Item|
+--------------------+
|Chicken McNuggets...|
+--------------------+


In [32]:
s3 = spark.sql(
    """
    SELECT Item
    FROM menu
    WHERE `Total Fat (% Daily Value)` > 100
    """
)
      
s3.show(5)

+--------------------+
|                Item|
+--------------------+
|Chicken McNuggets...|
+--------------------+


In [37]:
d3.sameSemantics(s3)

True

# 4. feladat
Melyik ételnek van a maximális `Sugars` értéke?
Elvárt oszlopok: `[Item, Sugars]`

In [21]:
d4 = menu_df\
    .select(['Item', 'Sugars'])\
    .orderBy(desc('Sugars'))\
    .limit(1)

d4.show()

+--------------------+------+
|                Item|Sugars|
+--------------------+------+
|McFlurry with M&M...|   128|
+--------------------+------+


In [24]:
s4 = spark.sql(
    """
    SELECT Item, Sugars
    FROM menu
    ORDER BY Sugars
    DESC LIMIT 1
    """
)

s4.show()

+--------------------+------+
|                Item|Sugars|
+--------------------+------+
|McFlurry with M&M...|   128|
+--------------------+------+


In [25]:
s4.sameSemantics(d4)

True

## 5. feladat
Hány elem van kategóriánként? Rendezzük csökkenő sorrendbe és adjuk meg a kategóriák nevét is.

Elvárt oszlopok: `[Name, Cnt]`

In [40]:
category_df = spark.read\
    .option('header', True)\
    .option('inferschema', True)\
    .csv('menuCategory.csv')

category_df.schema

StructType([StructField('Id', IntegerType(), True), StructField('Name', StringType(), True)])

In [42]:
category_df.createTempView('category')

In [62]:
s5 = spark.sql(
    """
    SELECT Name, count
    FROM ((SELECT Category, COUNT(*) as count FROM menu GROUP BY Category) JOIN category ON Category = Id)
    """
)

s5.show()

+------------------+-----+
|              Name|count|
+------------------+-----+
|         Breakfast|   42|
|          Desserts|    7|
|    Chicken & Fish|   27|
|    Snacks & Sides|   13|
|Smoothies & Shakes|   28|
|            Salads|    6|
|      Coffee & Tea|   95|
|         Beverages|   27|
|       Beef & Pork|   15|
+------------------+-----+


In [63]:
d5 = menu_df\
    .groupBy('Category')\
    .count()\
    .join(category_df, col('Category') == category_df['Id'])\
    .select(['Name', 'count'])

d5.show()

+------------------+-----+
|              Name|count|
+------------------+-----+
|         Breakfast|   42|
|          Desserts|    7|
|    Chicken & Fish|   27|
|    Snacks & Sides|   13|
|Smoothies & Shakes|   28|
|            Salads|    6|
|      Coffee & Tea|   95|
|         Beverages|   27|
|       Beef & Pork|   15|
+------------------+-----+


In [64]:
d5.sameSemantics(s5)

True

## 6. feladat
Átlagosan mennyi kalóriát tartalmaznak az egyes kategóriák? Adjuk meg a kategória nevét és rendezzük átlag alapján csökkenő sorrendbe. A `Coffee and Tea` és a 
`Beverages` kategóriákat ne vegyük figyelembe.

Elvárt oszlopok: `[Name, AvgCal]`

In [81]:
s6 = spark.sql(
    """
    SELECT NAME, avg FROM
    (
    SELECT Category, AVG(Calories) as avg
    FROM menu
    GROUP BY CATEGORY
    )
    JOIN
    (
    SELECT id, name
    FROM CATEGORY
    WHERE (name != 'Coffee & Tea' AND name != 'Beverages')
    )
    ON category = id
    """
)

s6.show()

+------------------+------------------+
|              NAME|               avg|
+------------------+------------------+
|         Breakfast| 526.6666666666666|
|          Desserts|222.14285714285714|
|    Chicken & Fish| 552.9629629629629|
|    Snacks & Sides|245.76923076923077|
|Smoothies & Shakes| 531.4285714285714|
|            Salads|             270.0|
|       Beef & Pork|             494.0|
+------------------+------------------+


In [94]:
d6 = menu_df.groupBy('Category')\
    .agg(avg('Calories').alias('avg'))\
    .join(category_df.where((col('Name') != 'Beverages') & (col('Name') != 'Coffee & Tea')), col('Category') == col('Id')).select('Name', 'avg')

d6.show()

+------------------+------------------+
|              Name|               avg|
+------------------+------------------+
|         Breakfast| 526.6666666666666|
|          Desserts|222.14285714285714|
|    Chicken & Fish| 552.9629629629629|
|    Snacks & Sides|245.76923076923077|
|Smoothies & Shakes| 531.4285714285714|
|            Salads|             270.0|
|       Beef & Pork|             494.0|
+------------------+------------------+


In [95]:
d6.sameSemantics(s6) # ezt meg se próbálom inkább azonos fizikai tervre kihozni

False