# Ex1 - Filtering and Sorting Data

In [20]:
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.functions import *
import pandas as pd
from pyspark.sql.types import *

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv).

In [21]:
spark = SparkSession.builder.appName("ChipotleData").getOrCreate()

### Step 3. Assign it to a variable called chipo.

In [22]:
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
chipo_pd = pd.read_csv(url, sep="\t")
chipo_df = spark.createDataFrame(chipo_pd)

In [23]:
chipo_df.show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                 NaN|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                 NaN|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                 NaN|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
|       5|       1| Chips and Guacamole|           

In [24]:
chipo_df = chipo_df.withColumn(
    "item_price",
    regexp_replace(col("item_price"), "[$]", "").cast("float")
)

### Step 4. How many products cost more than $10.00?

In [26]:
chipo_df.drop_duplicates(['item_name']).filter(col('item_price')>10.00).show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|      19|       1|       Barbacoa Bowl|[Roasted Chili Co...|     11.75|
|      75|       1|Barbacoa Crispy T...|[Tomatillo Red Ch...|     11.75|
|     501|       1| Barbacoa Salad Bowl|[Fresh Tomato Sal...|     11.89|
|     193|       3|                Bowl|[Braised Carnitas...|      22.2|
|     468|       1| Carnitas Salad Bowl|[Fresh Tomato Sal...|     11.89|
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|     16.98|
|     109|       1|       Chicken Salad|[Roasted Chili Co...|     10.98|
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|     11.75|
|     123|       2|    Steak Salad Bowl|[Tomatillo Red Ch...|     23.78|
|      28|       1|         Veggie Bowl|[Fresh Tomato Sal...|     11.25|
|      26|       1|      Veggie Burrito|[Tomatillo 

### Step 5. What is the price of each item?
###### print a data frame with only two columns item_name and item_price

In [37]:
each_item_price = chipo_df.select(col('item_name'),col('item_price')).drop_duplicates(['item_name'])
each_item_price.show()

+--------------------+----------+
|           item_name|item_price|
+--------------------+----------+
| Carnitas Soft Tacos|      9.25|
|  Chicken Soft Tacos|      8.75|
|               Salad|       7.4|
|         Steak Salad|      8.99|
|                Bowl|      22.2|
| Veggie Crispy Tacos|      8.49|
|   6 Pack Soft Drink|      6.49|
|Chicken Crispy Tacos|      8.75|
|   Veggie Soft Tacos|     11.25|
| Carnitas Salad Bowl|     11.89|
|     Chicken Burrito|      8.49|
|   Veggie Salad Bowl|     11.25|
|         Veggie Bowl|     11.25|
|       Steak Burrito|     11.75|
|  Steak Crispy Tacos|      9.25|
|   Canned Soft Drink|      1.25|
|        Chicken Bowl|     16.98|
|       Side of Chips|      1.69|
|                Izze|      3.39|
|  Chicken Salad Bowl|      8.75|
+--------------------+----------+
only showing top 20 rows



In [38]:
each_item_price.select('item_name').count()

50

### Step 6. Sort by the name of the item

In [40]:
chipo_df.sort(['item_name']).drop_duplicates(['item_name']).show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|     129|       1|   6 Pack Soft Drink|            [Sprite]|      6.49|
|      19|       1|       Barbacoa Bowl|[Roasted Chili Co...|     11.75|
|      11|       1|    Barbacoa Burrito|[[Fresh Tomato Sa...|      8.99|
|      75|       1|Barbacoa Crispy T...|[Tomatillo Red Ch...|     11.75|
|     501|       1| Barbacoa Salad Bowl|[Fresh Tomato Sal...|     11.89|
|      26|       1| Barbacoa Soft Tacos|[Fresh Tomato Sal...|      9.25|
|      17|       1|       Bottled Water|                 NaN|      1.09|
|     193|       3|                Bowl|[Braised Carnitas...|      22.2|
|     214|       1|             Burrito|[Adobo-Marinated ...|       7.4|
|       9|       2|         Canned Soda|            [Sprite]|      2.18|
|     114|       1|   Canned Soft Drink|           

### Step 7. What was the quantity of the most expensive item ordered?

In [49]:
chipo_df.orderBy('item_price',ascending =False).show(1)

+--------+--------+--------------------+------------------+----------+
|order_id|quantity|           item_name|choice_description|item_price|
+--------+--------+--------------------+------------------+----------+
|    1443|      15|Chips and Fresh T...|               NaN|     44.25|
+--------+--------+--------------------+------------------+----------+
only showing top 1 row



### Step 8. How many times was a Veggie Salad Bowl ordered?

In [54]:
chipo_df.filter(col('item_name') == 'Veggie Salad Bowl').agg(sum('quantity')).show()

+-------------+
|sum(quantity)|
+-------------+
|           18|
+-------------+



### Step 9. How many times did someone order more than one Canned Soda?

In [63]:
chipo_df.filter((col('item_name') == 'Canned Soda') & (col('quantity') == 1)).count()

84