# Ex2 - Getting and Knowing your Data

### Step 1. Import the necessary libraries

In [None]:
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.functions import *
import pandas as pd
from pyspark.sql.types import *

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv).

In [None]:
# Configure a session builder with the application name "ChipotleData",
# then obtain the session from it (getOrCreate returns an existing session
# when one is already running).
session_builder = SparkSession.builder.appName("ChipotleData")
spark = session_builder.getOrCreate()

In [None]:
# Location of the tab-separated Chipotle orders file.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

# Download and parse the file with pandas; the columns are tab-delimited.
chipo_pd = pd.read_csv(url, sep="\t")

# Hand the pandas DataFrame over to Spark.
# NOTE: rows that have no choice_description are displayed as "NaN" in the
# Spark output of the following cells.
chipo_df = spark.createDataFrame(chipo_pd)

### Step 3. Assign it to a variable called chipo.

In [None]:
# Expose the Spark DataFrame under the name `chipo` requested by the exercise.
# NOTE(review): the cells below keep using `chipo_df`, and Step 13.b rebinds
# it, so `chipo` keeps referring to the DataFrame as it was loaded here.
chipo = chipo_df

### Step 4. See the first 10 entries

In [None]:
# Restrict the DataFrame to its first ten rows and print them as a table.
first_ten_rows = chipo_df.limit(10)
first_ten_rows.show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                 NaN|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                 NaN|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                 NaN|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
+--------+--------+--------------------+--------------------+----------+

### Step 5. What is the number of observations in the dataset?

In [None]:

# The number of observations is the number of rows in the DataFrame.
row_total = chipo_df.count()
row_total


4622

### Step 6. What is the number of columns in the dataset?

In [None]:
# `columns` holds one name per column, so its length is the column count.
column_names = chipo_df.columns
len(column_names)

5

### Step 7. Print the name of all the columns.

In [None]:
# List the column names in DataFrame order.
column_names = chipo_df.columns
column_names

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

### Step 8. How is the dataset indexed?

In [None]:
# Number the rows in order_id order and attach that number as an explicit
# "index" column. The result is not assigned, so chipo_df itself is left
# unchanged; the cell only displays the resulting DataFrame description.
order_window = Window.orderBy('order_id')
row_index = row_number().over(order_window)
chipo_df.withColumn('index', row_index)

DataFrame[order_id: bigint, quantity: bigint, item_name: string, choice_description: string, item_price: string, index: int]

### Step 9. Which was the most-ordered item?

In [None]:
# Count the rows recorded for each item, then keep the item with the
# highest count.
orders_per_item = chipo_df.groupBy('item_name').agg(
    count('item_name').alias('Num_of_orders')
)
top_item = orders_per_item.orderBy('Num_of_orders', ascending=False).limit(1)
top_item.show()

+------------+-------------+
|   item_name|Num_of_orders|
+------------+-------------+
|Chicken Bowl|          726|
+------------+-------------+



### Step 10. For the most-ordered item, how many items were ordered?

In [None]:
# Add up the ordered quantity for every item ...
quantity_by_item = chipo_df.groupBy("item_name").agg(
    sum("quantity").alias("total_quantity")
)

# ... and keep only the item with the largest total.
most_ordered_item = quantity_by_item.orderBy(
    "total_quantity", ascending=False
).limit(1)

most_ordered_item.show()


+------------+--------------+
|   item_name|total_quantity|
+------------+--------------+
|Chicken Bowl|           761|
+------------+--------------+



### Step 11. What was the most ordered item in the choice_description column?

In [None]:
# Rows without a choice description surface as the placeholder "NaN"
# (or as null). Left in, that placeholder forms the largest group and is
# reported as the "most ordered choice", so drop those rows first.
described_orders = chipo_df.filter(
    col('choice_description').isNotNull()
    & (col('choice_description') != 'NaN')
)

# Sum the quantity per remaining description and keep the largest total.
most_ordered_choice = (
    described_orders
    .groupBy('choice_description')
    .agg(sum('quantity').alias('total_quantity'))
    .orderBy('total_quantity', ascending=False)
    .limit(1)
)
most_ordered_choice.show()

+------------------+--------------+
|choice_description|total_quantity|
+------------------+--------------+
|               NaN|          1382|
+------------------+--------------+



### Step 12. How many items were ordered in total?

In [None]:
# Sum the quantity column over the whole dataset.
quantity_sum = sum('quantity').alias('Total_Items_Ordered')
total_items = chipo_df.agg(quantity_sum)
total_items.show()

+-------------------+
|Total_Items_Ordered|
+-------------------+
|               4972|
+-------------------+



### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [None]:
# Print every column's name, data type and nullability so the current type
# of item_price can be read off.
chipo_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- quantity: long (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



#### Step 13.b. Create a lambda function and change the type of item price

In [None]:

# Lambda-based UDF that turns a price such as "$2.39 " into a double.
# - None and empty strings map to None (null in Spark).
# - str(x) keeps the cell re-runnable: once item_price is already numeric
#   the value is a float, and calling .replace on it directly would raise
#   AttributeError.
# - Thousands separators are removed so values like "$1,234.50" parse.
to_double = udf(
    lambda x: (
        None
        if x is None or x == ''
        else float(str(x).replace('$', '').replace(',', '').strip())
    ),
    DoubleType(),
)

# Replace the item_price column with its converted values.
chipo_df = chipo_df.withColumn('item_price', to_double('item_price'))

#### Step 13.c. Check the item price type

In [None]:
# Print the schema again to check the type now reported for item_price.
chipo_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- quantity: long (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: double (nullable = true)



In [None]:
# Display the DataFrame with show()'s default settings written out:
# at most 20 rows, long cell values truncated.
chipo_df.show(n=20, truncate=True)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                 NaN|      2.39|
|       1|       1|                Izze|        [Clementine]|      3.39|
|       1|       1|    Nantucket Nectar|             [Apple]|      3.39|
|       1|       1|Chips and Tomatil...|                 NaN|      2.39|
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|     16.98|
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|     10.98|
|       3|       1|       Side of Chips|                 NaN|      1.69|
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|     11.75|
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|      9.25|
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|      9.25|
|       5|       1| Chips and Guacamole|           

### Step 14. How much was the revenue for the period in the dataset?

In [None]:
# Revenue for the period: add up item_price across every row.
price_sum = sum('item_price').alias('Total_revenue')
total_revenue = chipo_df.agg(price_sum)
total_revenue.show()

+-----------------+
|    Total_revenue|
+-----------------+
|34500.15999999998|
+-----------------+



### Step 15. How many orders were made in the period?

In [None]:
# Several rows can share one order_id, so count the distinct ids rather
# than the rows.
distinct_orders = countDistinct('order_id').alias('unique_orders')
order_count = chipo_df.agg(distinct_orders)
order_count.show()

+-------------+
|unique_orders|
+-------------+
|         1834|
+-------------+



### Step 16. What is the average revenue amount per order?

In [None]:
# First total the item prices within each order ...
revenue_per_order = (
    chipo_df
    .groupBy('order_id')
    .agg(sum('item_price').alias('order_total'))
)

# ... then average those per-order totals.
mean_order_total = avg('order_total').alias('avg_revenue_per_order')
revenue_per_order.agg(mean_order_total).show()

+---------------------+
|avg_revenue_per_order|
+---------------------+
|   18.811428571428717|
+---------------------+



### Step 17. How many different items are sold?

In [None]:
# Count the distinct item names. No alias is set, so the column header
# generated by Spark is kept.
distinct_items = chipo_df.agg(countDistinct('item_name'))
distinct_items.show()

+-------------------------+
|count(DISTINCT item_name)|
+-------------------------+
|                       50|
+-------------------------+

