# Ex1 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv).

In [36]:
# Tab-separated orders file fetched straight from the justmarkham/DAT8 repository.
CHIPOTLE_URL = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
df = pd.read_csv(CHIPOTLE_URL, sep="\t")
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


### Step 3. Assign it to a variable called chipo.

In [37]:
# Take an independent copy rather than a second name for the same object:
# later cells modify `chipo` (e.g. the item_price conversion), and with a
# plain alias those edits would also change `df`, so re-running this cell
# would not give back the raw data.
chipo = df.copy()
chipo.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [38]:
# Summarize column dtypes and non-null counts before any cleaning is applied.
chipo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


### Step 4. How many products cost more than $10.00?

In [39]:
# Convert item_price from strings such as "$2.39" to floats.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# there is nothing left to strip, and slicing a float would raise a TypeError.
if not pd.api.types.is_numeric_dtype(chipo["item_price"]):
    chipo["item_price"] = (
        chipo["item_price"]
        .str.strip()        # tolerate stray whitespace around the value
        .str.lstrip("$")    # remove the currency symbol only, not an arbitrary first character
        .astype(float)
    )
chipo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   order_id            4622 non-null   int64  
 1   quantity            4622 non-null   int64  
 2   item_name           4622 non-null   object 
 3   choice_description  3376 non-null   object 
 4   item_price          4622 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 180.7+ KB


In [40]:
# Keep single-quantity rows priced above $10 and list the distinct item names.
price_filter = chipo["item_price"].gt(10.0) & chipo["quantity"].eq(1)
chipo.loc[price_filter, "item_name"].unique()

array(['Chicken Bowl', 'Steak Burrito', 'Chicken Burrito',
       'Barbacoa Bowl', 'Veggie Burrito', 'Veggie Bowl',
       'Chicken Soft Tacos', 'Steak Bowl', 'Carnitas Burrito',
       'Carnitas Bowl', 'Barbacoa Burrito', 'Barbacoa Crispy Tacos',
       'Veggie Salad Bowl', 'Chicken Salad', 'Chicken Crispy Tacos',
       'Steak Salad Bowl', 'Veggie Soft Tacos', 'Barbacoa Soft Tacos',
       'Carnitas Crispy Tacos', 'Carnitas Salad Bowl',
       'Chicken Salad Bowl', 'Barbacoa Salad Bowl', 'Steak Soft Tacos',
       'Carnitas Soft Tacos', 'Steak Crispy Tacos'], dtype=object)

In [41]:
# Number of distinct item names that pass the price filter.
chipo.loc[price_filter, "item_name"].nunique(dropna=False)

25

### Step 5. What is the price of each item?
###### Print a data frame with only two columns: `item_name` and `item_price`

In [42]:
# Mean price of each item, computed over single-quantity rows only
# (item_price appears to scale with quantity, so these rows presumably
# carry the per-unit price), rounded to cents and sorted from cheapest up.
(
    chipo.loc[chipo["quantity"] == 1]
    .groupby("item_name")
    .agg({"item_price": "mean"})
    .reset_index()
    .round({"item_price": 2})
    .rename(columns={"item_price": "item_mean_price"})
    .sort_values(by="item_mean_price")
)

Unnamed: 0,item_name,item_mean_price
9,Canned Soda,1.09
10,Canned Soft Drink,1.25
6,Bottled Water,1.44
37,Side of Chips,1.69
23,Chips,2.15
32,Chips and Tomatillo-Red Chili Salsa,2.39
31,Chips and Tomatillo-Green Chili Salsa,2.39
28,Chips and Roasted Chili-Corn Salsa,2.39
24,Chips and Fresh Tomato Salsa,2.74
30,Chips and Tomatillo Red Chili Salsa,2.95


### Step 6. Sort by the name of the item

In [43]:
# Rebind `chipo` to the sorted result instead of sorting in place:
# `inplace=True` mutates the frame object that other names may still refer to
# and prevents method chaining, while offering no benefit here.
chipo = chipo.sort_values(by="item_name")
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
3389,1360,2,6 Pack Soft Drink,[Diet Coke],12.98
341,148,1,6 Pack Soft Drink,[Diet Coke],6.49
1849,749,1,6 Pack Soft Drink,[Coke],6.49
1860,754,1,6 Pack Soft Drink,[Diet Coke],6.49
2713,1076,1,6 Pack Soft Drink,[Coke],6.49
...,...,...,...,...,...
2384,948,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",8.75
781,322,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Black Beans, Cheese, Sou...",8.75
2851,1132,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa (Medium), [Black Bea...",8.49
1699,688,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",11.25


### Step 7. What was the quantity of the most expensive item ordered?

In [47]:
# Sort ascending by price and keep the final row (the highest item_price)
# as a one-row DataFrame.
most_expensive = chipo.sort_values(by="item_price").tail(1)
most_expensive

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
3598,1443,15,Chips and Fresh Tomato Salsa,,44.25


In [48]:
# Show just the quantity column of the most expensive row, kept as a DataFrame.
most_expensive[["quantity"]]

Unnamed: 0,quantity
3598,15


### Step 8. How many times was a Veggie Salad Bowl ordered?

In [49]:
# Case-insensitive match on the item name; count the matching rows.
veggie_filter = chipo["item_name"].str.contains("Veggie Salad Bowl", case=False)
len(chipo.loc[veggie_filter])

18

### Step 9. How many times did someone order more than one Canned Soda?

In [50]:
# Rows whose item name matches "Canned Soda" (case-insensitive) and whose
# quantity is greater than one; count how many such rows exist.
canned_soda_filter = (
    chipo["item_name"].str.contains("Canned Soda", case=False)
    & chipo["quantity"].gt(1)
)
len(chipo.loc[canned_soda_filter])

20