In [44]:
! pip install qeds
import pandas as pd
import numpy as np
import qeds



# Cleaning Data - Exercises

## Exercise 1

**Convert the string below into a number.**

In [45]:
# Raw value: digits prefixed with a "#" marker.
c2n = "#39"

# Split on the marker and glue the pieces back together, which drops every "#".
new_c2n = "".join(c2n.split("#"))

# The cleaned string now parses as a number.
print(float(new_c2n))

39.0


## Exercises 2-3

**For these exercises, we create the following DataFrame:**

In [46]:
# Practice frame for Exercises 2-3. "numbers" holds "#"-prefixed digit
# strings; "nums" mixes digit strings with a missing value (NaN) and a
# non-numeric entry ("XYZ").
df = pd.DataFrame(
    dict(
        numbers=["#" + digits for digits in "23 24 18 14 12 10 35".split()],
        nums=["23", "24", "18", "14", np.nan, "XYZ", "35"],
        colors="green red yellow orange purple blue pink".split(),
        other_column=[0, 1, 0, 2, 1, 0, 2],
    )
)
df.head()

Unnamed: 0,numbers,nums,colors,other_column
0,#23,23.0,green,0
1,#24,24.0,red,1
2,#18,18.0,yellow,0
3,#14,14.0,orange,2
4,#12,,purple,1


## Exercise 2

   **Make a new column called `colors_upper` that contains the elements of `colors` with all uppercase letters.**

In [47]:
# Upper-case every colour name through the vectorised string accessor and
# store the result alongside the original column.
df["colors_upper"] = df.loc[:, "colors"].str.upper()
df.head()

# Note: to capitalise only the first letter of each name, the accessor's
# .str.capitalize() can be used in place of .str.upper().

Unnamed: 0,numbers,nums,colors,other_column,colors_upper
0,#23,23.0,green,0,GREEN
1,#24,24.0,red,1,RED
2,#18,18.0,yellow,0,YELLOW
3,#14,14.0,orange,2,ORANGE
4,#12,,purple,1,PURPLE


## Exercise 3

1. **Convert the column `"nums"` to a numeric type using `pd.to_numeric` and save it to the DataFrame as `"nums_tonumeric"`.**

    - Notice that there is a missing value, and a value that is not a number. Look at the documentation for `pd.to_numeric` and think about how to overcome this.

In [48]:
# errors="coerce" tells pd.to_numeric to turn any entry it cannot parse
# (here "XYZ") into NaN instead of raising; the existing missing value stays
# NaN and the digit strings become floats.
df["nums_tonumeric"] = df["nums"].pipe(pd.to_numeric, errors="coerce")
df

Unnamed: 0,numbers,nums,colors,other_column,colors_upper,nums_tonumeric
0,#23,23,green,0,GREEN,23.0
1,#24,24,red,1,RED,24.0
2,#18,18,yellow,0,YELLOW,18.0
3,#14,14,orange,2,ORANGE,14.0
4,#12,,purple,1,PURPLE,
5,#10,XYZ,blue,0,BLUE,
6,#35,35,pink,2,PINK,35.0


2. **Think about why this could be a bad idea if used without knowing what your data looks like. (Think about what happens when you apply it to the `"numbers"` column before replacing the `"#"`.)**

In [49]:
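# Applied to "numbers" before removing the "#", errors='coerce' would silently
# turn every value into NaN: no error is raised, so we would lose the whole
# column without noticing that the data only needed a small cleaning step.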

## Exercise 4

**For this exercise, we use data from an article written by The Upshot at the NYTimes, which has order information from almost 2,000 Chipotle orders and includes information on what was ordered and how much it cost.**

In [50]:
# Fetch the dataset registered as "chipotle_raw" through the qeds data loader
# and preview its first rows. item_price arrives as "$"-prefixed text (see the
# preview), so it needs cleaning before any arithmetic.
chipotle = qeds.data.load("chipotle_raw")
chipotle.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


1. **Make sure the `item_price` column has a numeric `dtype` (probably float).**

In [51]:
# Strip the literal "$" and convert the remaining text to a number.
# regex=False is explicit on purpose: as a regular expression "$" is the
# end-of-string anchor, and the default value of `regex` has differed between
# pandas versions, so relying on the default is fragile.
chipotle["item_price_numeric"] = pd.to_numeric(
    chipotle["item_price"].str.replace("$", "", regex=False)
)

# A bare `chipotle.dtypes` in the middle of a cell is never displayed (only the
# last expression is), so verify the dtype explicitly instead.
assert pd.api.types.is_numeric_dtype(chipotle["item_price_numeric"])

chipotle.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_numeric
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,2.39
1,1,1,Izze,[Clementine],$3.39,3.39
2,1,1,Nantucket Nectar,[Apple],$3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,16.98


2. **What is the average price of an item with chicken?**

In [52]:
# Mean price over the order lines whose item name mentions "Chicken".
# str.contains searches the whole name, whereas str.match only tests the start
# of the string and would skip any item where "Chicken" is not the first word.
# NOTE(review): this averages order lines, not units; a line with quantity > 1
# counts once at its listed price - confirm that is the intended definition.
chipotle.loc[chipotle['item_name'].str.contains('Chicken'), "item_price_numeric"].mean()

10.133724358974309

3. **What is the average price of an item with steak?**   

In [53]:
# Mean price over the order lines whose item name mentions "Steak".
# str.contains searches the whole name, whereas str.match only tests the start
# of the string and would skip any item where "Steak" is not the first word.
# NOTE(review): this averages order lines, not units; a line with quantity > 1
# counts once at its listed price - confirm that is the intended definition.
chipotle.loc[chipotle['item_name'].str.contains('Steak'), "item_price_numeric"].mean()

10.518888888888851

4. **Did chicken or steak produce more revenue (total)?**   

In [54]:
# Compare total revenue per protein by summing the cleaned prices of the
# matching order lines. str.contains is used (not str.match, which only tests
# the start of the name) so every item that mentions the protein is included.
is_chicken = chipotle['item_name'].str.contains('Chicken')
is_steak = chipotle['item_name'].str.contains('Steak')

chicken_total = chipotle.loc[is_chicken, "item_price_numeric"].sum()
steak_total = chipotle.loc[is_steak, "item_price_numeric"].sum()

# The cell's value is a boolean: True means chicken out-earned steak.
chicken_total > steak_total

True

In [55]:
# Alternative solution: build an explicit revenue column, then total it per
# protein.
# NOTE(review): the previewed Chicken Bowl row has quantity 2 and a price of
# $16.98, which suggests item_price may already be the line total; if so,
# multiplying by quantity double counts - confirm before relying on these
# figures.
chipotle['revenue'] = chipotle['item_price_numeric'] * chipotle['quantity']

# Sum only the revenue column. Summing the whole filtered frame also tries to
# add up the text columns, which is wasted work and can raise a TypeError on
# pandas versions where reductions no longer drop non-numeric columns.
steak_revenue = chipotle.loc[chipotle['item_name'].str.contains(pat='Steak'), 'revenue'].sum()
print('Product containing steak generated a revenue of', steak_revenue, 'dollars')

# Separate name for chicken so the variable matches what it holds.
chicken_revenue = chipotle.loc[chipotle['item_name'].str.contains(pat='Chicken'), 'revenue'].sum()
print('Product containing chicken generated a revenue of', chicken_revenue, 'dollars')

Product containing steak generated a revenue of 8072.619999999973 dollars
Product containing chicken generated a revenue of 17742.14999999992 dollars


5. **How many missing items are there in this dataset? How many missing items in each column?**

In [56]:
# Missing cells across the whole dataset: flag each NaN, count per column,
# then add the per-column counts together.
chipotle.isna().sum(axis=0).sum()

1246

In [57]:
# Missing cells per column: number of rows minus the non-missing count that
# .count() reports for each column.
len(chipotle) - chipotle.count()

order_id                 0
quantity                 0
item_name                0
choice_description    1246
item_price               0
item_price_numeric       0
revenue                  0
dtype: int64