# Nutrition Analysis Tour
## Cleaning initial .csv file using pandas

In [1]:
import pandas as pd

In [2]:
sb_clean = pd.read_csv("starbucks_drink_menu.csv")
sb_clean.head()

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10%,0%,20%,0%,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10%,0%,20%,0%,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6%,0%,20%,8%,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15%,0%,30%,0%,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15%,0%,30%,0%,75


In [3]:
# Let's see what we are up against here 
# Check for missing data & identify data types
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Beverage_category          240 non-null    object 
 1   Beverage                   240 non-null    object 
 2   Beverage_prep              240 non-null    object 
 3   Calories                   240 non-null    int64  
 4    Total Fat (g)             240 non-null    object 
 5   Trans Fat (g)              240 non-null    float64
 6   Saturated Fat (g)          240 non-null    float64
 7    Sodium (mg)               240 non-null    int64  
 8    Total Carbohydrates (g)   240 non-null    int64  
 9   Cholesterol (mg)           240 non-null    int64  
 10   Dietary Fibre (g)         240 non-null    int64  
 11   Sugars (g)                240 non-null    int64  
 12   Protein (g)               240 non-null    float64
 13  Vitamin A (% DV)           240 non-null    object 

In [4]:
# The Caffeine (mg) column is minus 1 value compared to all other columns
# Identify record 1 missing at least 1 value
sb_clean[sb_clean.isna().any(axis = 1)]

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
133,Shaken Iced Beverages,Iced Brewed Coffee (With Milk & Classic Syrup),2% Milk,90,1,0.5,0.0,5,25,18,0,18,2.0,2%,0%,6%,0.00%,


In [5]:
# Manual observation of data in this region: the same drink (with other milk types) contains 90mg caffeine, will add in this value
sb_clean["Caffeine (mg)"].replace([None], [90], inplace = True)

In [6]:
# Verify this value is added properly
sb_clean.loc[158, "Caffeine (mg)"]

'130'

In [7]:
# Check dataset for duplicates
sb_clean.duplicated().sum()

0

In [8]:
# Change header names to be more data-agnostic (e.g. make lowercase with underscore)
# Haha!  White spaces abound in column header names.  
dict = {"Beverage_category":"category", "Beverage":"name", "Beverage_prep":"prep", "Calories":"calories", " Total Fat (g)":"total_fat_g", "Trans Fat (g) ":"trans_fat_g", 
        "Saturated Fat (g)":"sat_fat_g", " Sodium (mg)":"sodium_mg", " Total Carbohydrates (g) ":"carbs_g", "Cholesterol (mg)":"chol_mg", " Dietary Fibre (g)":"fiber_g",
        " Sugars (g)":"sugars_g", " Protein (g) ":"protein_g", "Vitamin A (% DV) ":"vit_a_%dv", "Vitamin C (% DV)":"vit_c_%dv", " Calcium (% DV) ":"calcium_%dv", 
        "Iron (% DV) ":"iron_%dv", "Caffeine (mg)":"caffeine_mg"
}
sb_clean.rename(columns = dict, inplace = True)
sb_clean.head()

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10%,0%,20%,0%,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10%,0%,20%,0%,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6%,0%,20%,8%,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15%,0%,30%,0%,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15%,0%,30%,0%,75


In [9]:
# Remove extraneous % symbol from values
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].str.strip('%')
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].str.strip('%')
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].str.strip('%')
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].str.strip('%')
sb_clean

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,Tazo® Tea Drinks,Tazo® Tea,Grande,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,0,0,Varies
236,Tazo® Tea Drinks,Tazo® Tea,Venti,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,0,0,Varies
237,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Tall,60,0,0.0,0.0,0,4,15,0,15,0.2,0,0,0,0,120
238,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Grande,90,0.1,0.0,0.0,0,5,21,0,21,0.3,0,0,0,0,165


In [10]:
# For vitamins/minerals and caffeine, verify that columns contain only integer-friendly strings
sb_clean['iron_%dv'].value_counts()
# Oopsie daisy some floats are lurking

0        97
2        20
10       17
20       16
8        15
6        14
15       11
4        11
30        9
25        9
0.00      9
40        3
35        3
50        2
6.00      1
8.00      1
10.00     1
15.00     1
Name: iron_%dv, dtype: int64

In [11]:
# Remove float nomenclature
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].apply(lambda x: x.replace('.00', '') if '.00' in x else x)

In [12]:
sb_clean['iron_%dv'].value_counts()

0     106
2      20
10     18
8      16
20     16
6      15
15     12
4      11
25      9
30      9
40      3
35      3
50      2
Name: iron_%dv, dtype: int64

In [13]:
sb_clean['caffeine_mg'].value_counts()
# Alas.  A mg value of "varies" is less than useless

75        36
0         35
150       33
70        14
varies    12
95        11
Varies     9
110        9
130        7
120        6
25         6
15         4
90         4
175        4
55         3
170        3
85         3
145        3
80         3
140        3
180        3
100        3
125        3
10         3
20         3
30         3
50         3
165        2
90         1
260        1
330        1
410        1
225        1
300        1
65         1
105        1
235        1
Name: caffeine_mg, dtype: int64

In [14]:
# Filtering data to remove non-numeric caffeine values
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'varies']
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'Varies']
sb_clean['caffeine_mg'].value_counts()

75     36
0      35
150    33
70     14
95     11
110     9
130     7
25      6
120     6
15      4
90      4
175     4
50      3
125     3
145     3
100     3
30      3
85      3
170     3
140     3
180     3
10      3
20      3
80      3
55      3
165     2
90      1
260     1
330     1
410     1
225     1
300     1
65      1
105     1
235     1
Name: caffeine_mg, dtype: int64

In [15]:
sb_clean['total_fat_g'].value_counts()

0.1    32
1.5    15
5      14
3      14
1      13
6      13
4      12
0.2    12
2.5    12
3.5    11
7      10
0      10
4.5     9
2       9
9       6
8       6
0.3     6
0.5     4
11      3
10      3
0.4     2
15      1
13      1
3 2     1
Name: total_fat_g, dtype: int64

In [16]:
# Fix the typo containing an extra space
sb_clean[sb_clean.total_fat_g == '3 2']
sb_clean.total_fat_g.replace("3 2", "32", inplace = True)
sb_clean['total_fat_g'].value_counts()

0.1    32
1.5    15
5      14
3      14
1      13
6      13
4      12
0.2    12
2.5    12
3.5    11
7      10
0      10
4.5     9
2       9
9       6
8       6
0.3     6
0.5     4
11      3
10      3
0.4     2
15      1
13      1
32      1
Name: total_fat_g, dtype: int64

In [17]:
# Cast vitamin/mineral and caffeine data type from object to integer
sb_clean['total_fat_g'] = sb_clean['total_fat_g'].astype(float)
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].astype(int)
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].astype(int)
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].astype(int)
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].astype(int)
sb_clean['caffeine_mg'] = sb_clean['caffeine_mg'].astype(int)
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219 entries, 0 to 239
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   category     219 non-null    object 
 1   name         219 non-null    object 
 2   prep         219 non-null    object 
 3   calories     219 non-null    int64  
 4   total_fat_g  219 non-null    float64
 5   trans_fat_g  219 non-null    float64
 6   sat_fat_g    219 non-null    float64
 7   sodium_mg    219 non-null    int64  
 8   carbs_g      219 non-null    int64  
 9   chol_mg      219 non-null    int64  
 10  fiber_g      219 non-null    int64  
 11  sugars_g     219 non-null    int64  
 12  protein_g    219 non-null    float64
 13  vit_a_%dv    219 non-null    int32  
 14  vit_c_%dv    219 non-null    int32  
 15  calcium_%dv  219 non-null    int32  
 16  iron_%dv     219 non-null    int32  
 17  caffeine_mg  219 non-null    int32  
dtypes: float64(4), int32(5), int64(6), object(3)
memor

In [18]:
# Normalize prep column: add column for milk type
sb_clean.insert(3, "milk_type", sb_clean["prep"])

In [19]:
sb_clean.head()

Unnamed: 0,category,name,prep,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75


In [20]:
# Update milk_type with selection of soy, 2%, whole, or nonfat
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Soy' if 'Soy' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Nonfat' if 'Nonfat' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '2%' if '2%' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Whole' if 'Whole Milk' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat    79
Soy       62
2%        46
Whole     16
Tall       5
Grande     4
Venti      4
Short      3
Name: milk_type, dtype: int64

In [21]:
# Remove extraneous descriptors besides types of milk
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '' if 'Tall' in x or 'Grande' in x or 'Venti' in x or 'Short' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat    79
Soy       62
2%        46
Whole     16
          16
Name: milk_type, dtype: int64

In [22]:
# Normalize prep column: add column for size_oz
sb_clean.insert(3, "size_oz", sb_clean["prep"])
sb_clean.head()

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,Nonfat,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,Soy,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,Nonfat,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75


In [23]:
# Store dataframe containing workable pattern (e.g. Short Nonfat Milk, 2% Milk, Soymilk)
sb_clean_slice = sb_clean.loc[0:197]

In [24]:
# Attribute integers to size_oz column based upon pattern in sb_clean_slice.prep
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Short Nonfat Milk'), 'size_oz'] = 8
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Short Nonfat Milk'), 'size_oz'] = 8

sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Tall Nonfat Milk'), 'size_oz'] = 12
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Tall Nonfat Milk'), 'size_oz'] = 12

sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Grande Nonfat Milk'), 'size_oz'] = 16
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Grande Nonfat Milk'), 'size_oz'] = 16

sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Venti Nonfat Milk'), 'size_oz'] = 20
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Venti Nonfat Milk'), 'size_oz'] = 20

In [25]:
# Almost there...all size_oz column values containing only a milk type description have been properly changed to an integer of ounce size
sb_clean['size_oz'].value_counts()

16                    38
12                    34
20                    32
Grande Nonfat Milk    25
Tall Nonfat Milk      22
Venti Nonfat Milk     21
8                     20
Short Nonfat Milk     11
Tall                   5
Grande                 4
Venti                  4
Short                  3
Name: size_oz, dtype: int64

In [26]:
# Update remaining values containing milk desciptions to become only integer values 
sb_clean['size_oz'].replace(["Short", "Short Nonfat Milk"], 8, inplace = True)
sb_clean['size_oz'].replace(["Tall", "Tall Nonfat Milk"], 12, inplace = True)
sb_clean['size_oz'].replace(["Grande", "Grande Nonfat Milk"], 16, inplace = True)
sb_clean['size_oz'].replace(["Venti", "Venti Nonfat Milk"], 20, inplace = True)
sb_clean['size_oz'].value_counts()

16    67
12    61
20    57
8     34
Name: size_oz, dtype: int64

In [27]:
# Move the caffeine_mg column
col = sb_clean.pop("caffeine_mg")
sb_clean.insert(6, "caffeine_mg", col)

In [28]:
sb_clean.head()

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,caffeine_mg,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,8,Nonfat,70,75,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0
1,Classic Espresso Drinks,Caffè Latte,2% Milk,8,2%,100,75,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0
2,Classic Espresso Drinks,Caffè Latte,Soymilk,8,Soy,70,75,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,12,Nonfat,100,75,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0
4,Classic Espresso Drinks,Caffè Latte,2% Milk,12,2%,150,75,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0


In [29]:
# Add % DV columns for calories, caffeine_mg, total_fat, sugars_g
cal_col = (sb_clean.calories/2000) * 100
sb_clean.insert(loc = 6, column = 'calories_%dv', value = cal_col)

In [30]:
caff_col = (sb_clean.caffeine_mg/400) * 100
sb_clean.insert(loc = 8, column = 'caffeine_%dv', value = caff_col)

In [31]:
fat_col = (sb_clean.total_fat_g/78) * 100
sb_clean.insert(loc = 10, column = 'fat_%dv', value = fat_col)

In [32]:
sug_col = (sb_clean.sugars_g/50) * 100
sb_clean.insert(loc = 18, column = 'sugars_%dv', value = sug_col)

In [33]:
# Remove the unnecessary 'prep' column
sb_clean.drop(columns= "prep", inplace = True)

In [34]:
# Round floats nicely
sb_clean = sb_clean.round(0)

In [36]:
# Export clean file
sb_clean.to_csv('sb_clean.csv', index = False)