# Nutrition Analysis Tour
## Cleaning initial .csv file using pandas

In [1]:
import pandas as pd

In [2]:
sb_clean = pd.read_csv("starbucks_drink_menu.csv")
sb_clean.head()

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
0,Coffee,Brewed Coffee,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0%,0%,0%,0%,175
1,Coffee,Brewed Coffee,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0%,0%,0%,0%,260
2,Coffee,Brewed Coffee,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0%,0%,0%,0%,330
3,Coffee,Brewed Coffee,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0%,0%,2%,0%,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10%,0%,20%,0%,75


In [3]:
# Let's see what we are up against here 
# check for missing data & data types
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Beverage_category          242 non-null    object 
 1   Beverage                   242 non-null    object 
 2   Beverage_prep              242 non-null    object 
 3   Calories                   242 non-null    int64  
 4    Total Fat (g)             242 non-null    object 
 5   Trans Fat (g)              242 non-null    float64
 6   Saturated Fat (g)          242 non-null    float64
 7    Sodium (mg)               242 non-null    int64  
 8    Total Carbohydrates (g)   242 non-null    int64  
 9   Cholesterol (mg)           242 non-null    int64  
 10   Dietary Fibre (g)         242 non-null    int64  
 11   Sugars (g)                242 non-null    int64  
 12   Protein (g)               242 non-null    float64
 13  Vitamin A (% DV)           242 non-null    object 

In [4]:
# check for at least 1 missing value in a row
sb_clean[sb_clean.isna().any(axis = 1)]
# observation of data in this region: the same drink (with other milk types) contains 90mg caffeine, will add in this value

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
158,Shaken Iced Beverages,Iced Brewed Coffee (With Milk & Classic Syrup),2% Milk,90,1,0.5,0.0,5,25,18,0,18,2.0,2%,0%,6%,0.00%,


In [5]:
# add missing caffeine value
sb_clean["Caffeine (mg)"].replace([None], [90], inplace = True)

In [6]:
sb_clean.loc[158, "Caffeine (mg)"]

90

In [7]:
# check for duplicates
sb_clean.duplicated().sum()

0

In [10]:
# Clean up header names (lowercase with underscore)
# haha!  White spaces abount in column header names.  I learned a valuable lesson.
dict = {"Beverage_category":"category", "Beverage":"name", "Beverage_prep":"prep", "Calories":"calories", " Total Fat (g)":"total_fat_g", "Trans Fat (g) ":"trans_fat_g", 
        "Saturated Fat (g)":"sat_fat_g", " Sodium (mg)":"sodium_mg", " Total Carbohydrates (g) ":"carbs_g", "Cholesterol (mg)":"chol_mg", " Dietary Fibre (g)":"fiber_g",
        " Sugars (g)":"sugars_g", " Protein (g) ":"protein_g", "Vitamin A (% DV) ":"vit_a_%dv", "Vitamin C (% DV)":"vit_c_%dv", " Calcium (% DV) ":"calcium_%dv", 
        "Iron (% DV) ":"iron_%dv", "Caffeine (mg)":"caffeine_mg"
}
sb_clean.rename(columns = dict, inplace = True)
sb_clean.head()

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Coffee,Brewed Coffee,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0,0%,0%,0%,175
1,Coffee,Brewed Coffee,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0,0%,0%,0%,260
2,Coffee,Brewed Coffee,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0%,0%,0%,330
3,Coffee,Brewed Coffee,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0%,2%,0%,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0%,20%,0%,75


In [11]:
# remove extraneous % symbol
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].str.strip('%')
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].str.strip('%')
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].str.strip('%')
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].str.strip('%')
sb_clean

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Coffee,Brewed Coffee,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0,0,0,0,175
1,Coffee,Brewed Coffee,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0,0,0,0,260
2,Coffee,Brewed Coffee,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,0,0,330
3,Coffee,Brewed Coffee,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,2,0,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Soymilk,320,3 2,0.4,0.0,0,250,67,1,64,5.0,6,8,20,10,0
238,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Tall Nonfat Milk,170,0.1,0.1,0.0,0,160,39,0,38,4.0,6,0,10,0,0
239,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Whole Milk,200,3.5,2.0,0.1,10,160,39,0,38,3.0,6,0,10,0,0
240,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Soymilk,180,1.5,0.2,0.0,0,160,37,1,35,3.0,4,0,10,6,0


In [12]:
# now verify that these columns contain only integer-like strings for vitamins/minerals and caffeine
sb_clean['iron_%dv'].value_counts()
# ooo gross random floats, fix it!

0        99
2        20
10       17
20       16
8        15
6        14
15       11
4        11
30        9
25        9
0.00      9
40        3
35        3
50        2
6.00      1
8.00      1
10.00     1
15.00     1
Name: iron_%dv, dtype: int64

In [13]:
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].apply(lambda x: x.replace('.00', '') if '.00' in x else x)

In [14]:
sb_clean['iron_%dv'].value_counts()

0     108
2      20
10     18
8      16
20     16
6      15
15     12
4      11
25      9
30      9
40      3
35      3
50      2
Name: iron_%dv, dtype: int64

In [19]:
sb_clean['caffeine_mg'].value_counts()
# Alas.  A mg value of "varies" is less than useless, so we will omit this data.

75        37
0         35
150       34
70        14
varies    12
95        11
Varies    10
110        9
130        7
25         6
120        6
90         4
175        4
20         3
15         3
125        3
140        3
30         3
85         3
180        3
80         3
55         3
100        3
145        3
50         3
10         3
170        3
165        2
65         1
330        1
235        1
90         1
410        1
225        1
260        1
300        1
105        1
Name: caffeine_mg, dtype: int64

In [23]:
# Filtering data to remove non-numeric caffeine values
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'varies']
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'Varies']
sb_clean['caffeine_mg'].value_counts()

75     37
0      35
150    34
70     14
95     11
110     9
130     7
25      6
120     6
90      4
175     4
10      3
15      3
125     3
140     3
20      3
85      3
80      3
55      3
180     3
100     3
50      3
145     3
30      3
170     3
165     2
330     1
65      1
235     1
90      1
410     1
260     1
225     1
300     1
105     1
Name: caffeine_mg, dtype: int64

In [25]:
# cast vitamin/mineral data type from object to int
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].astype(int)
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].astype(int)
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].astype(int)
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].astype(int)
sb_clean['caffeine_mg'] = sb_clean['caffeine_mg'].astype(int)
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220 entries, 0 to 241
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   category     220 non-null    object 
 1   name         220 non-null    object 
 2   prep         220 non-null    object 
 3   calories     220 non-null    int64  
 4   total_fat_g  220 non-null    object 
 5   trans_fat_g  220 non-null    float64
 6   sat_fat_g    220 non-null    float64
 7   sodium_mg    220 non-null    int64  
 8   carbs_g      220 non-null    int64  
 9   chol_mg      220 non-null    int64  
 10  fiber_g      220 non-null    int64  
 11  sugars_g     220 non-null    int64  
 12  protein_g    220 non-null    float64
 13  vit_a_%dv    220 non-null    int32  
 14  vit_c_%dv    220 non-null    int32  
 15  calcium_%dv  220 non-null    int32  
 16  iron_%dv     220 non-null    int32  
 17  caffeine_mg  220 non-null    int32  
dtypes: float64(3), int32(5), int64(6), object(4)
memor

In [27]:
# Normalize prep column: add column for milk type
sb_clean.insert(3, "milk_type", sb_clean["prep"])

ValueError: cannot insert milk_type, already exists

In [28]:
sb_clean.head(15)

Unnamed: 0,category,name,prep,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Coffee,Brewed Coffee,Short,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0,0,0,0,175
1,Coffee,Brewed Coffee,Tall,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0,0,0,0,260
2,Coffee,Brewed Coffee,Grande,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,0,0,330
3,Coffee,Brewed Coffee,Venti,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,2,0,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
5,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
6,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
7,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
8,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
9,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,110,4.5,0.5,0.0,0,105,10,1,6,8.0,10,0,30,15,75


In [32]:
# Using lambdas to update milk_type with selection of soy, 2%, whole, or nonfat
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Soy' if 'Soy' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Nonfat' if 'Nonfat' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '2%' if '2%' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Whole' if 'Whole Milk' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat    78
Soy       62
2%        46
          18
Whole     16
Name: milk_type, dtype: int64

In [31]:
# Remove extraneous descriptors besides types of milk
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '' if 'Tall' in x or 'Grande' in x or 'Venti' in x or 'Short' in x or 'Solo' in x or 'Doppio' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat        78
Soy           62
2%            46
              18
Whole Milk    16
Name: milk_type, dtype: int64

In [34]:
# Normalize prep column: add column for size_oz
sb_clean.insert(3, "size_oz", sb_clean["prep"])
sb_clean.head()

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Coffee,Brewed Coffee,Short,Short,,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0,0,0,0,175
1,Coffee,Brewed Coffee,Tall,Tall,,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0,0,0,0,260
2,Coffee,Brewed Coffee,Grande,Grande,,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,0,0,330
3,Coffee,Brewed Coffee,Venti,Venti,,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,2,0,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,Nonfat,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75


In [35]:
# Using replace() to update values (solo = 0.75oz, doppio = 1.5oz, short = 8 oz, tall = 12 oz, grande = 16oz, venti hot = 20oz, venti cold = 24 oz) 
# Column will be cold and = 24 if Beverage_category contains “Iced”, “Smoothies” or “Blended”
# 1st param = thing we're trying to replace, 2nd param = what we want to replace with
sb_clean['size_oz'].replace("Short", 8, inplace = True)
sb_clean['size_oz'].replace("Tall", 12, inplace = True)
sb_clean['size_oz'].replace("Grande", 16, inplace = True)
sb_clean['size_oz'].replace("Venti", 20, inplace = True)
sb_clean['size_oz'].value_counts()

Soymilk               62
2% Milk               46
Grande Nonfat Milk    24
Tall Nonfat Milk      22
Venti Nonfat Milk     21
Whole Milk            16
Short Nonfat Milk     11
12                     5
16                     4
20                     4
8                      3
Solo                   1
Doppio                 1
Name: size_oz, dtype: int64

In [36]:
sb_clean.head(20)

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Coffee,Brewed Coffee,Short,8,,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0,0,0,0,175
1,Coffee,Brewed Coffee,Tall,12,,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0,0,0,0,260
2,Coffee,Brewed Coffee,Grande,16,,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,0,0,330
3,Coffee,Brewed Coffee,Venti,20,,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0,0,2,0,410
4,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,Nonfat,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
5,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
6,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,Soy,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
7,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,Nonfat,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
8,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
9,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,Soy,110,4.5,0.5,0.0,0,105,10,1,6,8.0,10,0,30,15,75


In [38]:
# If a size_oz value == "Short Nonfat Milk":
# change it to 8
# change next 2 values to 8
# OR
# if a size_oz value == "Soymilk":
# if a value
# df.loc[df['val'].shift(-1).eq('yes'), 'val'] = 'yes'
sb_clean.to_csv('sb_clean_TEST.csv', index = False)

In [None]:
# Add % DV calculations for Fat, Sugar, Caffeine

In [None]:
# Descriptive stats (here or in new NB entry)?
# which drinks have the most sugar?  Overall caffeine in tea vs coffee drinks?

In [None]:
# export clean csv file

In [None]:
# add unique ids for cat_id, name_id, type_id