# Nutrition Analysis Tour
## Cleaning initial .csv file using pandas

In [1]:
import pandas as pd

In [2]:
sb_clean = pd.read_csv("starbucks_drink_menu.csv")
sb_clean.head()

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10%,0%,20%,0%,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10%,0%,20%,0%,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6%,0%,20%,8%,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15%,0%,30%,0%,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15%,0%,30%,0%,75


In [3]:
# Let's see what we are up against here 
# check for missing data & data types
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Beverage_category          240 non-null    object 
 1   Beverage                   240 non-null    object 
 2   Beverage_prep              240 non-null    object 
 3   Calories                   240 non-null    int64  
 4    Total Fat (g)             240 non-null    object 
 5   Trans Fat (g)              240 non-null    float64
 6   Saturated Fat (g)          240 non-null    float64
 7    Sodium (mg)               240 non-null    int64  
 8    Total Carbohydrates (g)   240 non-null    int64  
 9   Cholesterol (mg)           240 non-null    int64  
 10   Dietary Fibre (g)         240 non-null    int64  
 11   Sugars (g)                240 non-null    int64  
 12   Protein (g)               240 non-null    float64
 13  Vitamin A (% DV)           240 non-null    object 

In [4]:
# check for at least 1 missing value in a row
sb_clean[sb_clean.isna().any(axis = 1)]
# observation of data in this region: the same drink (with other milk types) contains 90mg caffeine, will add in this value

Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
133,Shaken Iced Beverages,Iced Brewed Coffee (With Milk & Classic Syrup),2% Milk,90,1,0.5,0.0,5,25,18,0,18,2.0,2%,0%,6%,0.00%,


In [5]:
# add missing caffeine value
sb_clean["Caffeine (mg)"].replace([None], [90], inplace = True)

In [6]:
sb_clean.loc[158, "Caffeine (mg)"]

'130'

In [7]:
# check for duplicates
sb_clean.duplicated().sum()

0

In [8]:
# Clean up header names (lowercase with underscore)
# haha!  White spaces abount in column header names.  I learned a valuable lesson.
dict = {"Beverage_category":"category", "Beverage":"name", "Beverage_prep":"prep", "Calories":"calories", " Total Fat (g)":"total_fat_g", "Trans Fat (g) ":"trans_fat_g", 
        "Saturated Fat (g)":"sat_fat_g", " Sodium (mg)":"sodium_mg", " Total Carbohydrates (g) ":"carbs_g", "Cholesterol (mg)":"chol_mg", " Dietary Fibre (g)":"fiber_g",
        " Sugars (g)":"sugars_g", " Protein (g) ":"protein_g", "Vitamin A (% DV) ":"vit_a_%dv", "Vitamin C (% DV)":"vit_c_%dv", " Calcium (% DV) ":"calcium_%dv", 
        "Iron (% DV) ":"iron_%dv", "Caffeine (mg)":"caffeine_mg"
}
sb_clean.rename(columns = dict, inplace = True)
sb_clean.head()

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10%,0%,20%,0%,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10%,0%,20%,0%,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6%,0%,20%,8%,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15%,0%,30%,0%,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15%,0%,30%,0%,75


In [9]:
# remove extraneous % symbol
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].str.strip('%')
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].str.strip('%')
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].str.strip('%')
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].str.strip('%')
sb_clean

Unnamed: 0,category,name,prep,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,150,6,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,Tazo® Tea Drinks,Tazo® Tea,Grande,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,0,0,Varies
236,Tazo® Tea Drinks,Tazo® Tea,Venti,0,0,0.0,0.0,0,0,0,0,0,0.0,0,0,0,0,Varies
237,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Tall,60,0,0.0,0.0,0,4,15,0,15,0.2,0,0,0,0,120
238,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Grande,90,0.1,0.0,0.0,0,5,21,0,21,0.3,0,0,0,0,165


In [10]:
# now verify that these columns contain only integer-like strings for vitamins/minerals and caffeine
sb_clean['iron_%dv'].value_counts()
# ooo gross random floats, fix it!

0        97
2        20
10       17
20       16
8        15
6        14
15       11
4        11
30        9
25        9
0.00      9
40        3
35        3
50        2
6.00      1
8.00      1
10.00     1
15.00     1
Name: iron_%dv, dtype: int64

In [11]:
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].apply(lambda x: x.replace('.00', '') if '.00' in x else x)

In [12]:
sb_clean['iron_%dv'].value_counts()

0     106
2      20
10     18
8      16
20     16
6      15
15     12
4      11
25      9
30      9
40      3
35      3
50      2
Name: iron_%dv, dtype: int64

In [13]:
sb_clean['caffeine_mg'].value_counts()
# Alas.  A mg value of "varies" is less than useless, so we will omit this data.

75        36
0         35
150       33
70        14
varies    12
95        11
Varies     9
110        9
130        7
120        6
25         6
15         4
90         4
175        4
55         3
170        3
85         3
145        3
80         3
140        3
180        3
100        3
125        3
10         3
20         3
30         3
50         3
165        2
90         1
260        1
330        1
410        1
225        1
300        1
65         1
105        1
235        1
Name: caffeine_mg, dtype: int64

In [14]:
# Filtering data to remove non-numeric caffeine values
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'varies']
sb_clean = sb_clean[sb_clean['caffeine_mg'] != 'Varies']
sb_clean['caffeine_mg'].value_counts()

75     36
0      35
150    33
70     14
95     11
110     9
130     7
25      6
120     6
15      4
90      4
175     4
50      3
125     3
145     3
100     3
30      3
85      3
170     3
140     3
180     3
10      3
20      3
80      3
55      3
165     2
90      1
260     1
330     1
410     1
225     1
300     1
65      1
105     1
235     1
Name: caffeine_mg, dtype: int64

In [15]:
# cast vitamin/mineral data type from object to int
sb_clean['vit_a_%dv'] = sb_clean['vit_a_%dv'].astype(int)
sb_clean['vit_c_%dv'] = sb_clean['vit_c_%dv'].astype(int)
sb_clean['calcium_%dv'] = sb_clean['calcium_%dv'].astype(int)
sb_clean['iron_%dv'] = sb_clean['iron_%dv'].astype(int)
sb_clean['caffeine_mg'] = sb_clean['caffeine_mg'].astype(int)
sb_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219 entries, 0 to 239
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   category     219 non-null    object 
 1   name         219 non-null    object 
 2   prep         219 non-null    object 
 3   calories     219 non-null    int64  
 4   total_fat_g  219 non-null    object 
 5   trans_fat_g  219 non-null    float64
 6   sat_fat_g    219 non-null    float64
 7   sodium_mg    219 non-null    int64  
 8   carbs_g      219 non-null    int64  
 9   chol_mg      219 non-null    int64  
 10  fiber_g      219 non-null    int64  
 11  sugars_g     219 non-null    int64  
 12  protein_g    219 non-null    float64
 13  vit_a_%dv    219 non-null    int32  
 14  vit_c_%dv    219 non-null    int32  
 15  calcium_%dv  219 non-null    int32  
 16  iron_%dv     219 non-null    int32  
 17  caffeine_mg  219 non-null    int32  
dtypes: float64(3), int32(5), int64(6), object(4)
memor

In [16]:
# Normalize prep column: add column for milk type
sb_clean.insert(3, "milk_type", sb_clean["prep"])

In [17]:
sb_clean.head(15)

Unnamed: 0,category,name,prep,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
5,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,110,4.5,0.5,0.0,0,105,10,1,6,8.0,10,0,30,15,75
6,Classic Espresso Drinks,Caffè Latte,Grande Nonfat Milk,Grande Nonfat Milk,130,0.3,0.2,0.0,5,150,19,0,18,13.0,20,0,40,0,150
7,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,190,7.0,3.5,0.2,30,170,19,0,17,12.0,20,2,40,0,150
8,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,150,5.0,0.5,0.0,0,130,13,1,8,10.0,15,0,40,15,150
9,Classic Espresso Drinks,Caffè Latte,Venti Nonfat Milk,Venti Nonfat Milk,170,0.4,0.3,0.0,10,190,25,0,23,16.0,30,0,50,0,150


In [18]:
# Using lambdas to update milk_type with selection of soy, 2%, whole, or nonfat
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Soy' if 'Soy' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Nonfat' if 'Nonfat' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '2%' if '2%' in x else x)
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: 'Whole' if 'Whole Milk' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat    79
Soy       62
2%        46
Whole     16
Tall       5
Grande     4
Venti      4
Short      3
Name: milk_type, dtype: int64

In [19]:
# Remove extraneous descriptors besides types of milk
sb_clean['milk_type'] = sb_clean['milk_type'].apply(lambda x: '' if 'Tall' in x or 'Grande' in x or 'Venti' in x or 'Short' in x or 'Solo' in x or 'Doppio' in x else x)
sb_clean['milk_type'].value_counts()

Nonfat    79
Soy       62
2%        46
Whole     16
          16
Name: milk_type, dtype: int64

In [20]:
# Normalize prep column: add column for size_oz
sb_clean.insert(3, "size_oz", sb_clean["prep"])
sb_clean.head()

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,Short Nonfat Milk,Nonfat,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,Soymilk,Soy,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,Tall Nonfat Milk,Nonfat,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,2% Milk,2%,150,6.0,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75


In [34]:
sb_clean_slice = sb_clean.loc[0:197]
sb_clean_slice.tail(10)

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
188,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Soymilk,Soymilk,Soy,170,1.5,0.2,0.0,0,135,37,1,35,3.0,4,6,10,6,0
189,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Grande Nonfat Milk,Grande Nonfat Milk,Nonfat,230,0.2,0.1,0.0,0,190,53,0,52,4.0,8,6,15,4,0
190,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Whole Milk,Whole Milk,Whole,260,4,2.0,0.1,10,190,53,0,52,4.0,6,6,15,4,0
191,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Soymilk,Soymilk,Soy,240,2,0.2,0.0,0,180,51,1,49,3.0,4,6,15,8,0
192,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Venti Nonfat Milk,Venti Nonfat Milk,Nonfat,310,0.2,0.1,0.0,5,260,70,0,69,6.0,10,8,20,4,0
193,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Whole Milk,Whole Milk,Whole,350,6,3.0,0.2,15,260,70,0,68,6.0,8,8,20,4,0
194,Frappuccino® Blended Crème,Strawberries & Crème (Without Whipped Cream),Soymilk,Soymilk,Soy,320,3 2,0.4,0.0,0,250,67,1,64,5.0,6,8,20,10,0
195,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Tall Nonfat Milk,Tall Nonfat Milk,Nonfat,170,0.1,0.1,0.0,0,160,39,0,38,4.0,6,0,10,0,0
196,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Whole Milk,Whole Milk,Whole,200,3.5,2.0,0.1,10,160,39,0,38,3.0,6,0,10,0,0
197,Frappuccino® Blended Crème,Vanilla Bean (Without Whipped Cream),Soymilk,Soymilk,Soy,180,1.5,0.2,0.0,0,160,37,1,35,3.0,4,0,10,6,0


In [35]:
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Short Nonfat Milk'), 'size_oz'] = 8
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Short Nonfat Milk'), 'size_oz'] = 8
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Tall Nonfat Milk'), 'size_oz'] = 12
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Tall Nonfat Milk'), 'size_oz'] = 12
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Venti Nonfat Milk'), 'size_oz'] = 16
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Venti Nonfat Milk'), 'size_oz'] = 16
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+1).eq('Grande Nonfat Milk'), 'size_oz'] = 20
sb_clean_slice.loc[sb_clean_slice['prep'].shift(+2).eq('Grande Nonfat Milk'), 'size_oz'] = 20

In [36]:
sb_clean['size_oz'].value_counts()

20                    38
12                    34
16                    32
Grande Nonfat Milk    25
Tall Nonfat Milk      22
Venti Nonfat Milk     21
8                     20
Short Nonfat Milk     11
Tall                   5
Grande                 4
Venti                  4
Short                  3
Name: size_oz, dtype: int64

In [42]:
# Using replace() to update values 

sb_clean['size_oz'].replace(["Short", "Short Nonfat Milk"], 8, inplace = True)
sb_clean['size_oz'].replace(["Tall", "Tall Nonfat Milk"], 12, inplace = True)
sb_clean['size_oz'].replace(["Grande", "Grande Nonfat Milk"], 16, inplace = True)
sb_clean['size_oz'].replace(["Venti", "Venti Nonfat Milk"], 20, inplace = True)
sb_clean['size_oz'].value_counts()

20    63
12    61
16    61
8     34
Name: size_oz, dtype: int64

In [43]:
sb_clean

Unnamed: 0,category,name,prep,size_oz,milk_type,calories,total_fat_g,trans_fat_g,sat_fat_g,sodium_mg,carbs_g,chol_mg,fiber_g,sugars_g,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,caffeine_mg
0,Classic Espresso Drinks,Caffè Latte,Short Nonfat Milk,8,Nonfat,70,0.1,0.1,0.0,5,75,10,0,9,6.0,10,0,20,0,75
1,Classic Espresso Drinks,Caffè Latte,2% Milk,8,2%,100,3.5,2.0,0.1,15,85,10,0,9,6.0,10,0,20,0,75
2,Classic Espresso Drinks,Caffè Latte,Soymilk,8,Soy,70,2.5,0.4,0.0,0,65,6,1,4,5.0,6,0,20,8,75
3,Classic Espresso Drinks,Caffè Latte,Tall Nonfat Milk,12,Nonfat,100,0.2,0.2,0.0,5,120,15,0,14,10.0,15,0,30,0,75
4,Classic Espresso Drinks,Caffè Latte,2% Milk,12,2%,150,6,3.0,0.2,25,135,15,0,14,10.0,15,0,30,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,Signature Espresso Drinks,Caramel Apple Spice (Without Whipped Cream),Grande,16,,280,0,0.0,0.0,0,20,70,0,65,0.0,0,0,0,0,0
232,Signature Espresso Drinks,Caramel Apple Spice (Without Whipped Cream),Venti,20,,360,0,0.0,0.0,0,25,89,0,83,0.0,0,0,0,0,0
237,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Tall,12,,60,0,0.0,0.0,0,4,15,0,15,0.2,0,0,0,0,120
238,Shaken Iced Beverages,Iced Brewed Coffee (With Classic Syrup),Grande,16,,90,0.1,0.0,0.0,0,5,21,0,21,0.3,0,0,0,0,165


In [23]:
# Column will be cold and = 24 if Beverage_category contains “Iced”, “Smoothies” or “Blended”

# df.loc[df['val'].shift(-1).eq('yes'), 'val'] = 'yes'

# for i in range(0, 199):
#    if sb_clean.iloc[i]['prep'] == 'Short Nonfat Milk':
#        sb_clean.loc[i, ['size_oz']] = 8
#        sb_clean.loc[i+1, ['size_oz']] = 8
#        sb_clean.loc[i+2, ['size_oz']] = 8
#    elif sb_clean.iloc[i]['prep'] == 'Tall Nonfat Milk':
#        sb_clean.loc[i, ['size_oz']] = 12
#        sb_clean.loc[i+1, ['size_oz']] = 12
#        sb_clean.loc[i+2, ['size_oz']] = 12
#    elif sb_clean.iloc[i]['prep'] == 'Grande Nonfat Milk':
#        sb_clean.loc[i, ['size_oz']] = 16
#        sb_clean.loc[i+1, ['size_oz']] = 16
#        sb_clean.loc[i+2, ['size_oz']] = 16
#    elif sb_clean.iloc[i]['prep'] == 'Venti Nonfat Milk':
#        sb_clean.loc[i, ['size_oz']] = 20
#        sb_clean.loc[i+1, ['size_oz']] = 20
#        sb_clean.loc[i+2, ['size_oz']] = 20

# for i in range(0, 106):
#    if sb_clean.iloc[i]['prep'] == 'Soymilk':
#        if sb_clean.iloc[i-2]['prep'] == 'Short Nonfat Milk':
#            sb_clean.loc[i, ['size_oz']] = 8
#        elif sb_clean.iloc[i-2]['prep'] == 'Tall Nonfat Milk':
#            sb_clean.loc[i, ['size_oz']] = 12
#        elif sb_clean.iloc[i-2]['prep'] == 'Grande Nonfat Milk':
#            sb_clean.loc[i, ['size_oz']] = 16
#        elif sb_clean.iloc[i-2]['prep'] == 'Venti Nonfat Milk':
#            sb_clean.loc[i, ['size_oz']] = 20
#        else:
#             continue

In [24]:
# sb_clean.to_csv('sb_clean.csv', index = False)

In [25]:
# Add % DV calculations for Fat, Sugar, Caffeine

In [26]:
# Descriptive stats (here or in new NB entry)?
# which drinks have the most sugar?  Overall caffeine in tea vs coffee drinks?

In [27]:
# export clean csv file

In [28]:
# add unique ids for cat_id, name_id, type_id