# Nutrition Analysis Tour
## Action: Prepare .csv file for optimization analysis
## Language/tool: Python (Pandas)
---

In [1]:
import pandas as pd

In [2]:
# Bring up cleaned data set
sb_prep = pd.read_csv("sb_clean.csv")
sb_prep.head()

Unnamed: 0,category,name,size_oz,milk_type,calories,calories_%dv,caffeine_mg,caffeine_%dv,total_fat_g,fat_%dv,...,carbs_g,chol_mg,fiber_g,sugars_g,sugars_%dv,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv
0,Classic Espresso Drinks,Caffè Latte,8,Nonfat,70,4.0,75,19.0,0.0,0.0,...,75,10,0,9,18.0,6.0,10,0,20,0
1,Classic Espresso Drinks,Caffè Latte,8,2%,100,5.0,75,19.0,4.0,4.0,...,85,10,0,9,18.0,6.0,10,0,20,0
2,Classic Espresso Drinks,Caffè Latte,8,Soy,70,4.0,75,19.0,2.0,3.0,...,65,6,1,4,8.0,5.0,6,0,20,8
3,Classic Espresso Drinks,Caffè Latte,12,Nonfat,100,5.0,75,19.0,0.0,0.0,...,120,15,0,14,28.0,10.0,15,0,30,0
4,Classic Espresso Drinks,Caffè Latte,12,2%,150,8.0,75,19.0,6.0,8.0,...,135,15,0,14,28.0,10.0,15,0,30,0


In [3]:
# Equalize names by removing descriptors in parentheses
sb_prep['name'] = sb_prep['name'].str.replace(r"\(.*\)","")

  sb_prep['name'] = sb_prep['name'].str.replace(r"\(.*\)","")


In [4]:
# Remove fancy accent marks
sb_prep['name'] = sb_prep['name'].str.replace('è', 'e')

In [5]:
# Remove trademark
sb_prep['name'] = sb_prep['name'].str.replace('®', '')
sb_prep.name.value_counts()

Caffe Latte                      12
Tazo Chai Tea Latte              12
Caffe Mocha                      12
Iced Brewed Coffee               12
Tazo Full-Leaf Red Tea Latte     12
Tazo Green Tea Latte             12
Coffee                           12
Hot Chocolate                    12
Caramel Macchiato                12
Cappuccino                       12
White Chocolate Mocha            12
Vanilla Latte                    12
Mocha                             9
Strawberries & Creme              9
Caramel                           9
Java Chip                         9
Vanilla Bean                      4
Brewed Coffee                     4
Caffe Americano                   4
Skinny Latte                      4
Caramel Apple Spice               4
Strawberry Banana Smoothie        3
Orange Mango Banana Smoothie      3
Banana Chocolate Smoothie         3
Mocha                             3
Caramel                           3
Java Chip                         3
Shaken Iced Tazo Tea        

In [6]:
# Remove whitespaces
sb_prep['name'] = sb_prep['name'].str.strip() 

In [7]:
# Now, to actually DE-normalize some data so that the name column can become descriptive and unique for each record
# First cast size_oz column as a string
sb_prep['size_oz'] = sb_prep['size_oz'].apply(str)
sb_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   category      219 non-null    object 
 1   name          219 non-null    object 
 2   size_oz       219 non-null    object 
 3   milk_type     203 non-null    object 
 4   calories      219 non-null    int64  
 5   calories_%dv  219 non-null    float64
 6   caffeine_mg   219 non-null    int64  
 7   caffeine_%dv  219 non-null    float64
 8   total_fat_g   219 non-null    float64
 9   fat_%dv       219 non-null    float64
 10  trans_fat_g   219 non-null    float64
 11  sat_fat_g     219 non-null    float64
 12  sodium_mg     219 non-null    int64  
 13  carbs_g       219 non-null    int64  
 14  chol_mg       219 non-null    int64  
 15  fiber_g       219 non-null    int64  
 16  sugars_g      219 non-null    int64  
 17  sugars_%dv    219 non-null    float64
 18  protein_g     219 non-null    

In [8]:
# Next create a label for any drinks with 'NaN' as milk type
sb_prep["milk_type"] = sb_prep["milk_type"].fillna("No_milk") 
sb_prep["milk_type"].value_counts(dropna = False)

Nonfat     79
Soy        62
2%         46
Whole      16
No_milk    16
Name: milk_type, dtype: int64

In [9]:
# And then create unique names for each drink
sb_prep['prep_name'] = sb_prep['name'] + '_' + sb_prep['size_oz'] + '_' + sb_prep['milk_type']

In [10]:
sb_prep.prep_name.value_counts

<bound method IndexOpsMixin.value_counts of 0                Caffe Latte_8_Nonfat
1                    Caffe Latte_8_2%
2                   Caffe Latte_8_Soy
3               Caffe Latte_12_Nonfat
4                   Caffe Latte_12_2%
                    ...              
214    Caramel Apple Spice_16_No_milk
215    Caramel Apple Spice_20_No_milk
216     Iced Brewed Coffee_12_No_milk
217     Iced Brewed Coffee_16_No_milk
218     Iced Brewed Coffee_20_No_milk
Name: prep_name, Length: 219, dtype: object>

In [13]:
# Finally, create a smaller dataframe containing only columns of future interest
sb_prep = sb_prep[["prep_name", "calories_%dv", "caffeine_%dv", "fat_%dv", "sugars_%dv"]]
sb_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   prep_name     219 non-null    object 
 1   calories_%dv  219 non-null    float64
 2   caffeine_%dv  219 non-null    float64
 3   fat_%dv       219 non-null    float64
 4   sugars_%dv    219 non-null    float64
dtypes: float64(4), object(1)
memory usage: 8.7+ KB


In [12]:
# Export this beauty
sb_prep.to_csv('sb_prep.csv', index = False)