# Nutrition Analysis Tour
## Action: Prepare .csv file for correlation analysis
## Language/tool: Python (Pandas)
---

In [1]:
import pandas as pd

In [2]:
# Load the cleaned data set and preview its first five rows
sb_prep2 = pd.read_csv(filepath_or_buffer="sb_clean.csv")
sb_prep2.head(n=5)

Unnamed: 0,category,name,size_oz,milk_type,calories,calories_%dv,caffeine_mg,caffeine_%dv,total_fat_g,fat_%dv,...,carbs_g,chol_mg,fiber_g,sugars_g,sugars_%dv,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv
0,Classic Espresso Drinks,Caffè Latte,8,Nonfat,70,4.0,75,19.0,0.0,0.0,...,75,10,0,9,18.0,6.0,10,0,20,0
1,Classic Espresso Drinks,Caffè Latte,8,2%,100,5.0,75,19.0,4.0,4.0,...,85,10,0,9,18.0,6.0,10,0,20,0
2,Classic Espresso Drinks,Caffè Latte,8,Soy,70,4.0,75,19.0,2.0,3.0,...,65,6,1,4,8.0,5.0,6,0,20,8
3,Classic Espresso Drinks,Caffè Latte,12,Nonfat,100,5.0,75,19.0,0.0,0.0,...,120,15,0,14,28.0,10.0,15,0,30,0
4,Classic Espresso Drinks,Caffè Latte,12,2%,150,8.0,75,19.0,6.0,8.0,...,135,15,0,14,28.0,10.0,15,0,30,0


In [3]:
# Align name format by removing descriptors in parentheses.
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default (so nothing would be removed), and pandas 1.x emits
# a FutureWarning when the flag is omitted for a pattern like this one.
sb_prep2['name'] = sb_prep2['name'].str.replace(r"\(.*\)", "", regex=True)

  sb_prep2['name'] = sb_prep2['name'].str.replace(r"\(.*\)","")


In [4]:
# Swap the accented "è" for a plain "e" (literal, non-regex substitution)
sb_prep2["name"] = sb_prep2["name"].str.replace("è", "e", regex=False)

In [5]:
# Drop the registered-trademark symbol, then tally how often each name occurs
sb_prep2["name"] = sb_prep2["name"].str.replace("®", "", regex=False)
sb_prep2["name"].value_counts()

Caffe Latte                      12
Tazo Chai Tea Latte              12
Caffe Mocha                      12
Iced Brewed Coffee               12
Tazo Full-Leaf Red Tea Latte     12
Tazo Green Tea Latte             12
Coffee                           12
Hot Chocolate                    12
Caramel Macchiato                12
Cappuccino                       12
White Chocolate Mocha            12
Vanilla Latte                    12
Mocha                             9
Strawberries & Creme              9
Caramel                           9
Java Chip                         9
Vanilla Bean                      4
Brewed Coffee                     4
Caffe Americano                   4
Skinny Latte                      4
Caramel Apple Spice               4
Strawberry Banana Smoothie        3
Orange Mango Banana Smoothie      3
Banana Chocolate Smoothie         3
Mocha                             3
Caramel                           3
Java Chip                         3
Shaken Iced Tazo Tea        

In [6]:
# Trim leading and trailing whitespace from every drink name
sb_prep2 = sb_prep2.assign(name=sb_prep2["name"].str.strip())

In [7]:
# Count rows per milk type, keeping missing values as their own group
sb_prep2["milk_type"].value_counts(dropna=False)

Nonfat    79
Soy       62
2%        46
Whole     16
NaN       16
Name: milk_type, dtype: int64

In [8]:
# Keep only representative drinks: 16 oz servings made with Nonfat milk or with no milk at all
no_milk = sb_prep2["milk_type"].isna()
nonfat  = sb_prep2["milk_type"] == "Nonfat"
grande  = sb_prep2["size_oz"] == 16
# .copy() makes the filtered result an independent frame, so assigning columns
# to it afterwards does not raise SettingWithCopyWarning.
sb_prep2 = sb_prep2[(no_milk | nonfat) & grande].copy()

In [9]:
# Confirm which milk types (including missing) remain after the filter
sb_prep2["milk_type"].value_counts(dropna=False)

Nonfat    25
NaN        4
Name: milk_type, dtype: int64

In [10]:
# Confirm which drink sizes remain after the filter
sb_prep2["size_oz"].value_counts(dropna=False)

16    29
Name: size_oz, dtype: int64

In [11]:
# Add a column containing the character count of each drink name.
# .str.len() computes all lengths in one vectorized call, and .assign() returns
# a new frame instead of writing into the filtered frame, which avoids
# SettingWithCopyWarning.
sb_prep2 = sb_prep2.assign(name_length=sb_prep2["name"].str.len())
sb_prep2.head()

Unnamed: 0,category,name,size_oz,milk_type,calories,calories_%dv,caffeine_mg,caffeine_%dv,total_fat_g,fat_%dv,...,chol_mg,fiber_g,sugars_g,sugars_%dv,protein_g,vit_a_%dv,vit_c_%dv,calcium_%dv,iron_%dv,name_length
6,Classic Espresso Drinks,Caffe Latte,16,Nonfat,130,6.0,150,38.0,0.0,0.0,...,19,0,18,36.0,13.0,20,0,40,0,11
18,Classic Espresso Drinks,Caffe Mocha,16,Nonfat,220,11.0,175,44.0,2.0,3.0,...,43,2,34,68.0,13.0,20,0,35,25,11
30,Classic Espresso Drinks,Vanilla Latte,16,Nonfat,200,10.0,150,38.0,0.0,0.0,...,37,0,35,70.0,12.0,20,0,35,0,13
42,Classic Espresso Drinks,Cappuccino,16,Nonfat,80,4.0,150,38.0,0.0,0.0,...,12,0,10,20.0,8.0,15,0,25,0,10
54,Signature Espresso Drinks,Caramel Macchiato,16,Nonfat,190,10.0,150,38.0,1.0,1.0,...,35,0,32,64.0,11.0,20,0,35,0,17


In [12]:
# Narrow the frame down to the columns needed for the upcoming analysis
columns_of_interest = [
    "name",
    "name_length",
    "calories_%dv",
    "caffeine_%dv",
    "fat_%dv",
    "sugars_%dv",
]
sb_prep2 = sb_prep2.loc[:, columns_of_interest]
sb_prep2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 6 to 217
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          29 non-null     object 
 1   name_length   29 non-null     int64  
 2   calories_%dv  29 non-null     float64
 3   caffeine_%dv  29 non-null     float64
 4   fat_%dv       29 non-null     float64
 5   sugars_%dv    29 non-null     float64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.6+ KB


In [13]:
# Give the percent-daily-value columns "%"-free names for use in R
mapper = {
    "calories_%dv": "cal_perc_dv",
    "caffeine_%dv": "caff_perc_dv",
    "fat_%dv": "fat_perc_dv",
    "sugars_%dv": "sug_perc_dv",
}
sb_prep2 = sb_prep2.rename(mapper, axis="columns")
sb_prep2.head()

Unnamed: 0,name,name_length,cal_perc_dv,caff_perc_dv,fat_perc_dv,sug_perc_dv
6,Caffe Latte,11,6.0,38.0,0.0,36.0
18,Caffe Mocha,11,11.0,44.0,3.0,68.0
30,Vanilla Latte,13,10.0,38.0,0.0,70.0
42,Cappuccino,10,4.0,38.0,0.0,20.0
54,Caramel Macchiato,17,10.0,38.0,1.0,64.0


In [14]:
# Count missing values per column (every count should be zero)
sb_prep2.isna().sum()

name            0
name_length     0
cal_perc_dv     0
caff_perc_dv    0
fat_perc_dv     0
sug_perc_dv     0
dtype: int64

In [15]:
# Export the prepared frame, without the index column.
# A forward slash is used as the separator because it works on Windows, macOS
# and Linux; a backslash is a path separator only on Windows and would become
# part of the file name elsewhere.
sb_prep2.to_csv('nutrition_analysis_r/sb_prep2.csv', index = False)