In [3]:
import pandas as pd
import re

In [18]:
def remove_percents(df, col):
    df[col] = df[col].str.replace("%", '')
    return df

def fill_zero_iron(df):
    print (type(df['Iron (% DV)'][0]))
    df['Iron (% DV)'] = df['Iron (% DV)'].fillna(0)
    return df
    
def fix_caffeine(df):
    df = df[df['Caffeine (mg)'] != 'Varies']
    df = df[df['Caffeine (mg)'] != 'NULL']
    return df

def standardize_names(df):
    x = list(df.columns)
    newName = []
    for col_name in x:
        new = re.sub(r" ?\([^)]+\)", "", col_name)
        new = new.lower()
        newName.append(new)
    df.columns = newName
    return df

def fix_strings(df, col):
    df[col] = df[col].str.lower()
    df[col] = df[col].str.replace('[^a-zA-Z0-9]', '')
    return df

In [19]:
def main():
    
    # first, read in the raw data
    df = pd.read_csv('../data/starbucks.csv')
    
    # the columns below represent percent daily value and are stored as strings with a percent sign, e.g. '0%'
    # complete the remove_percents function to remove the percent symbol and convert the columns to a numeric type
    pct_DV = ['Vitamin A (% DV)', 'Vitamin C (% DV)', 'Calcium (% DV)', 'Iron (% DV)']
    for col in pct_DV:
        df = remove_percents(df, col)
    
    # the column 'Iron (% DV)' has missing values when the drink has no iron
    # complete the fill_zero_iron function to fix this
    df = fill_zero_iron(df)

    # the column 'Caffeine (mg)' has some missing values and some 'varies' values
    # complete the fix_caffeine function to deal with these values
    # note: you may choose to fill in the values with the mean/median, or drop those values, etc.
    df = fix_caffeine(df)
    
    # the columns below are string columns... starbucks being starbucks there are some fancy characters and symbols in their names
    # complete the fix_strings function to convert these strings to lowercase and remove non-alphabet characters
    names = ['Beverage_category', 'Beverage']
    for col in names:
        df = fix_strings(df, col)
    
    # the column names in this data are clear but inconsistent
    # complete the standardize_names function to convert all column names to lower case and remove the units (in parentheses)
    df = standardize_names(df)
    
    print(df.head())
    
    # now that the data is all clean, save your output to the `data` folder as 'starbucks_clean.csv'
    # you will use this file in checkpoint 2
    df.to_csv('../data/starbucks_clean.csv', index=False)
    

if __name__ == "__main__":
    main()

<class 'float'>
       beverage_category      beverage      beverage_prep  calories  \
0                 coffee  brewedcoffee              Short         3   
1                 coffee  brewedcoffee               Tall         4   
2                 coffee  brewedcoffee             Grande         5   
3                 coffee  brewedcoffee              Venti         5   
4  classicespressodrinks     cafflatte  Short Nonfat Milk        70   

   total fat  trans fat  saturated fat   sodium  total carbohydrates  \
0        0.1        0.0            0.0        0                    5   
1        0.1        0.0            0.0        0                   10   
2        0.1        0.0            0.0        0                   10   
3        0.1        0.0            0.0        0                   10   
4        0.1        0.1            0.0        5                   75   

   cholesterol  dietary fibre  sugars  protein vitamin a vitamin c calcium  \
0            0              0       0      0.3

  df[col] = df[col].str.replace('[^a-zA-Z0-9]', '')
