In [70]:
import pandas as pd
from fractions import Fraction

csv_path = "Data/mr_boston_flattened.csv"

## Read CSV file

In [71]:
#read csv
cocktail_df = pd.read_csv(csv_path, encoding="utf-8")

In [72]:
# Create list of measurement columns to loop through
ingredient_cols = ["ingredient-1","ingredient-2","ingredient-3","ingredient-4","ingredient-5","ingredient-6"]
measurement_cols = ["measurement-1", "measurement-2", "measurement-3", "measurement-4", "measurement-5", "measurement-6"]

# Loop through columns and strip units
for col in measurement_cols:
        cocktail_df[col]= cocktail_df[col].str.strip(" oz")
    
cocktail_df.head()

Unnamed: 0,name,category,measurement-1,ingredient-1,measurement-2,ingredient-2,measurement-3,ingredient-3,measurement-4,ingredient-4,measurement-5,ingredient-5,measurement-6,ingredient-6,instructions,glass,glass-size
0,Gauguin,Cocktail Classics,2,Light Rum,1,Passion Fruit Syrup,1,Lemon Juice,1,Lime Juice,,,,,Combine ingredients with a cup of crushed ice ...,Old-Fashioned Glass,6 to 8 ounces
1,Fort Lauderdale,Cocktail Classics,1 1/2,Light Rum,1/2,Sweet Vermouth,1/4,Juice of Orange,1/4,Juice of a Lime,,,,,Shake with ice and strain into old-fashioned g...,Old-Fashioned Glass,6 to 8 ounces
2,Apple Pie,Cordials and Liqueurs,3,Apple schnapps,1,Cinnamon schnapps,,Apple slice,,,,,,,Pour into ice-filled old-fashioned glass. Garn...,Old-Fashioned Glass,6 to 8 ounces
3,Cuban Cocktail No. 1,Cocktail Classics,1/2,Juice of a Lime,1/2,Powdered Sugar,2,Light Rum,,,,,,,Shake with ice and strain into cocktail glass.,Cocktail Glass,6 or more ounces
4,Cool Carlos,Cocktail Classics,1 1/2,Dark rum,2,Cranberry Juice,2,Pineapple Juice,1,Orange curacao,1.0,Sour Mix,,,"Mix all ingredients except curacao with ice, s...",Collins Glass,14 to 16 ounces


## Define function ## 
to convert string values to number and clean measurement

In [73]:
#function for converting fractions and data cleanup
def convert_to_float(frac_str):
    try:
        return float(frac_str)
    except ValueError:
        frac_str = frac_str.lower()
        if "or" in frac_str:
            nums = frac_str.split(' or ')
            return nums[0]
        elif "  " in frac_str:
            frac_str = frac_str.replace("  ", " ")
            print(frac_str)
            value = convert_to_float(frac_str)
            return value
        elif "tsp" in frac_str:
            frac_str = frac_str.strip("tsp")
            print(frac_str)
            value = convert_to_float(frac_str)
            return value * 0.166667
        elif "750-ml" in frac_str:
            frac_str = frac_str.strip("750-ml")
            print(frac_str)
            value = convert_to_float(frac_str)
            return value * 25.3
        elif "750ml" in frac_str:
            frac_str = frac_str.strip("750ml")
            print(frac_str)
            return 25.3
        elif "bottles" in frac_str:
            frac_str = frac_str.strip("bottles")
            print(frac_str)
            value = convert_to_float(frac_str)
            return value * 25.3
        elif "for glass" in frac_str:
            nums = frac_str.split('for glass')
            return 1
        elif "/" in frac_str:
            num, denom = frac_str.split('/')
            try:
                leading, num = num.split(' ')
                whole = float(leading)
            except ValueError:
                whole = 0
            frac = float(num) / float(denom)
            return whole - frac if whole < 0 else whole + frac    
        else:
            print("from else")
            print(frac_str)
            return frac_str

## Combine all Ingredient and Measure Columns

In [74]:
#create Dataframes for each ingredient subset
ing1_df = cocktail_df[["name", "measurement-1", "ingredient-1"]]
ing1 = ing1_df.rename(columns= {"measurement-1": "measurement", "ingredient-1":"ingredient"})
ing2_df = cocktail_df[["name", "measurement-2", "ingredient-2"]]
ing2 = ing2_df.rename(columns= {"measurement-2": "measurement", "ingredient-2":"ingredient"})
ing3_df = cocktail_df[["name", "measurement-3", "ingredient-3"]]
ing3 = ing3_df.rename(columns= {"measurement-3": "measurement", "ingredient-3":"ingredient"})
ing4_df = cocktail_df[["name", "measurement-4", "ingredient-4"]]
ing4 = ing4_df.rename(columns= {"measurement-4": "measurement", "ingredient-4":"ingredient"})
ing5_df = cocktail_df[["name", "measurement-5", "ingredient-5"]]
ing5 = ing5_df.rename(columns= {"measurement-5": "measurement", "ingredient-5":"ingredient"})
ing6_df = cocktail_df[["name", "measurement-6", "ingredient-6"]]
ing6 = ing6_df.rename(columns= {"measurement-6": "measurement", "ingredient-6":"ingredient"})

In [75]:
#combine data frames and sort
frames = [ing1, ing2, ing3, ing4, ing5, ing6]

ingredient_df = pd.concat(frames)
ingredient_df.sort_values('name')

Unnamed: 0,name,measurement,ingredient
432,1626,1,Italian preserved cherry
432,1626,1/2,cherry-flavored brandy
432,1626,3/4,Gingerbread liqueur
432,1626,2,Angostura Bitters
432,1626,,
...,...,...,...
576,Zombie,1,"each Fresh Lime Juice, lemon juice, pineapple..."
576,Zombie,2,Demerara Syrup
576,Zombie,1,Light Rum
576,Zombie,1,151-Proof Rum


In [76]:
final_ingredient_df = ingredient_df.dropna(subset =['ingredient'])
final_ingredient_df

Unnamed: 0,name,measurement,ingredient
0,Gauguin,2,Light Rum
1,Fort Lauderdale,1 1/2,Light Rum
2,Apple Pie,3,Apple schnapps
3,Cuban Cocktail No. 1,1/2,Juice of a Lime
4,Cool Carlos,1 1/2,Dark rum
...,...,...,...
957,Bloody Maria,,Lemon wheel
962,Amante Picante,2,"Jalapeno hot pepper sauce, cucumber slice"
966,Betsy Ross Cocktail,1 1/4,Old Mr. Boston Dry Gin
974,The Winkle,4,"Raspberries, lemon twist"


## Run function and reindex

In [77]:
#apply function to convert fractions
final_ingredient_df['measurement'] = final_ingredient_df['measurement'].apply(convert_to_float)

1/2 
1 
2 
2 
from else
3 slices
from else
6 fresh
from else
splash
from else
3 slices
1 3/4
from else
1 splash
from else
3 slices
from else

from else
2 dashes
from else
1c
1 
2 - 3 drops
from else
2 - 3 drops
from else
1 dash


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ingredient_df['measurement'] = final_ingredient_df['measurement'].apply(convert_to_float)


In [78]:
#reindex
final_ingredient_df.reset_index(inplace = True)
final_ingredient_df

Unnamed: 0,index,name,measurement,ingredient
0,0,Gauguin,2,Light Rum
1,1,Fort Lauderdale,1.5,Light Rum
2,2,Apple Pie,3,Apple schnapps
3,3,Cuban Cocktail No. 1,0.5,Juice of a Lime
4,4,Cool Carlos,1.5,Dark rum
...,...,...,...,...
3929,957,Bloody Maria,,Lemon wheel
3930,962,Amante Picante,2,"Jalapeno hot pepper sauce, cucumber slice"
3931,966,Betsy Ross Cocktail,1.25,Old Mr. Boston Dry Gin
3932,974,The Winkle,4,"Raspberries, lemon twist"


In [82]:
for row, index in final_ingredient_df.iterrows():
    if isinstance(final_ingredient_df['measurement'][row], str):
        final_ingredient_df['unit'] = final_ingredient_df['measurement']
#         print('Type of variable is string')
    else:
        final_ingredient_df['unit'] = ('Fl Oz')
#         print('Type is variable is not string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ingredient_df['unit'] = ('Fl Oz')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ingredient_df['unit'] = final_ingredient_df['measurement']


In [84]:
final_ingredient_df.ingredient.unique()

array([' Light Rum', ' Apple schnapps', ' Juice of a Lime', ' Dark rum',
       ' Bourbon whiskey', ' Amaretto', ' Scotch Whiskey', ' Bacardi Rum',
       ' Brandy', ' Gin', ' Sloe gin', ' Sweet Vermouth',
       ' Tanqueray gin', ' Lemon Juice', ' Straight rye whiskey',
       ' Grenadine', ' Green Chartreuse', ' Irish Whiskey',
       ' Dry Vermouth', ' Juice of a Lemon', ' Orange', ' Absinthe',
       ' Rye or bourbon whiskey', ' Light Vermouth', ' Anisette',
       ' Kummel', ' Bourbon or Rye Whiskey', ' Tennessee whiskey',
       ' Vodka', ' Grapefruit Juice', ' Fresh rosemary sprig',
       ' Powdered Sugar', ' Wide spiral of lemon zest',
       ' lemon-flavored vodka', ' Lime Juice', ' Blended Scotch Whiskey',
       ' Apricot Flavored Brandy', ' Bitters', ' Juice of Orange',
       ' Orange Bitters', 'Blended Scotch Whiskey', ' White whiskey',
       ' Dry gin', ' Forbidden Fruit', ' Applejack', ' Port',
       ' Apple Brandy', ' Maraschino', ' Coffee-flavored brandy',
       '