![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import glob
import os
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime

In [2]:
# RUN ONLY ONCE
# os.chdir is used to change the current directory to the specified path
os.chdir("../") # Sets path to the repo folder as it is one level above where this file exists!
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


****

## Load Data Files

### Set Data File Path

In [3]:
filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "AMS_Gallery_2024_25","PrepRecipeDetail_Gallery_New_OC_2024.csv"))
filepath_list.append(glob.glob(os.path.join(os.getcwd(), "data", "archive", "AMS_Gallery_Data","Gallery_all_years_test_new.csv"))[0])
filepath_list

['/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/raw/AMS_Gallery_2024_25/PrepRecipeDetail_Gallery_New_OC_2024.csv',
 '/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/archive/AMS_Gallery_Data/Gallery_all_years_test_new.csv']

In [4]:
Items = pd.read_csv(filepath_list[0], skiprows = 1, header=None)
Items.columns = ["Empty","category_1","category_2", "item_descrip","batch_yield", "total_cost", "Total Cost", "Shelf_Life", "Shelf Lifr", "recipe_uom","recipe_cost", "recipe_uom_2","recipe_cost_2","line_qty","ingre_cost","item_descrip.1","uom"]
Items = Items.drop(columns=["Total Cost","Shelf_Life","Shelf Lifr", "category_1", "Empty","recipe_uom","recipe_cost","recipe_uom_2","recipe_cost_2"])
Items

Unnamed: 0,category_2,item_descrip,batch_yield,total_cost,line_qty,ingre_cost,item_descrip.1,uom
0,Beverages - Juice,Raspberry Puree,Batch Yield: 2.000 L,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg
1,Beverages - Juice,Raspberry Puree,Batch Yield: 2.000 L,$14.0160,1.50,$0.0000,Water - Tap,L
2,Beverages - Juice,Strawberry Puree,Batch Yield: 2.000 L,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg
3,Beverages - Juice,Strawberry Puree,Batch Yield: 2.000 L,$5.5242,1.50,$0.0000,Water - Tap,L
4,Kitchen Supplies,To Go Cutlery 2023,Batch Yield: 1.000 ea,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea
...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml
1222,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml
1223,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L
1224,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,375.00,$9.6546,Malibu 1.14L,ml


In [5]:
old_items = pd.read_csv(filepath_list[1])
old_items = old_items.drop(columns=['item_num.2', 'item_num.1'])
old_items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.559100,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.553600,8228,1.00,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.920800,15803,1.00,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.466700,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.189270,8667,2.00,,ITEM GARLIC MAYO,N,fl oz
...,...,...,...,...,...,...,...,...,...,...,...,...
2153,18049,Yellow Curry Prep Gall.2023,1.0,PORT,g,0.005200,2262,1.00,,SESAME SEEDS,Y,g
2154,18049,Yellow Curry Prep Gall.2023,1.0,PORT,oz,0.003968,14434,80.00,,Potato yellow Med 5lb bag,Y,g
2155,18049,Yellow Curry Prep Gall.2023,1.0,PORT,PORT,0.200000,15477,1.00,,2022 Jasmin rice,Y,PORT
2156,18049,Yellow Curry Prep Gall.2023,1.0,PORT,g,0.005300,15637,20.00,,Whole Green Beans IQF,Y,g


In [6]:
#Ensure 'line_item_num' column exists in Items
Items['line_item_num'] = 0

# Create a dictionary for quick lookup from old_items
old_items_dict = old_items.set_index('item_descrip.1')['line_item_num'].to_dict()
old_ids = list(set(old_items["line_item_num"].unique()))
ID = old_ids[0]

# Iterate through Items and update line_item_num
for index, row in Items.iterrows():
    description = row['item_descrip.1']
    if description in old_items_dict:
        Items.at[index, 'line_item_num'] = old_items_dict[description]
    else:
        while ID in old_ids:
            ID += 1
        old_items_dict[description] = ID
        old_ids.append(ID)
        Items.at[index, "line_item_num"] = ID

# Output the updated DataFrame
display(Items)

Unnamed: 0,category_2,item_descrip,batch_yield,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num
0,Beverages - Juice,Raspberry Puree,Batch Yield: 2.000 L,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168
1,Beverages - Juice,Raspberry Puree,Batch Yield: 2.000 L,$14.0160,1.50,$0.0000,Water - Tap,L,2640
2,Beverages - Juice,Strawberry Puree,Batch Yield: 2.000 L,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099
3,Beverages - Juice,Strawberry Puree,Batch Yield: 2.000 L,$5.5242,1.50,$0.0000,Water - Tap,L,2640
4,Kitchen Supplies,To Go Cutlery 2023,Batch Yield: 1.000 ea,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100
...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286
1222,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109
1223,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946
1224,Alcohol - White Wine,Sangria White Tab 2023,Batch Yield: 16.000 L,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293


In [7]:
def extract_yield_and_uom(batch_yield):
    match = re.search(r'Batch\s*Yield:\s*([\d.]+)\s*(\w+)', batch_yield)
    if match:
        physical_yield = float(match.group(1))
        uom = match.group(2)
        return physical_yield, uom
    else:
        print(f"Error parsing batch_yield '{batch_yield}': pattern not found")
        return None, None

# Apply the function and create new columns
Items['pak_physical_yield'], Items['pak_uom'] = zip(*Items['batch_yield'].apply(extract_yield_and_uom))
Items = Items.drop(columns=["batch_yield"])
Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea
...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L


In [8]:
#Ensure 'line_item_num' column exists in Items
Items['item_num'] = 0

# Create a dictionary for quick lookup from old_items
old_items_dict = {}
old_ids = [1734]
ID = old_ids[0]

# Iterate through Items and update line_item_num
for index, row in Items.iterrows():
    description = row['item_descrip']
    if description in old_items_dict:
        Items.at[index, 'item_num'] = old_items_dict[description]
    else:
        while ID in old_ids:
            ID += 1
        old_items_dict[description] = ID
        old_ids.append(ID)
        Items.at[index, "item_num"] = ID

# Output the updated DataFrame
display(Items)
    

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L,1735
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1735
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L,1736
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1736
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea,1737
...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L,1929
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L,1929
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L,1929
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L,1929


In [9]:
Items["inv_flag"] = "Y"

### Import Items List

In [10]:
Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num,inv_flag
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L,1735,Y
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1735,Y
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L,1736,Y
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1736,Y
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea,1737,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L,1929,Y
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L,1929,Y
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L,1929,Y
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L,1929,Y


The function below is used to identify and mark any preparations in the items table to make the process identical to the one for UBC Food Services.

In [11]:
"""
INPUT: A Pandas Series
OUTPUT: A Pandas Series
Description: Given a DataFrame 'Items' with columns 'item_descrip.1', 'item_descrip', 'item_num', and 'line_item_num':
                - Select rows where 'item_descrip' or 'item_descrip.1' contains 'prep' (case insensitive).
                - Prepend 'P' to 'item_num' if 'item_descrip' contains 'prep'.
                - Prepend 'P' to 'line_item_num' if 'item_descrip.1' contains 'prep'.
"""
def prepend_p(row):
    if pd.notna(row['item_descrip']) and 'prep' in row['item_descrip'].lower():
        row['item_num'] = 'P-' + str(row['item_num'])
    if pd.notna(row['item_descrip.1']) and 'prep' in row['item_descrip.1'].lower():
        row['line_item_num'] = 'P-' + str(row['line_item_num'])
    return row

In [12]:
# Apply the function across the rows
Items = Items.apply(prepend_p, axis=1)

Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num,inv_flag
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L,1735,Y
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1735,Y
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L,1736,Y
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1736,Y
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea,1737,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L,1929,Y
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L,1929,Y
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L,1929,Y
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L,1929,Y


In [13]:
Items.dtypes

category_2             object
item_descrip           object
total_cost             object
line_qty               object
ingre_cost             object
item_descrip.1         object
uom                    object
line_item_num          object
pak_physical_yield    float64
pak_uom                object
item_num               object
inv_flag               object
dtype: object

In [14]:
# Converting 'item_num' column to string
Items['item_num'] = Items['item_num'].astype(str)

# Converting 'line_item_num' column to string
Items['line_item_num'] = Items['line_item_num'].astype(str)

In [15]:
# Checking the type of the first element in 'item_num' and 'line_item_num'
print(type(Items['item_num'].iloc[0]))
print(type(Items['line_item_num'].iloc[0]))

<class 'str'>
<class 'str'>


### Extracting all Preps

In [16]:
# Filter the DataFrame to include only rows where 'item_num' starts with 'P-'
Preps = Items[Items['item_num'].str.startswith('P-')]

# Select specific columns from the filtered DataFrame
Preps = Preps[['item_num', 'item_descrip', 'pak_physical_yield', 'pak_uom', 'inv_flag']]
Preps.rename(columns={'item_num': 'PrepId', 'item_descrip': 'Description', 'pak_physical_yield': 'PakQty', 'pak_uom': 'PakUOM', 'inv_flag': 'InventoryGroup'}, inplace=True)
Preps.drop_duplicates(subset=["PrepId"], inplace=True)
Preps.reset_index(drop=True, inplace=True)
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-1753,2023 Basmati Prep,2.0,Kg,Y
1,P-1754,2023 Beef Gravy (prep),4.5,L,Y
2,P-1755,2023 Beets Salad Prep,1.0,ea,Y
3,P-1756,2023 Blackened Carbonara Prep,1.0,ea,Y
4,P-1757,2023 Butter Chicken Prep,1.0,PORT,Y
...,...,...,...,...,...
76,P-1881,2023 Egplant prep,500.0,g,Y
77,P-1883,2023 Hummus Prep,1408.0,g,Y
78,P-1888,Beets Prep 2023,1.2,Kg,Y
79,P-1889,Candy Cane beets Prep 2023,1.0,LBS,Y


In [17]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preps.to_csv(path, index = False, header = True)

### Extracting all Items

In [18]:
Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num,inv_flag
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L,1735,Y
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1735,Y
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L,1736,Y
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1736,Y
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea,1737,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L,1929,Y
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L,1929,Y
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L,1929,Y
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L,1929,Y


In [21]:
items = Items[~Items['line_item_num'].str.startswith('P-')]
items = items[['line_item_num', 'item_descrip.1', 'line_qty', 'uom', 'pak_physical_yield', 'pak_uom', 'inv_flag']]

items.rename(columns={'line_item_num': 'ItemId', 'item_descrip.1': 'Description', 'line_qty': 'CaseQty', 'uom': 'CaseUOM',
                     'pak_physical_yield': 'PakQty', 'pak_uom': 'PakUOM', 'inv_flag': 'InventoryGroup'}, inplace=True)

items.drop_duplicates(subset=["ItemId"], inplace=True)
items.reset_index(drop=True, inplace=True)
items['ItemId'] = 'I-' + items['ItemId'].astype(str)
items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-7168,Raspberries Frozen IQF,1.00,Kg,2.0,L,Y
1,I-2640,Water - Tap,1.50,L,2.0,L,Y
2,I-4099,Strawberries IQF FRZ,1.00,Kg,2.0,L,Y
3,I-4100,CUP PORT PAPER WHT 2Z,1.00,ea,1.0,ea,Y
4,I-4101,LID Foil Board 6x8.5 SO,1.00,ea,1.0,ea,Y
...,...,...,...,...,...,...,...
344,I-12929,BIB- Sprite 10L,3.00,L,16.0,L,Y
345,I-1109,Meagher's Triple Sec,375.00,ml,16.0,L,Y
346,I-4293,Malibu 1.14L,375.00,ml,16.0,L,Y
347,I-4294,Abstract Apricot Brandy 750mL,375.00,ml,16.0,L,Y


In [22]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
items.to_csv(path, index = False, header = True)

### Extracting all Ingredients

In [23]:
Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num,inv_flag
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,7168,2.0,L,1735,Y
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1735,Y
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,4099,2.0,L,1736,Y
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,2640,2.0,L,1736,Y
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,4100,1.0,ea,1737,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,4286,16.0,L,1929,Y
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,1109,16.0,L,1929,Y
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,17946,16.0,L,1929,Y
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,4293,16.0,L,1929,Y


In [24]:
mask = ~Items['line_item_num'].astype(str).str.startswith('P-')
# Prepend 'I-' to 'ItemId' for rows matching the mask
Items.loc[mask, 'line_item_num'] = 'I-' + Items.loc[mask, 'line_item_num'].astype(str)

mask = ~Items['item_num'].astype(str).str.startswith('P-')
Items.loc[mask, 'item_num'] = 'R-' + Items.loc[mask, 'item_num'].astype(str)
Items

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,line_item_num,pak_physical_yield,pak_uom,item_num,inv_flag
0,Beverages - Juice,Raspberry Puree,$14.0160,1.00,$14.0160,Raspberries Frozen IQF,Kg,I-7168,2.0,L,R-1735,Y
1,Beverages - Juice,Raspberry Puree,$14.0160,1.50,$0.0000,Water - Tap,L,I-2640,2.0,L,R-1735,Y
2,Beverages - Juice,Strawberry Puree,$5.5242,1.00,$5.5242,Strawberries IQF FRZ,Kg,I-4099,2.0,L,R-1736,Y
3,Beverages - Juice,Strawberry Puree,$5.5242,1.50,$0.0000,Water - Tap,L,I-2640,2.0,L,R-1736,Y
4,Kitchen Supplies,To Go Cutlery 2023,$0.6308,1.00,$0.0214,CUP PORT PAPER WHT 2Z,ea,I-4100,1.0,ea,R-1737,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
1221,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$4.7647,Bacardi Light Rum 750mL,ml,I-4286,16.0,L,R-1929,Y
1222,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,200.00,$5.0881,Meagher's Triple Sec,ml,I-1109,16.0,L,R-1929,Y
1223,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,3.00,$6.0833,JUICE orange 100% tetra,L,I-17946,16.0,L,R-1929,Y
1224,Alcohol - White Wine,Sangria White Tab 2023,$109.2348,375.00,$9.6546,Malibu 1.14L,ml,I-4293,16.0,L,R-1929,Y


In [25]:
Ingredients = Items[['line_item_num', 'line_qty', 'uom', 'item_num']]

Ingredients.rename(columns={'line_item_num': 'IngredientId', 'line_qty': 'Qty', 'uom': 'Uom', 'item_num': 'Recipe'}, inplace=True)

Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)

Ingredients.reset_index(drop=True, inplace=True)
Ingredients

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Ingredients.rename(columns={'line_item_num': 'IngredientId', 'line_qty': 'Qty', 'uom': 'Uom', 'item_num': 'Recipe'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)


Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-7168,1.00,Kg,R-1735
1,I-2640,1.50,L,R-1735
2,I-4099,1.00,Kg,R-1736
3,I-2640,1.50,L,R-1736
4,I-4100,1.00,ea,R-1737
...,...,...,...,...
1221,I-4286,200.00,ml,R-1929
1222,I-1109,200.00,ml,R-1929
1223,I-17946,3.00,L,R-1929
1224,I-4293,375.00,ml,R-1929


In [26]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)

### Extracting Products List

In [27]:
Products = Items[~Items['item_num'].str.startswith('P-')]
Products = Products[['item_num', 'item_descrip', 'inv_flag']]

Products.rename(columns={'item_num': 'ProdId', 'item_descrip': 'Description', 'inv_flag': 'SalesGroup'}, inplace=True)
Products.drop_duplicates(inplace=True)
Products.reset_index(drop=True, inplace=True)
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-1735,Raspberry Puree,Y
1,R-1736,Strawberry Puree,Y
2,R-1737,To Go Cutlery 2023,Y
3,R-1738,Catr - Mint Chutney,Y
4,R-1739,Gall 4.0 - Bean Patty,Y
...,...,...,...
109,R-1924,Lime Mojito Single 2023,Y
110,R-1925,Simple Syrup 2023,Y
111,R-1927,Misc Extras,Y
112,R-1928,Sangria Red Tap 2023,Y


In [28]:
# Prep item that is also product
crispy_eggplant = Preps[Preps['PrepId'] == 'P-18316']
crispy_eggplant = crispy_eggplant[['PrepId', 'Description', 'InventoryGroup']]
crispy_eggplant.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
crispy_eggplant

Unnamed: 0,ProdId,Description,SalesGroup


In [24]:
Products = pd.concat([crispy_eggplant, Products], ignore_index=True)

In [25]:
# Prep item that is also product
yam_fries = Preps[Preps['PrepId'] == 'P-18313']
yam_fries = yam_fries[['PrepId', 'Description', 'InventoryGroup']]
yam_fries.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
yam_fries

Unnamed: 0,ProdId,Description,SalesGroup
71,P-18313,Yam Fries prep 2023,N


In [26]:
Products = pd.concat([yam_fries, Products], ignore_index=True)

In [27]:
# Prep item that is also product
beet_salad = Preps[Preps['PrepId'] == 'P-18275']
beet_salad = beet_salad[['PrepId', 'Description', 'InventoryGroup']]
beet_salad.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
beet_salad

Unnamed: 0,ProdId,Description,SalesGroup
21,P-18275,Beets salad prep 2023,N


In [28]:
Products = pd.concat([beet_salad, Products], ignore_index=True)

In [29]:
# Prep item that is also product
korean_fried_chicken = Preps[Preps['PrepId'] == 'P-18266']
korean_fried_chicken = korean_fried_chicken[['PrepId', 'Description', 'InventoryGroup']]
korean_fried_chicken.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
korean_fried_chicken

Unnamed: 0,ProdId,Description,SalesGroup
27,P-18266,KFC Gallery prep 2022,N


In [30]:
Products = pd.concat([korean_fried_chicken, Products], ignore_index=True)

In [31]:
# Prep item that is also product
power_punch_salad = Preps[Preps['PrepId'] == 'P-15006']
power_punch_salad = power_punch_salad[['PrepId', 'Description', 'InventoryGroup']]
power_punch_salad.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
power_punch_salad

Unnamed: 0,ProdId,Description,SalesGroup
5,P-15006,2022 Power Punch Salad prep,N


In [32]:
Products = pd.concat([power_punch_salad, Products], ignore_index=True)

In [33]:
# Prep item that is also product
chicken_caesar_wrap = Preps[Preps['PrepId'] == 'P-14560']
chicken_caesar_wrap = chicken_caesar_wrap[['PrepId', 'Description', 'InventoryGroup']]
chicken_caesar_wrap.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
chicken_caesar_wrap

Unnamed: 0,ProdId,Description,SalesGroup
1,P-14560,2022 Caesar Wrap prep,N


In [34]:
Products = pd.concat([chicken_caesar_wrap, Products], ignore_index=True)

In [35]:
# Prep item that is also product
tuscan_penne = Preps[Preps['PrepId'] == 'P-18330']
tuscan_penne = tuscan_penne[['PrepId', 'Description', 'InventoryGroup']]
tuscan_penne.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
tuscan_penne

Unnamed: 0,ProdId,Description,SalesGroup
69,P-18330,Tuscan Prep 2023,N


In [36]:
Products = pd.concat([tuscan_penne, Products], ignore_index=True)

In [37]:
# Prep item that is also product
chicken_teriyaki = Preps[Preps['PrepId'] == 'P-17366']
chicken_teriyaki = chicken_teriyaki[['PrepId', 'Description', 'InventoryGroup']]
chicken_teriyaki.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
chicken_teriyaki

Unnamed: 0,ProdId,Description,SalesGroup
23,P-17366,Chicken Teriyaki Prep 2023,Y


In [38]:
Products = pd.concat([chicken_teriyaki, Products], ignore_index=True)

In [39]:
# Prep item that is also product
tempeh_teriyaki = Preps[Preps['PrepId'] == 'P-18296']
tempeh_teriyaki = tempeh_teriyaki[['PrepId', 'Description', 'InventoryGroup']]
tempeh_teriyaki.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
tempeh_teriyaki

Unnamed: 0,ProdId,Description,SalesGroup
67,P-18296,Teriyaki Tempeh Prep 2023,N


In [40]:
Products = pd.concat([tempeh_teriyaki, Products], ignore_index=True)

In [41]:
# Prep item that is also product
butter_chicken = Preps[Preps['PrepId'] == 'P-15019']
butter_chicken = butter_chicken[['PrepId', 'Description', 'InventoryGroup']]
butter_chicken.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
butter_chicken

Unnamed: 0,ProdId,Description,SalesGroup
22,P-15019,Butter Chicken Prep 2023,N


In [42]:
Products = pd.concat([butter_chicken, Products], ignore_index=True)

In [43]:
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,P-15019,Butter Chicken Prep 2023,N
1,P-18296,Teriyaki Tempeh Prep 2023,N
2,P-17366,Chicken Teriyaki Prep 2023,Y
3,P-18330,Tuscan Prep 2023,N
4,P-14560,2022 Caesar Wrap prep,N
...,...,...,...
296,R-18272,Truffle Mushroom Pasta 2023,N
297,R-15427,Vegan Chipotle Mayo,N
298,R-15426,Vegan Tofu Scramble,N
299,R-15544,VEGGIE BENNY,N


In [44]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [45]:
# Read conventions.xml files in the filepath_list and construct a dataframe
ConversionId = []
Multiplier = []
ConvertFromQty = []
ConvertFromUom = []
ConvertToQty = []
ConvertToUom = []

# From the XML file for Conversions append the id into ConversionId, multiplier into Multiplier, ConvertFrom->qty into 
# ConvertFromQty,ConvertFrom->uom into ConvertFromUom, ConvertTo->qty into ConvertToQty and and ConvertTo->uom into the
# CovertToUom list. 
# Make a dataframe out of the 3 lists
# Then also drop the duplicates in the Products dataframe

filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 23-24 Sep-Dec*", "*.oc"))
for filepath in filepath_list:
    path = filepath + '/Conversions.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Conversion'):
            ConversionId.append(x.attrib['id'])
            Multiplier.append(x.attrib['multiplier'])
            ConvertFromQty.append(x.find('ConvertFrom').attrib['qty'])
            ConvertFromUom.append(x.find('ConvertFrom').attrib['uom'])
            ConvertToQty.append(x.find('ConvertTo').attrib['qty'])
            ConvertToUom.append(x.find('ConvertTo').attrib['uom'])
    
    
Conversions = pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                           'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom}
                          ).drop_duplicates()

Conversions.reset_index(drop=True, inplace=True)

In [54]:
Conversions.loc[Conversions["ConversionId"] == "I-29389"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [52]:
# all_id_list = Items["ItemId"].unique()
all_id_set = set(Items["item_num"].unique())
all_conv_set = set(Conversions["ConversionId"].unique())

missing_conv_id = all_id_set - all_conv_set
n = len(missing_conv_id)
print(f"{n} Items in Items dataframe but not in Conversions Dataframe:\n",)
print(missing_conv_id)

364 Items in Items dayaframe but not in Conversions Dataframe:

{'R-14995', 'R-15315', 'P-18321', 'R-15427', 'R-14735', 'R-17356', 'R-11408', 'P-16793', 'R-15512', 'R-16786', 'P-18327', 'R-16106', 'R-15368', 'R-16858', 'R-10484', 'R-15411', 'R-16783', 'R-16222', 'R-16218', 'R-17013', 'R-17362', 'R-17353', 'R-18344', 'R-16860', 'P-6150', 'R-12290', 'R-13937', 'R-16581', 'R-14818', 'P-14376', 'R-16006', 'R-8992', 'R-15449', 'R-14250', 'R-14341', 'R-16843', 'R-15038', 'R-18675', 'R-15402', 'R-18270', 'R-14339', 'R-14340', 'R-14507', 'R-16572', 'P-18049', 'R-17134', 'R-13758', 'R-17014', 'R-18719', 'R-14981', 'P-18335', 'R-17028', 'R-14723', 'R-17350', 'R-14906', 'P-17305', 'R-17988', 'R-7814', 'R-15511', 'R-16775', 'R-17987', 'R-8988', 'P-14354', 'R-18348', 'P-18052', 'R-18342', 'R-16217', 'P-18316', 'R-15073', 'R-18470', 'R-18720', 'P-18313', 'R-14992', 'R-16574', 'R-12339', 'R-16794', 'P-6811', 'R-17039', 'R-16654', 'P-18330', 'R-15342', 'R-17941', 'R-12613', 'R-15438', 'R-11361', 'R-15

In [53]:
Conversions.shape

(444, 6)

In [55]:
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [56]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [57]:
# Summary of raw data imported for evaluation
# Here we have a summary of the number of items, preps, ingredients, products, conversions

datasum = pd.DataFrame([Items.shape, Preps.shape, Ingredients.shape, Products.shape, Conversions.shape],
                       columns = ['count', 'columns'], 
                       index = ['Items', 'Preps', 'Ingredients', 'Products', 'Conversions'])
datasum

Unnamed: 0,count,columns
Items,2158,12
Preps,73,5
Ingredients,2158,4
Products,301,3
Conversions,444,6
