![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
# Uncomment to install the project dependencies into the kernel's environment (run once):
# %pip install -r requirements.txt

In [2]:
import pandas as pd
import glob
import os
import xml.etree.ElementTree as et

In [3]:
# Make the repository folder the working directory. The repo folder is one level
# above the folder where this notebook lives, and every data path below is built
# from os.getcwd().
# The target is resolved only once per kernel session, so re-running this cell no
# longer climbs one directory higher on each execution.
if "_CFFS_REPO_ROOT" not in globals():
    _CFFS_REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(_CFFS_REPO_ROOT)
path = os.getcwd()
print(path)

/Users/ankurbhardwaj/Desktop/SEEDS/CFFS_Label_2024_25


****

## Load Data Files

### Set Data File Path

In [4]:
# Select the venue folder (under data/raw) whose exported recipe data is processed
RESTAURANT_FOLDER_NAME = "PRESTO"

# List every "*.oc" export in that venue's folder.
# glob returns matches in arbitrary order, so the list is sorted: the later
# drop_duplicates() calls keep the first occurrence of a row, and a stable file
# order keeps that choice reproducible between runs and machines.
filepath_list = sorted(
    glob.glob(os.path.join(os.getcwd(), "data", "raw", RESTAURANT_FOLDER_NAME, "*.oc"))
)

filepath_list

['/Users/ankurbhardwaj/Desktop/SEEDS/CFFS_Label_2024_25/data/raw/PRESTO/presto_27_feb_2025.oc']

### Import Items List

In [5]:
# Build the Items dataframe from the Items.xml file of every export in filepath_list.
# One list per output column; the lists are filled in parallel, one entry per <Item>.
ItemId, Description, CaseQty, CaseUOM = [], [], [], []
PakQty, PakUOM, InventoryGroup = [], [], []

# Child tag to read with findtext -> list that collects its values.
# findtext returns None when the tag is absent, so missing fields become None.
item_text_columns = {
    'Description': Description,
    'CaseQty': CaseQty,
    'CaseUOM': CaseUOM,
    'PakQty': PakQty,
    'PakUOM': PakUOM,
    'InventoryGroup': InventoryGroup,
}

for oc_folder in filepath_list:
    items_xml = oc_folder + '/Items.xml'
    if not os.path.isfile(items_xml):
        continue  # exports without an Items.xml are skipped
    for node in et.parse(items_xml).iterfind('Item'):
        ItemId.append(node.attrib['id'])
        for tag, values in item_text_columns.items():
            values.append(node.findtext(tag))

# Assemble the columns, drop rows that are exact duplicates and renumber the index.
Items = (
    pd.DataFrame({'ItemId': ItemId, **item_text_columns})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Array of the distinct ItemId values currently in Items.
# Note: this is taken before the inventory-group filter applied further down, and
# all_id_list is recomputed later, just before it is compared with the conversion ids.
all_id_list = Items["ItemId"].unique()

In [7]:
# Collect the ItemId of every item whose Description mentions a bread keyword.
# The previous test, ("LOAF" or "SANDWICH" "BREAD") in description, only ever
# checked for "LOAF": `or` returns its first truthy operand, and the two adjacent
# string literals were concatenated into "SANDWICHBREAD". Each keyword is now
# tested on its own.
BREAD_KEYWORDS = ("LOAF", "SANDWICH", "BREAD")

breadlist = [
    item_id
    for item_id, description in zip(Items["ItemId"], Items["Description"])
    # Description is None when the XML has no <Description> tag; skip those rows
    if isinstance(description, str)
    and any(keyword in description for keyword in BREAD_KEYWORDS)
]

breadlist

[]

In [8]:
# Remove the items whose InventoryGroup is DISPOSABLES or SUPPLY
# (presumably non-food stock -- confirm with the data owners).
excluded_groups = ["DISPOSABLES", "SUPPLY"]
in_excluded_group = Items["InventoryGroup"].isin(excluded_groups)
Items = Items[~in_excluded_group]

In [9]:
# (rows, columns) of Items after the DISPOSABLES/SUPPLY groups were filtered out.
# The row count depends on the venue export that was loaded.
Items.shape

(27, 7)

In [10]:
# Column dtypes of Items. Every value was read from XML as text, so the
# quantity columns are still strings (object) at this stage.
Items.dtypes

ItemId            object
Description       object
CaseQty           object
CaseUOM           object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [11]:
# Save the Items dataframe to data/preprocessed/Items_List.csv (header row, no index column)
path = os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv")
# Create the output folder if it is missing; to_csv cannot write into a non-existent directory
os.makedirs(os.path.dirname(path), exist_ok=True)
Items.to_csv(path, index = False, header = True)

### Import Ingredients List

In [12]:
# Build the Ingredients dataframe from the Ingredients.xml file of every export in filepath_list.
# One list per output column; the lists are filled in parallel, one entry per <Ingredient>.
IngredientId, Conversion, InvFactor, Qty, Recipe, Uom = [], [], [], [], [], []

# XML attribute to read -> list that collects its values.
# All six attributes are required: a missing one raises KeyError.
ingredient_attr_columns = {
    'ingredient': IngredientId,
    'conversion': Conversion,
    'invFactor': InvFactor,
    'qty': Qty,
    'recipe': Recipe,
    'uom': Uom,
}

for oc_folder in filepath_list:
    ingredients_xml = oc_folder + '/Ingredients.xml'
    if not os.path.isfile(ingredients_xml):
        continue  # exports without an Ingredients.xml are skipped
    for node in et.parse(ingredients_xml).iterfind('Ingredient'):
        for attr_name, values in ingredient_attr_columns.items():
            values.append(node.attrib[attr_name])

# Drop exact duplicate rows, then keep only the first row for each
# (IngredientId, Recipe) pair, and renumber the index.
# NOTE(review): the second step also discards rows where the same ingredient is
# listed twice in one recipe with different quantities -- confirm this is intended.
Ingredients = (
    pd.DataFrame({'IngredientId': IngredientId, 'Qty': Qty, 'Uom': Uom, 'Conversion': Conversion,
                  'InvFactor': InvFactor, 'Recipe': Recipe})
    .drop_duplicates()
    .drop_duplicates(subset=["IngredientId", "Recipe"])
    .reset_index(drop=True)
)

In [13]:
# duplicated() marks a row True when its IngredientId already appeared in an
# earlier row, so the first occurrence of each id stays False.
check = Ingredients["IngredientId"].duplicated()

# Show the repeated rows only: the left column of the output is the row index in
# Ingredients, the right column is the IngredientId found again at that row.
# An id that occurs three times is therefore listed twice.
Ingredients.loc[check, "IngredientId"]

5      I-6026
9      I-3642
15     I-3632
17     I-3388
18     I-5983
19     I-6026
21     I-3388
23     I-5983
24     I-6026
25     I-3387
31     I-3387
35    P-18907
38    I-14182
43    P-74628
44    P-75383
Name: IngredientId, dtype: object

In [14]:
# Display the Ingredients table for a visual check
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3642,500.0,g,1.0,333.3333,P-18907
1,I-6026,1.0,Kg,1000.0,0.6667,P-18907
2,I-3388,250.0,ml,0.001,27.7778,P-26216
3,I-3416,4.54,Kg,2.20462,0.5044,P-26216
4,I-5983,20.0,L,1.0,2.2222,P-26216
5,I-6026,10.0,g,1.0,1.1111,P-26216
6,I-4772,1.0,BUNCH,1.0,0.0222,P-44728
7,I-14182,400.0,g,0.001,88.8889,P-74628
8,I-3632,15.0,g,1.0,3.3333,P-74628
9,I-3642,9.0,g,1.0,2.0,P-74628


In [15]:
# Display the Items table for a visual check
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-14182,CHEESE PARM*GRATED,2.0,bag,2.5,Kg,DAIRY
1,I-72089,CHICKEN BRST STRP CKD^,2.0,bag,2.0,Kg,"PACKAGED BEER, CIDER"
2,I-65802,CREAM WHIP*36% 946ML,16.0,each,946.0,ml,DAIRY
3,I-68787,FOCACCIA ORGANIC,1.0,SHEET,1.0,SHEET,BREAD
4,I-3632,GARLIC POWDER*,2.1,Kg,2100.0,g,SPICES
5,I-4772,HERB PARSLEY MX,3.0,BUNCH,1.0,BUNCH,PRODUCE
6,I-37005,MEATBALLS BEEF,4.54,Kg,1000.0,g,MEAT
7,I-3387,OIL CANOLA*OLIVE OIL,6.0,can,3.0,L,FOOD - GROCERY
8,I-3388,OIL CANOLA*SALAD 100% PURE,4.0,can,3.0,L,FOOD - GROCERY
9,I-4757,ONIONS RED,25.0,lb,1.0,lb,PRODUCE


In [16]:
# (rows, columns) of the de-duplicated Ingredients table
Ingredients.shape

(52, 6)

In [17]:
# Column dtypes of Ingredients. The values are XML attribute strings, so the
# numeric fields (Qty, Conversion, InvFactor) are still object here.
Ingredients.dtypes

IngredientId    object
Qty             object
Uom             object
Conversion      object
InvFactor       object
Recipe          object
dtype: object

In [18]:
# Save the Ingredients dataframe to data/preprocessed/Ingredients_List.csv (header row, no index column)
path = os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv")
# Create the output folder if it is missing; to_csv cannot write into a non-existent directory
os.makedirs(os.path.dirname(path), exist_ok=True)
Ingredients.to_csv(path, index = False, header = True)

### Import Preps List

In [19]:
# Build the Preps dataframe from the Preps.xml file of every export in filepath_list.
# One list per output column; the lists are filled in parallel, one entry per <Prep>.
PrepId, Description, PakQty, PakUOM, InventoryGroup = [], [], [], [], []

# Child tag to read with findtext -> list that collects its values.
# findtext returns None when the tag is absent, so missing fields become None.
prep_text_columns = {
    'Description': Description,
    'PakQty': PakQty,
    'PakUOM': PakUOM,
    'InventoryGroup': InventoryGroup,
}

for oc_folder in filepath_list:
    preps_xml = oc_folder + '/Preps.xml'
    if not os.path.isfile(preps_xml):
        continue  # exports without a Preps.xml are skipped
    for node in et.parse(preps_xml).iterfind('Prep'):
        PrepId.append(node.attrib['id'])
        for tag, values in prep_text_columns.items():
            values.append(node.findtext(tag))

# Drop exact duplicate rows, keep only the first row for each PrepId, renumber the index.
Preps = (
    pd.DataFrame({'PrepId': PrepId, **prep_text_columns})
    .drop_duplicates()
    .drop_duplicates(subset=["PrepId"])
    .reset_index(drop=True)
)
# Column labels of Preps, kept under their own name
preps_columns = Preps.columns

In [20]:
# Display the Preps table for a visual check
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-75391,COOKED|Pasta|Cavatappi,9.0,Kg,PREP
1,P-26216,COOKED|Pasta|Penne|WW*,9.0,Kg,PREP
2,P-75390,COOKED|Pasta|Spaghetti,9.0,Kg,PREP
3,P-75443,COOKED|Prawns,800.0,g,
4,P-18907,MIX|Salt & Pepper*,1.5,Kg,
5,P-75445,ROASTED|Veg,3.0,Kg,
6,P-74628,SAUCE|Alfredo,4.5,L,
7,P-75383,SAUCE|Marinara,4.5,L,
8,P-75559,SAUCE|Rose,4.0,L,
9,P-44728,YEILD|Chopped Parsley*,45.0,g,


In [21]:
# Sanity check: True if any PrepId occurs more than once in Preps.
# Preps was de-duplicated on PrepId when it was built, so False is the expected result.
check = Preps["PrepId"].duplicated().any()
print(check)

False


In [22]:
# (rows, columns) of Preps; the row count depends on the venue export that was loaded
Preps.shape

(10, 5)

In [23]:
# Column dtypes of Preps. Every value was read from XML as text, so PakQty is
# still a string (object) at this stage.
Preps.dtypes

PrepId            object
Description       object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [24]:
# Save the Preps dataframe to data/preprocessed/Preps_List.csv (header row, no index column)
path = os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv")
# Create the output folder if it is missing; to_csv cannot write into a non-existent directory
os.makedirs(os.path.dirname(path), exist_ok=True)
Preps.to_csv(path, index = False, header = True)

### Import Products List

In [25]:
# Build the Products dataframe from the Products.xml file of every export in filepath_list.
# One list per output column; the lists are filled in parallel, one entry per <Prod>.
ProdId, Description, SalesGroup = [], [], []

# Child tag to read with findtext -> list that collects its values.
# findtext returns None when the tag is absent, so missing fields become None.
product_text_columns = {
    'Description': Description,
    'SalesGroup': SalesGroup,
}

for oc_folder in filepath_list:
    products_xml = oc_folder + '/Products.xml'
    if not os.path.isfile(products_xml):
        continue  # exports without a Products.xml are skipped
    for node in et.parse(products_xml).iterfind('Prod'):
        ProdId.append(node.attrib['id'])
        for tag, values in product_text_columns.items():
            values.append(node.findtext(tag))

# Assemble the columns, drop rows that are exact duplicates and renumber the index.
Products = (
    pd.DataFrame({'ProdId': ProdId, **product_text_columns})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [26]:
# Display the Products table for a visual check
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-75442,ADDON|Chicken,ADD ONS
1,R-75441,ADDON|Meat Balls,ADD ONS
2,R-75444,ADDON|Prawns,ADD ONS
3,R-75446,ADDON|Roasted Veg,ADD ONS
4,R-75440,ENTREE|Pasta Bowl,FOOD


In [27]:
# Column dtypes of Products; all three columns hold text read from XML (object)
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [28]:
# (rows, columns) of the de-duplicated Products table
Products.shape

(5, 3)

In [29]:
# NOTE(review): this repeats the Products.dtypes check made two cells earlier;
# one of the two cells can be removed.
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [30]:
# Save the Products dataframe to data/preprocessed/Products_List.csv (header row, no index column)
path = os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv")
# Create the output folder if it is missing; to_csv cannot write into a non-existent directory
os.makedirs(os.path.dirname(path), exist_ok=True)
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [31]:
# Build the Conversions dataframe from the Conversions.xml file of every export in filepath_list.
# Each <Conversion> element supplies its own id and multiplier attributes, plus the
# qty/uom attributes of its <ConvertFrom> and <ConvertTo> children.
ConversionId, Multiplier = [], []
ConvertFromQty, ConvertFromUom = [], []
ConvertToQty, ConvertToUom = [], []

for oc_folder in filepath_list:
    conversions_xml = oc_folder + '/Conversions.xml'
    if not os.path.isfile(conversions_xml):
        continue  # exports without a Conversions.xml are skipped
    for node in et.parse(conversions_xml).iterfind('Conversion'):
        convert_from = node.find('ConvertFrom')
        convert_to = node.find('ConvertTo')
        ConversionId.append(node.attrib['id'])
        Multiplier.append(node.attrib['multiplier'])
        ConvertFromQty.append(convert_from.attrib['qty'])
        ConvertFromUom.append(convert_from.attrib['uom'])
        ConvertToQty.append(convert_to.attrib['qty'])
        ConvertToUom.append(convert_to.attrib['uom'])

# Assemble the six columns, drop rows that are exact duplicates and renumber the index.
Conversions = (
    pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                  'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
# Display the Conversions table.
# Example reading of a row: converting from the unit "1.14L" to "L" uses the
# multiplier 0.877, since 1 / 1.14 = 0.877.
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.87719298,1.0,1.14L,1.14,L
2,,0.66666667,1.0,1.5L,1.5,L
3,,0.57142857,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L
5,,0.25,1.0,4L,4.0,L
6,,0.08333333,1.0,FOOT,12.0,INCH
7,,0.0625,1.0,16L,16.0,L
8,,0.0591716,1.0,1/2LTR,16.9,fl oz
9,,0.03937008,1.0,750ML,25.4,fl oz


In [33]:
# Recompute the distinct ItemIds from the current (filtered) Items table and
# collect the distinct ConversionIds, then print both arrays for comparison.
all_id_list = Items["ItemId"].unique()
all_conv_list = Conversions["ConversionId"].unique()

# Each heading string ends in "\n", so print() leaves a blank line after it
print("All unique IDs list\n")
print(all_id_list)
print("\n")
print("All unique Conversions list\n")
print(all_conv_list)

All unique IDs list

['I-14182' 'I-72089' 'I-65802' 'I-68787' 'I-3632' 'I-4772' 'I-37005'
 'I-3387' 'I-3388' 'I-4757' 'I-3416' 'I-3420' 'I-72822' 'I-3642' 'I-4791'
 'I-5093' 'I-6026' 'I-68186' 'I-3525' 'I-74878' 'I-16168' 'I-3579'
 'I-17159' 'I-3692' 'I-5983' 'I-4958' 'I-10491']


All unique Conversions list

['' 'I-3387' 'I-3388' 'I-3632' 'I-3642' 'I-3692' 'I-4772' 'I-5983']


In [34]:
# Items that have no conversion entry of their own: every ItemId in all_id_list
# that does not appear among the ConversionIds in all_conv_list.
known_conv_ids = set(all_conv_list)
missing_conv_id = [item_id for item_id in all_id_list if item_id not in known_conv_ids]

# Only the number of such items is reported
print(len(missing_conv_id))

20


In [35]:
# (rows, columns) of the de-duplicated Conversions table
Conversions.shape

(56, 6)

In [36]:
# Column dtypes of Conversions. The values are XML attribute strings, so the
# multiplier and quantity columns are still object here.
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [37]:
# Save the Conversions dataframe to data/preprocessed/Conversions_List.csv (header row, no index column)
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
# Create the output folder if it is missing; to_csv cannot write into a non-existent directory
os.makedirs(os.path.dirname(path), exist_ok=True)
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [38]:
# Overview of the imported tables: one row per table, where 'count' is the
# number of rows and 'columns' is the number of columns (taken from .shape).
summary_tables = {
    'Items': Items,
    'Preps': Preps,
    'Ingredients': Ingredients,
    'Products': Products,
    'Conversions': Conversions,
}

datasum = pd.DataFrame(
    [table.shape for table in summary_tables.values()],
    columns = ['count', 'columns'],
    index = list(summary_tables),
)
datasum

Unnamed: 0,count,columns
Items,27,7
Preps,10,5
Ingredients,52,6
Products,5,3
Conversions,56,6
