![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
# %pip install -r requirements.txt  # uncomment and run once to install the dependencies into this kernel's environment

In [2]:
import pandas as pd
import glob
import os
import xml.etree.ElementTree as et

In [3]:
# Make the repository root the working directory so that later cells can build
# paths such as data/raw/... and data/preprocessed/... from os.getcwd().
# This notebook lives one level below the repo root. The root is resolved only the
# first time this cell runs in a kernel session and is remembered in REPO_ROOT, so
# re-running the cell is safe: it no longer climbs one more directory on each run.
if "REPO_ROOT" not in globals():
    REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(REPO_ROOT)
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


****

## Load Data Files

### Set Data File Path

In [4]:
# Venue whose export is processed; it is used as a folder name under data/raw/.
RESTAURANT_FOLDER_NAME = "PERUGIA_CAFE"

# Collect every "*.oc" entry in that venue's raw-data folder.
# glob returns matches in arbitrary, filesystem-dependent order. Later cells call
# drop_duplicates(), which keeps the first occurrence, so the list is sorted to make
# that choice reproducible when a venue has more than one export.
filepath_list = sorted(
    glob.glob(os.path.join(os.getcwd(), "data", "raw", RESTAURANT_FOLDER_NAME, "*.oc"))
)

filepath_list

['/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/raw/PERUGIA_CAFE/PERUGIA_CAFE_5thDec.oc']

### Import Items List

In [5]:
# Build the Items dataframe from the Items.xml file of every export in filepath_list.

# Column buffers: one entry is appended per <Item> element.
ItemId, Description, CaseQty, CaseUOM = [], [], [], []
PakQty, PakUOM, InventoryGroup = [], [], []

# Child tag -> buffer that receives the text of that tag (None when the tag is absent).
item_fields = {
    'Description': Description,
    'CaseQty': CaseQty,
    'CaseUOM': CaseUOM,
    'PakQty': PakQty,
    'PakUOM': PakUOM,
    'InventoryGroup': InventoryGroup,
}

for filepath in filepath_list:
    items_xml = filepath + '/Items.xml'
    if not os.path.isfile(items_xml):
        continue  # this export has no Items.xml, so it contributes no rows
    for item in et.parse(items_xml).getroot().iterfind('Item'):
        # The item id is an XML attribute; every other column is a child element.
        ItemId.append(item.attrib['id'])
        for tag, column in item_fields.items():
            column.append(item.findtext(tag))

# Assemble the frame, drop fully identical rows and renumber the index from 0.
Items = (
    pd.DataFrame({'ItemId': ItemId, **item_fields})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# NumPy array of the unique ItemIds in Items.
# Note: this snapshot is taken before the DISPOSABLES/SUPPLY groups are filtered out of Items;
# all_id_list is assigned again further down, after that filter.
all_id_list = Items["ItemId"].unique()

In [7]:
# Sanity check: look for an item whose Description is exactly "Egg Yolk Liq" (case-sensitive).
# An empty result means no item in this export has that description.
Items.loc[Items["Description"] == "Egg Yolk Liq"]

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup


In [8]:
# Look up item I-68700, noted by the original author as the supplemental (vegan) option for egg.
# An empty result means this export contains no item with that id.
Items.loc[Items["ItemId"] == "I-68700"]

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup


In [9]:
# Collect the ItemIds of bread items.
#
# An item counts as bread when its Description contains any of the keywords below
# (case-sensitive substring match).
# Note: writing the test as `("LOAF" or "SANDWICH" "BREAD") in description` does not work:
# the adjacent literals are concatenated to "SANDWICHBREAD" and `or` returns its first
# truthy operand, so only "LOAF" would ever be checked. Each keyword is tested explicitly.
BREAD_KEYWORDS = ("LOAF", "SANDWICH", "BREAD")

breadlist = [
    item_id
    for item_id, description in zip(Items["ItemId"], Items["Description"])
    # Description is None when the <Description> tag is missing; skip such rows
    # instead of raising a TypeError on the substring test.
    if isinstance(description, str)
    and any(keyword in description for keyword in BREAD_KEYWORDS)
]

breadlist

[]

In [10]:
# Remove non-food rows: items whose InventoryGroup is one of the excluded groups.
# The original index labels of the remaining rows are kept (no renumbering here).
excluded_groups = ["DISPOSABLES", "SUPPLY"]
is_excluded = Items["InventoryGroup"].isin(excluded_groups)
Items = Items.loc[~is_excluded]

In [11]:
# (rows, columns) of Items after the DISPOSABLES and SUPPLY groups were removed.
# The counts depend on the selected venue export, so they are not hard-coded in this comment.
Items.shape

(59, 7)

In [12]:
# Column dtypes of Items. Every value was read from XML as text, so nothing is numeric yet.
Items.dtypes

ItemId            object
Description       object
CaseQty           object
CaseUOM           object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [13]:
# Save the Items dataframe to data/preprocessed/Items_List.csv (header row, no index column).
path = os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv")
# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs(os.path.dirname(path), exist_ok=True)
Items.to_csv(path, index = False, header = True)

### Import Ingredients List

In [14]:
# Build the Ingredients dataframe from the Ingredients.xml file of every export in filepath_list.

# Column buffers: one entry is appended per <Ingredient> element.
IngredientId, Conversion, InvFactor = [], [], []
Qty, Recipe, Uom = [], [], []

# XML attribute name -> buffer that receives its value, in the order the attributes are read.
ingredient_attributes = [
    ('ingredient', IngredientId),
    ('conversion', Conversion),
    ('invFactor', InvFactor),
    ('qty', Qty),
    ('recipe', Recipe),
    ('uom', Uom),
]

for filepath in filepath_list:
    ingredients_xml = filepath + '/Ingredients.xml'
    if not os.path.isfile(ingredients_xml):
        continue  # this export has no Ingredients.xml, so it contributes no rows
    for x in et.parse(ingredients_xml).getroot().iterfind('Ingredient'):
        # All values are attributes of the <Ingredient> element itself.
        for attribute, column in ingredient_attributes:
            column.append(x.attrib[attribute])

# Assemble the frame, then de-duplicate twice: first fully identical rows, then rows that
# repeat an (IngredientId, Recipe) pair. In both cases the first occurrence is kept.
Ingredients = (
    pd.DataFrame({
        'IngredientId': IngredientId,
        'Qty': Qty,
        'Uom': Uom,
        'Conversion': Conversion,
        'InvFactor': InvFactor,
        'Recipe': Recipe,
    })
    .drop_duplicates()
    .drop_duplicates(subset=["IngredientId", "Recipe"])
    .reset_index(drop=True)
)

In [15]:
# Spot check: rows of Ingredients that use item I-29389 (empty result = not used in any recipe here).
Ingredients.loc[Ingredients["IngredientId"] == "I-29389"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe


In [16]:
# Find ingredients that are used more than once across recipes.
#
# duplicated() returns a boolean Series aligned with Ingredients: False for the first row in
# which an IngredientId appears and True for every later row that repeats it.
check = Ingredients["IngredientId"].duplicated()

# Show only the repeats. In the output the left column is the row index of the repeated
# occurrence and the right column is the IngredientId. An id used n times is listed n - 1
# times; an id used once is not listed.
Ingredients.loc[check, "IngredientId"]

10      I-5983
13     P-25993
18      I-3387
19      I-3643
23      I-6026
        ...   
144     I-1120
145    I-14179
146     I-3525
148     I-4463
149     I-6872
Name: IngredientId, Length: 72, dtype: object

In [17]:
# Look up ingredient I-64877 in Ingredients.
# An empty result means no row has this IngredientId, i.e. it is not used in any recipe of this export.
Ingredients.loc[Ingredients["IngredientId"] == "I-64877"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe


In [18]:
# Preview the Ingredients dataframe (pandas abbreviates long frames in the rendered output).
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3421,9.090,Kg,1000.00000000,0.5000,P-18863
1,I-5983,18.000,L,1.00000000,0.9901,P-18863
2,P-25993,50.000,g,0.00100000,2.7503,P-18863
3,I-3387,350.000,ml,0.00100000,9.7222,P-19652
4,I-3969,14.860,Kg,1.00000000,0.4128,P-19652
...,...,...,...,...,...,...
146,I-3525,45.000,ml,0.00100000,1.0000,R-74228
147,I-3624,2.000,g,0.00100000,1.0000,R-74228
148,I-4463,15.000,g,0.00220462,1.0000,R-74228
149,I-6872,10.000,ml,1.00000000,1.0000,R-74228


In [19]:
# Check that a whole recipe can be extracted: select every ingredient row (IngredientId, Qty, Uom,
# Conversion, InvFactor) whose Recipe is R-68698.
# An empty result means this recipe id does not occur in the selected export.
Ingredients.loc[Ingredients["Recipe"] == "R-68698"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe


In [20]:
# Preview the Items dataframe after the DISPOSABLES/SUPPLY filter.
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-28775,BAGUETTE BUN*DEMI 150G=,1.0,cs,50.0,ea,BREAD
1,I-8118,BAGUETTE PARBAKE*=,24.0,CT,1.0,LOAF,BREAD
2,I-6872,BALSAMIC GLAZE*REDUCTI=,6.0,bottle,380.0,ml,FOOD - GROCERY
3,I-3143,BEANS CHICK*PEAS GABANZO CAN,6.0,LG CAN,2.84,L,FOOD - GROCERY
4,I-19711,BEETS SHREDDED COARSE,5.0,lb,1.0,lb,PRODUCE
5,I-8260,CAPICOLLO EXT HOT CAPOLLA=,1.0,Kg,1.0,Kg,DELI & PREPARED MEAT
6,I-4574,CARROTS SHREDDED 1/8 (COARSE)=,5.0,lb,1.0,lb,PRODUCE
7,I-14179,CHEESE MOZZA*SHRED PIZZA BLEND,4.0,bag,2.5,Kg,DAIRY
8,I-2153,CHEESE NACHO*SHRED=,2.0,bag,2.27,Kg,DAIRY
9,I-2141,CHEESE PARM*SHRED=,2.0,bag,1.0,Kg,DAIRY


In [21]:
# (rows, columns) of Ingredients after de-duplication.
Ingredients.shape

(151, 6)

In [22]:
# Column dtypes of Ingredients. The values are XML attribute strings; Qty, Conversion and InvFactor are not numeric yet.
Ingredients.dtypes

IngredientId    object
Qty             object
Uom             object
Conversion      object
InvFactor       object
Recipe          object
dtype: object

In [23]:
# Save the Ingredients dataframe to data/preprocessed/Ingredients_List.csv (header row, no index column).
path = os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv")
# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs(os.path.dirname(path), exist_ok=True)
Ingredients.to_csv(path, index = False, header = True)

### Import Preps List

In [24]:
# Build the Preps dataframe from the Preps.xml file of every export in filepath_list.

# Column buffers: one entry is appended per <Prep> element.
PrepId, Description, PakQty, PakUOM, InventoryGroup = [], [], [], [], []

# Child tag -> buffer that receives the text of that tag (None when the tag is absent).
prep_fields = {
    'Description': Description,
    'PakQty': PakQty,
    'PakUOM': PakUOM,
    'InventoryGroup': InventoryGroup,
}

for filepath in filepath_list:
    preps_xml = filepath + '/Preps.xml'
    if not os.path.isfile(preps_xml):
        continue  # this export has no Preps.xml, so it contributes no rows
    for x in et.parse(preps_xml).getroot().iterfind('Prep'):
        # The prep id is an XML attribute; every other column is a child element.
        PrepId.append(x.attrib['id'])
        for tag, column in prep_fields.items():
            column.append(x.findtext(tag))

# Assemble the frame and drop fully identical rows.
Preps = pd.DataFrame({'PrepId': PrepId, **prep_fields}).drop_duplicates()
preps_columns = Preps.columns

# Keep only the first row for each PrepId, then renumber the index from 0.
Preps = Preps.drop_duplicates(subset=["PrepId"]).reset_index(drop=True)

In [25]:
# Preview the Preps dataframe.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-74226,BAKEPASTA|PestoFormaggi*,12.0,Kg,
1,P-74224,BAKEPASTA|PomodoroFormaggi*,12.0,Kg,
2,P-62588,BAKEPASTA|QuattroFormaggi*,12.0,Kg,
3,P-21818,BREAD|Garlic*,6.0,ea,
4,P-19652,CHICKEN|Italian*,36.0,piece,
5,P-61582,CHOPPED|Romain*,5.0,Kg,
6,P-74119,COOKED|Gnocchi,8.0,Kg,
7,P-27185,COOKED|Pasta Short*,10.0,Kg,
8,P-18863,Cooked|Spaghetti*,18.18,Kg,CK - FOOD
9,P-67748,COOKED|Tortellini*,8.0,Kg,


In [26]:
# Verify that PrepId identifies each prep uniquely.
# `check` is True if at least one PrepId occurs more than once; False is the expected result,
# because the import cell already kept a single row per PrepId.
prep_id_repeats = Preps["PrepId"].duplicated()
check = prep_id_repeats.any()
print(check)

False


In [27]:
# (rows, columns) of Preps.
# The row count depends on the selected venue export, so it is not hard-coded in this comment.
Preps.shape

(22, 5)

In [28]:
# Look up prep P-50739 in Preps.
# An empty result means this export contains no prep with that id.
Preps.loc[Preps["PrepId"] == "P-50739"]

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup


In [29]:
# Column dtypes of Preps. Every value was read from XML as text, so PakQty is not numeric yet.
Preps.dtypes

PrepId            object
Description       object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [30]:
# Save the Preps dataframe to data/preprocessed/Preps_List.csv (header row, no index column).
path = os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv")
# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs(os.path.dirname(path), exist_ok=True)
Preps.to_csv(path, index = False, header = True)

### Import Products List

In [31]:
# Build the Products dataframe from the Products.xml file of every export in filepath_list.

# Column buffers: one entry is appended per <Prod> element.
ProdId, Description, SalesGroup = [], [], []

# Child tag -> buffer that receives the text of that tag (None when the tag is absent).
product_fields = {'Description': Description, 'SalesGroup': SalesGroup}

for filepath in filepath_list:
    products_xml = filepath + '/Products.xml'
    if not os.path.isfile(products_xml):
        continue  # this export has no Products.xml, so it contributes no rows
    for x in et.parse(products_xml).getroot().iterfind('Prod'):
        # The product id is an XML attribute; Description and SalesGroup are child elements.
        ProdId.append(x.attrib['id'])
        for tag, column in product_fields.items():
            column.append(x.findtext(tag))

# Assemble the frame, drop fully identical rows and renumber the index from 0.
Products = (
    pd.DataFrame({'ProdId': ProdId, **product_fields})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
# Preview the Products dataframe.
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-18166,CHICKEN|Quarter,ZDONT USE FOOD
1,R-37980,ENTREE|Rst Chicken|Pasta,FOOD - HOT
2,R-72930,FLATBREAD|BBQ Chicken,FOOD - SANDWICH & WRAPS
3,R-74208,FLATBREAD|Caprese,FOOD - SANDWICH & WRAPS
4,R-74207,FLATBREAD|Chicken Parmesan,FOOD - SANDWICH & WRAPS
5,R-74115,FLATBREAD|Meat Lover,FOOD - SANDWICH & WRAPS
6,R-74228,FLATBREAD|Spicy Veggie,FOOD - SANDWICH & WRAPS
7,R-74227,PASTA|Baked|Pesto Formaggi,PASTA
8,R-74225,PASTA|Baked|Pomodoro Formaggi,PASTA
9,R-54272,PASTA|Baked|Quattro Formaggi,PASTA


In [33]:
# Look up product R-56966 in Products.
# An empty result means this export contains no product with that id.
Products.loc[Products["ProdId"] == "R-56966"]

Unnamed: 0,ProdId,Description,SalesGroup


In [34]:
# Column dtypes of Products (all values were read from XML as text).
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [35]:
# Look up product R-68698 in Products.
# An empty result means this export contains no product with that id.
Products.loc[Products["ProdId"] == "R-68698"]

Unnamed: 0,ProdId,Description,SalesGroup


In [36]:
# (rows, columns) of Products after de-duplication.
Products.shape

(14, 3)

In [37]:
# NOTE(review): this repeats the Products.dtypes check made a few cells earlier; candidate for removal.
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [38]:
# Save the Products dataframe to data/preprocessed/Products_List.csv (header row, no index column).
path = os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv")
# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs(os.path.dirname(path), exist_ok=True)
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [39]:
# Build the Conversions dataframe from the Conversions.xml file of every export in filepath_list.

# Column buffers: one entry is appended per <Conversion> element.
ConversionId, Multiplier = [], []
ConvertFromQty, ConvertFromUom = [], []
ConvertToQty, ConvertToUom = [], []

for filepath in filepath_list:
    conversions_xml = filepath + '/Conversions.xml'
    if not os.path.isfile(conversions_xml):
        continue  # this export has no Conversions.xml, so it contributes no rows
    for conv in et.parse(conversions_xml).getroot().iterfind('Conversion'):
        # id and multiplier are attributes of <Conversion>; the quantities and units are
        # attributes of its <ConvertFrom> and <ConvertTo> child elements.
        source, target = conv.find('ConvertFrom'), conv.find('ConvertTo')
        ConversionId.append(conv.attrib['id'])
        Multiplier.append(conv.attrib['multiplier'])
        ConvertFromQty.append(source.attrib['qty'])
        ConvertFromUom.append(source.attrib['uom'])
        ConvertToQty.append(target.attrib['qty'])
        ConvertToUom.append(target.attrib['uom'])

# Assemble the six buffers into one frame, drop fully identical rows and renumber the index from 0.
conversion_columns = {
    'ConversionId': ConversionId,
    'Multiplier': Multiplier,
    'ConvertFromQty': ConvertFromQty,
    'ConvertFromUom': ConvertFromUom,
    'ConvertToQty': ConvertToQty,
    'ConvertToUom': ConvertToUom,
}
Conversions = pd.DataFrame(conversion_columns).drop_duplicates().reset_index(drop=True)

In [40]:
# Preview the Conversions dataframe.
# In the rows shown, Multiplier equals ConvertFromQty / ConvertToQty: e.g. 1 "1.14L" = 1.14 L gives
# 1 / 1.14 = 0.877, and 1 LOAF = 508 g gives 1 / 508 = 0.00197.
# NOTE(review): rows with an empty ConversionId look like generic unit conversions that are not tied
# to a specific item - confirm against the export format.
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.00000000,1.0000,XXX,1.0000,L
1,,0.87719298,1.0000,1.14L,1.1400,L
2,,0.66666667,1.0000,1.5L,1.5000,L
3,,0.57142857,1.0000,1.75 L,1.7500,L
4,,0.50000000,1.0000,2L,2.0000,L
...,...,...,...,...,...,...
60,I-5983,0.03378378,1.0000,fl oz,29.6000,g
61,I-5983,0.00100000,1.0000,L,1000.0000,g
62,I-8118,0.00196850,1.0000,LOAF,508.0000,g
63,I-9295,0.06666667,1.0000,slice,15.0000,g


In [41]:
# Check whether item I-4582 has any conversion of its own.
# An empty result means the Conversions table holds no row with that ConversionId.
Conversions.loc[Conversions["ConversionId"] == "I-4582"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [42]:
# Unique ids on both sides of the comparison made in the next cell:
# all_id_list   - ItemIds still present in Items (after the inventory-group filter)
# all_conv_list - ConversionIds present in Conversions (includes the empty id '')
all_id_list = Items["ItemId"].unique()
all_conv_list = Conversions["ConversionId"].unique()

# Print each array under its heading. sep="\n" puts every argument on its own line,
# which yields the same blank lines as printing the pieces one by one.
print("All unique IDs list\n", all_id_list, "\n", sep="\n")
print("All unique Conversions list\n", all_conv_list, sep="\n")

All unique IDs list

['I-28775' 'I-8118' 'I-6872' 'I-3143' 'I-19711' 'I-8260' 'I-4574'
 'I-14179' 'I-2153' 'I-2141' 'I-17617' 'I-3969' 'I-3624' 'I-65800'
 'I-2205' 'I-3241' 'I-4626' 'I-3252' 'I-4640' 'I-1120' 'I-3632' 'I-4695'
 'I-3321' 'I-52598' 'I-4463' 'I-4844' 'I-3900' 'I-2236' 'I-4727' 'I-26468'
 'I-56382' 'I-3387' 'I-6829' 'I-4750' 'I-9274' 'I-71841' 'I-69779'
 'I-3421' 'I-2551' 'I-28657' 'I-3643' 'I-4793' 'I-6869' 'I-3313' 'I-8281'
 'I-6026' 'I-68186' 'I-3506' 'I-3525' 'I-2697' 'I-3579' 'I-2706' 'I-4896'
 'I-56386' 'I-17159' 'I-9295' 'I-3695' 'I-5983' 'I-4962']


All unique Conversions list

['' 'I-1120' 'I-2205' 'I-2236' 'I-3143' 'I-3321' 'I-3387' 'I-3624'
 'I-3632' 'I-3643' 'I-4574' 'I-4626' 'I-4640' 'I-4844' 'I-5983' 'I-8118'
 'I-9295' 'I-56382']


In [43]:
# Items that are in all_id_list but have no entry of their own in all_conv_list.
known_conv_ids = set(all_conv_list)
missing_conv_id = [item_id for item_id in all_id_list if item_id not in known_conv_ids]

# Report how many items lack an item-specific conversion.
print(len(missing_conv_id))

42


In [44]:
# (rows, columns) of Conversions after de-duplication.
Conversions.shape

(65, 6)

In [45]:
# Spot check: conversions defined for item I-29389 (empty result = none in this export).
Conversions.loc[Conversions["ConversionId"] == "I-29389"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [46]:
# Column dtypes of Conversions. The values are XML attribute strings; Multiplier and the quantities are not numeric yet.
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [47]:
# Save the Conversions dataframe to data/preprocessed/Conversions_List.csv (header row, no index column).
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs(os.path.dirname(path), exist_ok=True)
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [48]:
# Summary of the imported tables: one row per dataframe with its row count and column count.
imported_frames = {
    'Items': Items,
    'Preps': Preps,
    'Ingredients': Ingredients,
    'Products': Products,
    'Conversions': Conversions,
}

# Each .shape is a (rows, columns) tuple, which maps onto the two summary columns.
datasum = pd.DataFrame(
    [frame.shape for frame in imported_frames.values()],
    columns = ['count', 'columns'],
    index = list(imported_frames),
)
datasum

Unnamed: 0,count,columns
Items,59,7
Preps,22,5
Ingredients,151,6
Products,14,3
Conversions,65,6
