# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part II: Data Cleaning

## Set up and Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime

In [2]:
# Move the working directory up one level so that it points at the repo folder,
# which is the base the later cells join their "data/..." paths onto.
# The sentinel makes this cell safe to re-run: without it, every extra execution
# climbs one more directory and breaks all the paths built from os.getcwd().
if "_REPO_ROOT" not in globals():
    os.chdir("../")
    _REPO_ROOT = os.getcwd()
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


***

## Import Preprocessed Datasets

In [3]:
def DescribeDataset(df):
    """Print a quick overview of a DataFrame.

    INPUT: a DataFrame
    OUTPUT: None
    Description: prints the column dtypes and the shape, then displays the
    first rows (df.head()) using the notebook's rich display.
    """
    print(df.dtypes)
    print("\nShape: ", df.shape)
    display(df.head())

In [4]:
# Load the preprocessed item list (data/preprocessed/AMS_data/Items_List.csv)
# and print a short summary of it.
Items = pd.read_csv(
    os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
)
DescribeDataset(Items)

ItemId             object
Description        object
CaseQty           float64
CaseUOM            object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (60, 7)


Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4099,Olive Oil 100 Pct,60.0,ml,14.0,ea,Y
1,I-2402,TOMATO - 6 X 7 MED,300.0,g,14.0,ea,Y
2,I-1874,GARLIC WHOLE PEELED,15.0,g,1031.0,g,Y
3,I-14422,Onions Green 3ct,300.0,g,18.0,Kg,Y
4,I-1995,MINT LEAVES,50.0,g,1.0,L,Y


In [5]:
# Load the preprocessed ingredient list (data/preprocessed/AMS_data/Ingredients_List.csv)
# and print a short summary of it.
Ingredients = pd.read_csv(
    os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
)
DescribeDataset(Ingredients)

IngredientId     object
Qty             float64
Uom              object
Recipe           object
dtype: object

Shape:  (122, 4)


Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-4099,60.0,ml,P-2824
1,I-2402,300.0,g,P-2824
2,P-9935,14.0,ea,P-2824
3,I-4114,10.0,ea,P-2424
4,I-2349,300.0,g,P-2424


In [6]:
# Load the preprocessed prep list (data/preprocessed/AMS_data/Preps_List.csv)
# and print a short summary of it.
Preps = pd.read_csv(
    os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
)
DescribeDataset(Preps)

PrepId             object
Description        object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (14, 5)


Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-2824,BC - Tomato Focacia,14.0,ea,Y
1,P-2424,BCPrep - Asian Sesame Wrap,10.0,ea,Y
2,P-10674,BCPrep - BACON WRAP,1.0,ea,Y
3,P-1520,BCPrep - Chicken Quinoa Wrap,10.0,ea,Y
4,P-1488,BCPrep - Chicken Salad,1.0,ea,Y


In [7]:
# Load Products_List.csv, which was created by the 1_data preprocessing step,
# and print a short summary of it.
Products = pd.read_csv(
    os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
)
DescribeDataset(Products)

ProdId         object
Description    object
SalesGroup     object
dtype: object

Shape:  (32, 3)


Unnamed: 0,ProdId,Description,SalesGroup
0,R-4593.0,$garlic & onion,Y
1,R-3266.0,CHEDDAR SLICE,Y
2,R-9348.0,Cucumber,Y
3,R-9085.0,Lettuce,Y
4,R-2489.0,Tomato,Y


***
## Converter

### Create Unit Converter

In [8]:
# Load the standard unit conversion table (data/external/standard_conversions.csv)
# into a DataFrame and preview its first rows.
Std_Unit = pd.read_csv(
    os.path.join(os.getcwd(), "data", "external", "standard_conversions.csv")
)
Std_Unit.head()

Unnamed: 0,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,4.9289,1,tsp,4.9289,ml
1,14.787,1,Tbsp,14.787,ml
2,946.35,1,qt,946.35,ml
3,473.17625,1,pt,473.17625,ml
4,28.3495,1,oz,28.3495,g


In [9]:
# Separate the source units by the standard unit they convert to.
# liquid_unit: units of measurement that Std_Unit converts to milliliters ('ml').
# solid_unit:  units of measurement that Std_Unit converts to grams ('g').
# tolist() turns the selected pandas Series into a plain Python list.

liquid_unit = Std_Unit[Std_Unit['ConvertToUom'].eq('ml')]['ConvertFromUom'].tolist()
solid_unit = Std_Unit[Std_Unit['ConvertToUom'].eq('g')]['ConvertFromUom'].tolist()

In [10]:
# Construct a standard unit converter
def std_converter(qty, uom, conversions=None):
    """Convert a quantity to its standard unit using a conversion table.

    Parameters
    ----------
    qty : number or numeric string
        Quantity expressed in ``uom``.
    uom : str
        Unit of measurement to convert from.
    conversions : DataFrame, optional
        Table with 'ConvertFromUom', 'Multiplier' and 'ConvertToUom' columns.
        Defaults to the notebook-level ``Std_Unit`` table.

    Returns
    -------
    tuple
        ``(float(qty) * multiplier, target unit)`` when ``uom`` is listed in
        'ConvertFromUom'; otherwise ``(qty, uom)`` exactly as passed in.
    """
    table = Std_Unit if conversions is None else conversions
    matches = table.loc[table['ConvertFromUom'] == uom]
    if matches.empty:
        return (qty, uom)
    # Read scalars from the first matching row: calling float() on a Series is
    # deprecated in pandas and fails outright when several rows match.
    first = matches.iloc[0]
    return (float(qty) * float(first['Multiplier']), first['ConvertToUom'])

***
## Items with Non-standard Units

In [11]:
# Display the ingredient list for a visual check before filtering it.
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-4099,60.0,ml,P-2824
1,I-2402,300.0,g,P-2824
2,P-9935,14.0,ea,P-2824
3,I-4114,10.0,ea,P-2424
4,I-2349,300.0,g,P-2424
...,...,...,...,...
117,P-10674,1.0,ea,R-4116.0
118,P-1520,1.0,ea,R-9786.0
119,P-9279,1.0,ea,R-8350.0
120,P-4811,1.0,ea,R-3296.0


In [12]:
# Display the item list for a visual check before filtering it.
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4099,Olive Oil 100 Pct,60.0,ml,14.0,ea,Y
1,I-2402,TOMATO - 6 X 7 MED,300.0,g,14.0,ea,Y
2,I-1874,GARLIC WHOLE PEELED,15.0,g,1031.0,g,Y
3,I-14422,Onions Green 3ct,300.0,g,18.0,Kg,Y
4,I-1995,MINT LEAVES,50.0,g,1.0,L,Y
5,I-4114,"TORTILLA 10"" FLOUR PRESSED TF",10.0,ea,10.0,ea,Y
6,I-2349,SPINACH WASHED TRIMMED,300.0,g,10.0,ea,Y
7,I-4115,ITEM SLICED RED ONION,50.0,g,10.0,ea,Y
8,I-4116,ITEM SHAVED HERL CARROTS,150.0,g,10.0,ea,Y
9,I-4117,ITEM SESAME TOFU,2.0,Kg,10.0,ea,Y


In [13]:
# Collect the item ingredients whose unit cannot be standardised automatically.
# A row qualifies when its IngredientId starts with 'I' and its Uom is neither
# 'g'/'ml' nor one of the units listed in liquid_unit / solid_unit.
col_names = list(Ingredients.columns.values)

# Build the set of convertible units once instead of re-concatenating the two
# lists for every row.
convertible_uoms = {'g', 'ml', *liquid_unit, *solid_unit}
# na=False: a missing IngredientId is treated as "does not start with 'I'"
# rather than raising.
is_item = Ingredients['IngredientId'].str.startswith('I', na=False)
has_nonstd_uom = ~Ingredients['Uom'].isin(convertible_uoms)

# Renumber the matches from 0 before dropping duplicates, so the surviving index
# labels are positions among the matches (gaps mark the removed duplicates).
# Only the first row per IngredientId is kept. The trailing copy() gives an
# independent frame so later column assignments do not warn about writing to a
# slice.
Items_Nonstd = (
    Ingredients.loc[is_item & has_nonstd_uom, col_names]
    .reset_index(drop=True)
    .drop_duplicates(subset=['IngredientId'])
    .copy()
)
Items_Nonstd

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-4114,10.0,ea,P-2424
1,I-2992,2.0,slice,P-10674
4,I-4126,2.0,slice,P-1488
6,I-1695,1.0,slice,P-4582
7,I-1787,1.0,ea,P-4582
8,I-4132,10.0,ea,P-4811
9,I-4135,0.5,ea,P-4811
10,I-4137,1.0,ea,P-9279
11,I-3498,0.5,slice,P-10863
12,I-4139,1.0,ea,P-10863


In [14]:
# Assign a Description column to Items_Nonstd by looking each IngredientId up
# in Items. When an ItemId occurs more than once in Items, the first occurrence
# is used; ids with no matching item get 'Not Found'.
first_descr = Items.drop_duplicates(subset='ItemId').set_index('ItemId')['Description']
has_match = Items_Nonstd['IngredientId'].isin(first_descr.index)
# where() keeps the looked-up value for matched ids (even a blank description)
# and substitutes the placeholder only for ids that are absent from Items.
# Assigning the whole column also creates it when Items_Nonstd is empty.
Items_Nonstd['Description'] = (
    Items_Nonstd['IngredientId'].map(first_descr).where(has_match, 'Not Found')
)



In [15]:
# Preview the first rows to confirm the Description column was filled in.
Items_Nonstd.head()

Unnamed: 0,IngredientId,Qty,Uom,Recipe,Description
0,I-4114,10.0,ea,P-2424,"TORTILLA 10"" FLOUR PRESSED TF"
1,I-2992,2.0,slice,P-10674,Bacon Pre-Ckd 30-34 ct
4,I-4126,2.0,slice,P-1488,MULTIGRAIN CLUBHOUSE
6,I-1695,1.0,slice,P-4582,CHEESE SWISS SLCD NAT 21G
7,I-1787,1.0,ea,P-4582,CROISSANT JUMBO


In [16]:
# Write Items_Nonstd to data/cleaning/AMS_data/Items_Nonstd.csv
# (header row included, index column omitted).
path = os.path.join(
    os.getcwd(), "data", "cleaning", "AMS_data", "Items_Nonstd.csv"
)
Items_Nonstd.to_csv(path, header=True, index=False)

***
## Clean Preps Units

In [17]:
# Display the prep list before any standardised columns are added.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-2824,BC - Tomato Focacia,14.0,ea,Y
1,P-2424,BCPrep - Asian Sesame Wrap,10.0,ea,Y
2,P-10674,BCPrep - BACON WRAP,1.0,ea,Y
3,P-1520,BCPrep - Chicken Quinoa Wrap,10.0,ea,Y
4,P-1488,BCPrep - Chicken Salad,1.0,ea,Y
5,P-2535,BCPrep - Egg Salad,1.0,ea,Y
6,P-4582,BCPrep - Ham & Swiss Croissant,1.0,ea,Y
7,P-4811,BCPrep - Mediterranean Wrap,10.0,ea,Y
8,P-9279,BCPrep - Salsa Wrap,1.0,ea,Y
9,P-10863,BCPrep - T-Bird Muffin,1.0,ea,Y


In [18]:
# Add two placeholder columns, StdQty and StdUom, to Preps, both filled with NaN.
# They are filled with standardized quantities and units of measurement later on.
Preps = Preps.assign(StdQty=np.nan, StdUom=np.nan)

In [19]:
# Display Preps again to confirm the new StdQty / StdUom columns are present.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-2824,BC - Tomato Focacia,14.0,ea,Y,,
1,P-2424,BCPrep - Asian Sesame Wrap,10.0,ea,Y,,
2,P-10674,BCPrep - BACON WRAP,1.0,ea,Y,,
3,P-1520,BCPrep - Chicken Quinoa Wrap,10.0,ea,Y,,
4,P-1488,BCPrep - Chicken Salad,1.0,ea,Y,,
5,P-2535,BCPrep - Egg Salad,1.0,ea,Y,,
6,P-4582,BCPrep - Ham & Swiss Croissant,1.0,ea,Y,,
7,P-4811,BCPrep - Mediterranean Wrap,10.0,ea,Y,,
8,P-9279,BCPrep - Salsa Wrap,1.0,ea,Y,,
9,P-10863,BCPrep - T-Bird Muffin,1.0,ea,Y,,


In [20]:
# Pack units that can be standardised directly:
# lower-cased PakUOM -> (multiplier, standard unit).
_PAK_UOM_TO_STD = {
    'g': (1, 'g'),
    'grams': (1, 'g'),
    'kg': (1000, 'g'),
    'ml': (1, 'ml'),
    'l': (1000, 'ml'),
    'lbs': (453.59, 'g'),
}


def convert_units(row):
    """Convert one row's pack quantity to grams or millilitres.

    Parameters
    ----------
    row : mapping or Series
        Must provide 'PakQty' and 'PakUOM'.

    Returns
    -------
    tuple
        ``(quantity, 'g')`` or ``(quantity, 'ml')`` when PakUOM (matched
        case-insensitively) is one of g, grams, kg, ml, l or lbs;
        ``(np.nan, np.nan)`` for any other unit, including a missing one.
    """
    pak_uom = row['PakUOM']
    # A missing unit is read from CSV as a float NaN, which has no .lower();
    # treat every non-string unit as "cannot be standardised".
    if not isinstance(pak_uom, str):
        return np.nan, np.nan
    target = _PAK_UOM_TO_STD.get(pak_uom.lower())
    if target is None:
        return np.nan, np.nan
    multiplier, std_uom = target
    pak_qty = row['PakQty']
    # Units already in g / ml pass the quantity through untouched.
    return (pak_qty if multiplier == 1 else pak_qty * multiplier), std_uom

# Apply the function to each row and split the (quantity, unit) pairs into the
# two columns. Collecting the pairs in a list first (instead of unpacking
# zip(*...)) keeps this cell working when Preps has no rows.
std_pairs = [convert_units(row) for _, row in Preps.iterrows()]
Preps['StdQty'] = [qty for qty, _ in std_pairs]
Preps['StdUom'] = [uom for _, uom in std_pairs]

In [21]:
# Keep only the preps that received a standard quantity, renumbered from 0.
Preps_Cleaned = Preps[Preps["StdQty"].notna()].reset_index(drop=True)
Preps_Cleaned

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14403,Prep Cream Cheese - Garlic,11000.0,g,Y,11000.0,g


In [22]:
# Write the cleaned preps to data/cleaning/AMS_data/Preps_Unit_Cleaned.csv
# (header row included, index column omitted).
path = os.path.join(
    os.getcwd(), "data", "cleaning", "AMS_data", "Preps_Unit_Cleaned.csv"
)
Preps_Cleaned.to_csv(path, header=True, index=False)

### Get Preps with Nonstandard Unit

In [23]:
col_names = list(Preps.columns.values)

# Keep the preps whose StdUom is neither 'g' nor 'ml' (this includes rows where
# StdUom is NaN), renumbered from 0. A vectorised filter replaces the row-by-row
# rebuild and keeps the column dtypes of Preps.
needs_manual_uom = ~Preps['StdUom'].isin(['g', 'ml'])
Preps_Nonstd = Preps.loc[needs_manual_uom, col_names].reset_index(drop=True)

In [24]:
# Display the preps that still lack a standard unit.
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-2824,BC - Tomato Focacia,14.0,ea,Y,,
1,P-2424,BCPrep - Asian Sesame Wrap,10.0,ea,Y,,
2,P-10674,BCPrep - BACON WRAP,1.0,ea,Y,,
3,P-1520,BCPrep - Chicken Quinoa Wrap,10.0,ea,Y,,
4,P-1488,BCPrep - Chicken Salad,1.0,ea,Y,,
5,P-2535,BCPrep - Egg Salad,1.0,ea,Y,,
6,P-4582,BCPrep - Ham & Swiss Croissant,1.0,ea,Y,,
7,P-4811,BCPrep - Mediterranean Wrap,10.0,ea,Y,,
8,P-9279,BCPrep - Salsa Wrap,1.0,ea,Y,,
9,P-10863,BCPrep - T-Bird Muffin,1.0,ea,Y,,


In [25]:
# Write the preps without a standard unit to data/cleaning/AMS_data/Preps_NonstdUom.csv
# (header row included, index column omitted).
path = os.path.join(
    os.getcwd(), "data", "cleaning", "AMS_data", "Preps_NonstdUom.csv"
)
Preps_Nonstd.to_csv(path, header=True, index=False)

In [26]:
# NEED TO CONTINUE FROM HERE, FINISH THE UNIT UPDATES
# Load the prep unit updates; per the original note this file is filled in manually.
# The path is built the same way as every other load in this notebook.
update_prep = pd.read_csv(os.path.join(os.getcwd(), "data", "cleaning", "update", "AMS_data", "Preps_UpdateUom.csv"))
update_prep

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N,1511.82,g
1,P-14560,2023 Chicken Caesar wrap Prep,1.0,ea,Y,433.59,g
2,P-9003,2022 Gallery Burger prep,1.0,ea,N,501.82,g
3,P-17358,2023 Poutine Prep,1.0,ea,N,705.80,g
4,P-15006,2023 Power Punch Salad Prep,1.0,ea,N,416.73,g
...,...,...,...,...,...,...,...
88,P-3045,Greek Salad,1.0,ea,Y,303.20,g
89,P-7201,Harissa Chicken (Marinated),120.0,ea,Y,25300.00,g
90,P-4814,Waffles,55.0,ea,Y,3711.50,g
91,P-18292,2023 Vegan Caesar Wrap Prep,1.0,ea,Y,633.21,g


In [27]:
# Compare the PrepId values of the two frames as sets.
update = set(update_prep["PrepId"])
preps = set(Preps_Nonstd["PrepId"])

# PrepIds that are in Preps_Nonstd but missing from update_prep.
difference = preps.difference(update)

# Show the missing PrepIds.
print(difference)


{'P-1434', 'P-9279', 'P-2824'}


***

## New Items

In [28]:
# Load the current item list that already carries an emission-factor CategoryID
# (data/mapping/AMS_data/Items_List_Assigned.csv) and preview its first rows.
Items_Assigned = pd.read_csv(
    os.path.join(os.getcwd(), "data", "mapping", "AMS_data", "Items_List_Assigned.csv")
)
Items_Assigned.head()

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
0,I-1971,35,LIMES,0.25,ea,1.0,ea,N,
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N,
2,I-15803,31,Red Bull Watermelon,1.0,can,1.0,can,N,
3,I-5505,36,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N,
4,I-8667,58,ITEM GARLIC MAYO,2.0,fl oz,1.0,ml,N,


In [29]:
# Number of (rows, columns) in the assigned item list.
Items_Assigned.shape

(534, 9)

In [30]:
# Distinct InventoryGroup values present in the assigned item list.
Items_Assigned["InventoryGroup"].unique()

array(['N', 'Y'], dtype=object)

In [31]:
# Distinct CategoryID values present in the assigned item list.
Items_Assigned["CategoryID"].unique()

array([35, 55, 31, 36, 58, 24, 16, 20,  3, 40,  8, 38, 48, 54, 44, 18, 32,
       56, 61, 39, 43, 37,  5, 26, 25,  9,  4, 22,  6, 49, 12, 50, 41, 17,
       57, 11, 34,  1, 45, 42, 59, 46, 10, 51, 53, 21, 28,  7, 13, 30, 14])

In [32]:
# Inspect the items that are assigned to CategoryID 55.
Items_Assigned[Items_Assigned["CategoryID"] == 55]

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N,
33,I-2640,55,Water - Tap,250.0,ml,2.5,ml,N,
92,I-13817,55,FRENCH AUS JUS,490.0,g,5.0,g,N,
102,I-15592,55,JUICE ORANGE CONC 6+1,500.0,ml,13.0,ml,N,
121,I-3036,55,MIRIN 5.28GL,720.0,ml,2100.0,fl oz,N,
171,I-2103,55,PEACH SLCD IN PEAR JUICE,300.0,ml,300.0,ml,N,
172,I-6204,55,Juice - Lime Fresh Squeezed,30.0,g,300.0,oz,N,
212,I-17946,55,JUICE orange 100% tetra,1.0,L,6.0,ml,Y,
228,I-6203,55,Juice - Lemon Fresh Squeezed,50.0,g,1.2,oz,Y,
270,I-17129,55,Boiling Water,3000.0,g,2000.0,g,N,


### Get the List of New Items

In [33]:
# Split Items by whether their ItemId already exists in Items_Assigned.
# Items that exist are reported; the rest become New_Items (renumbered from 0).
col_names = list(Items.columns.values)

# One vectorised membership test replaces a scan of Items_Assigned per row.
already_assigned = Items['ItemId'].isin(Items_Assigned['ItemId'])

# Report the existing items in their original row order.
existing_pairs = Items.loc[already_assigned, ['ItemId', 'Description']]
for ItemId, descrp in existing_pairs.itertuples(index=False, name=None):
    print("ItemID: ", ItemId, descrp, " already exists in the database")

New_Items = Items.loc[~already_assigned, col_names].reset_index(drop=True)

ItemID:  I-4099 Olive Oil 100 Pct  already exists in the database
ItemID:  I-2402 TOMATO - 6 X 7 MED  already exists in the database
ItemID:  I-1874 GARLIC WHOLE PEELED  already exists in the database
ItemID:  I-14422 Onions Green 3ct   already exists in the database
ItemID:  I-1995 MINT LEAVES  already exists in the database
ItemID:  I-2349 SPINACH WASHED TRIMMED  already exists in the database
ItemID:  I-4118 ITEM - Tahini Satay Sauce  already exists in the database
ItemID:  I-4119 CHEESE SHRED CHEDDAR  already exists in the database
ItemID:  I-2992 Bacon Pre-Ckd 30-34 ct  already exists in the database
ItemID:  I-1859 DILL - FRESH   already exists in the database
ItemID:  I-4125 ITEM ORGANIC QUINOA  already exists in the database
ItemID:  I-1982 MAYONNAISE GF  already exists in the database
ItemID:  I-5220 Lettuce Green Leaf Fillet  already exists in the database
ItemID:  I-4127 ITEM SALAD CHICKEN MIX  already exists in the database
ItemID:  I-4129 ITEM SALAD EGG MIX  already exists

In [34]:
# Add an empty CategoryID column right after ItemId, to be filled in later.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError
# when the column already exists.
if "CategoryID" not in New_Items.columns:
    New_Items.insert(1, "CategoryID", '')
New_Items

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4114,,"TORTILLA 10"" FLOUR PRESSED TF",10.0,ea,10.0,ea,Y
1,I-4115,,ITEM SLICED RED ONION,50.0,g,10.0,ea,Y
2,I-4116,,ITEM SHAVED HERL CARROTS,150.0,g,10.0,ea,Y
3,I-4117,,ITEM SESAME TOFU,2.0,Kg,10.0,ea,Y
4,I-4120,,ITEM AIOLI CHIPOLTE,10.0,ml,1.0,ea,Y
5,I-4126,,MULTIGRAIN CLUBHOUSE,2.0,slice,1.0,ea,Y
6,I-4136,,ITEM - Black Olive Hummus,0.5,L,10.0,ea,Y
7,I-4137,,"TORTILLA 10"" TOMATO PRESSED",1.0,ea,1.0,ea,Y
8,I-4138,,ITEM SLICED TURKEY,25.0,g,1.0,ea,Y
9,I-4142,,Bread Loaf Sourdgh Cranb T/S,2.0,slice,1.0,ea,Y


In [35]:
# Number of (rows, columns) in the new item list.
New_Items.shape

(21, 8)

In [36]:
# Save the new items to a CSV file whose name starts with today's date
# (data/mapping/AMS_data/new items/<date>_New_Items.csv).
# Nothing is written when New_Items has no rows.
if not New_Items.empty:
    path = os.path.join(
        os.getcwd(), "data", "mapping", "AMS_data", "new items",
        f"{datetime.now().date()}_New_Items.csv",
    )
    New_Items.to_csv(path, header=True, index=False)

***
## Data Summary

In [37]:
# Summarise the three outputs of this notebook: one row per frame, holding its
# row count ('count') and its number of columns ('columns').
datasum = pd.DataFrame.from_dict(
    {
        'New_Items': New_Items.shape,
        'Preps_Nonstd': Preps_Nonstd.shape,
        'Items_Nonstd': Items_Nonstd.shape,
    },
    orient='index',
    columns=['count', 'columns'],
)
datasum

Unnamed: 0,count,columns
New_Items,21,8
Preps_Nonstd,13,7
Items_Nonstd,27,5


In [38]:
# List the column names of New_Items.
print(New_Items.columns)

Index(['ItemId', 'CategoryID', 'Description', 'CaseQty', 'CaseUOM', 'PakQty',
       'PakUOM', 'InventoryGroup'],
      dtype='object')
