# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part II: Data Cleaning

## Set up and Import Libraries

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime

In [2]:
# RUN ONLY ONCE: each execution moves the working directory one more level up.
# os.chdir is used to change the current directory to the specified path
os.chdir("../") # Sets path to the repo folder as it is one level above where this file exists!
path = os.getcwd()  # the working directory after the change
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


***

## Import Preprocessed Datasets

In [10]:
"""
INPUT: a Dataframe
OUTPUT: NONE
Description: Prints a description of the datasets
"""
def DescribeDataset(df):
    """Print a short description of a dataset.

    INPUT: a DataFrame
    OUTPUT: None
    Description: prints the dtype of every column and the shape of the
    DataFrame, then displays its first rows.
    """
    print(df.dtypes)
    print("\nShape: ", df.shape)
    # NOTE(review): `display` is not imported in this file; presumably it is
    # the helper the notebook environment provides - confirm before running
    # this as a plain script.
    display(df.head())

In [11]:
# Load the items list from data/preprocessed/AMS_data/Items_List.csv
# and print a short description of it.
items_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
Items = pd.read_csv(items_csv)
DescribeDataset(Items)

ItemId             object
Description        object
CaseQty           float64
CaseUOM            object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (559, 7)


Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1971,LIMES,0.25,ea,1.0,ea,N
1,I-8228,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N
2,I-15803,Red Bull Watermelon,1.0,can,1.0,can,N
3,I-5505,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N
4,I-8667,ITEM GARLIC MAYO,2.0,fl oz,1.0,ml,N


In [12]:
# Load the ingredients list from data/preprocessed/AMS_data/Ingredients_List.csv
# and print a short description of it.
ingredients_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients = pd.read_csv(ingredients_csv)
DescribeDataset(Ingredients)

IngredientId     object
Qty             float64
Uom              object
Recipe           object
dtype: object

Shape:  (2158, 4)


Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-1971,0.25,ea,R-17284
1,I-8228,1.0,fl oz,R-17284
2,I-15803,1.0,can,R-17284
3,I-5505,0.25,HEAD,R-18292
4,I-8667,2.0,fl oz,R-18292


In [13]:
# Load the preps list from data/preprocessed/AMS_data/Preps_List.csv
# and print a short description of it.
preps_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preps = pd.read_csv(preps_csv)
DescribeDataset(Preps)

PrepId             object
Description        object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (73, 5)


Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N
1,P-14560,2022 Caesar Wrap prep,1.0,ea,N
2,P-9003,2022 Gallery Burger prep,1.0,ea,N
3,P-17305,2022 Hummus prep,1600.0,g,N
4,P-17358,2022 Poutine Prep,1.0,PORT,N


In [14]:
# Load Products_List.csv (created in 1_data preprocessing) from
# data/preprocessed/AMS_data and print a short description of it.
products_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
Products = pd.read_csv(products_csv)
DescribeDataset(Products)

ProdId         object
Description    object
SalesGroup     object
dtype: object

Shape:  (301, 3)


Unnamed: 0,ProdId,Description,SalesGroup
0,P-15019,Butter Chicken Prep 2023,N
1,P-18296,Teriyaki Tempeh Prep 2023,N
2,P-17366,Chicken Teriyaki Prep 2023,Y
3,P-18330,Tuscan Prep 2023,N
4,P-14560,2022 Caesar Wrap prep,N


***
## Converter

### Create Unit Converter

In [15]:
# Load the standard unit conversion table from
# data/external/standard_conversions.csv and preview its first rows.
std_conversions_csv = os.path.join(os.getcwd(), "data", "external", "standard_conversions.csv")
Std_Unit = pd.read_csv(std_conversions_csv)
Std_Unit.head()

Unnamed: 0,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,4.9289,1,tsp,4.9289,ml
1,14.787,1,Tbsp,14.787,ml
2,946.35,1,qt,946.35,ml
3,473.17625,1,pt,473.17625,ml
4,28.3495,1,oz,28.3495,g


In [16]:
# Separate the source units by the standard unit they convert to.
# liquid_unit: units of measurement that are converted to millilitres ('ml')
# solid_unit:  units of measurement that are converted to grams ('g')
# tolist() turns the selected pandas Series into a plain Python list.

converts_to_ml = Std_Unit['ConvertToUom'] == 'ml'
converts_to_g = Std_Unit['ConvertToUom'] == 'g'
liquid_unit = Std_Unit.loc[converts_to_ml, 'ConvertFromUom'].tolist()
solid_unit = Std_Unit.loc[converts_to_g, 'ConvertFromUom'].tolist()

In [17]:
# Construct a standard unit converter
def std_converter(qty, uom):
    """Convert a quantity to the standard unit listed in Std_Unit.

    Parameters:
        qty: quantity to convert; passed through float() when a conversion
            row exists.
        uom: unit of measurement, matched exactly against the
            'ConvertFromUom' column of Std_Unit.

    Returns:
        (Qty, Uom): qty multiplied by the row's 'Multiplier' together with
        the row's 'ConvertToUom' when uom is found; otherwise the inputs
        unchanged.
    """
    # Select the matching rows once instead of scanning the table three times.
    matches = Std_Unit.loc[Std_Unit['ConvertFromUom'] == uom]
    if matches.empty:
        return (qty, uom)
    # Take scalars from the first matching row: float() on a whole Series is
    # deprecated in recent pandas and fails when several rows match.
    first_match = matches.iloc[0]
    Qty = float(qty) * float(first_match['Multiplier'])
    Uom = first_match['ConvertToUom']
    return (Qty, Uom)

***
## Items with Non-standard Units

In [18]:
# Show the Ingredients table (IngredientId, Qty, Uom, Recipe) for inspection.
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-1971,0.25,ea,R-17284
1,I-8228,1.00,fl oz,R-17284
2,I-15803,1.00,can,R-17284
3,I-5505,0.25,HEAD,R-18292
4,I-8667,2.00,fl oz,R-18292
...,...,...,...,...
2153,I-2262,1.00,g,P-18049
2154,I-14434,80.00,g,P-18049
2155,I-15477,1.00,PORT,P-18049
2156,I-15637,20.00,g,P-18049


In [19]:
# Show the Items table for inspection.
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1971,LIMES,0.25,ea,1.0,ea,N
1,I-8228,Grey Goose 1.14L,1.00,fl oz,1.0,fl oz,N
2,I-15803,Red Bull Watermelon,1.00,can,1.0,can,N
3,I-5505,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N
4,I-8667,ITEM GARLIC MAYO,2.00,fl oz,1.0,ml,N
...,...,...,...,...,...,...,...
554,I-14843,PieR Item Wild Mushroom Mix,2.00,oz,1.0,g,N
555,I-18348,Gremolata 2022,5.00,g,1.0,g,N
556,I-15425,G21 Southwest Tofu Scrambled,1.00,PORT,1.0,PORT,N
557,I-2432,VEG BURGER GARDEN 4Z,1.00,ea,1.0,ea,N


In [20]:
# Collect the ingredients whose unit cannot be standardized.
# Column names of the ingredients table, kept for the selection below.
col_names = list(Ingredients.columns.values)

# Units that are already standard ('g', 'ml') or that the standard conversion
# table can convert; built once rather than once per row.
known_units = ['g', 'ml'] + liquid_unit + solid_unit

# A row is kept when its unit is not one of the known units and its
# ingredient id starts with 'I'. na=False makes a missing IngredientId count
# as "does not start with 'I'" instead of raising.
has_unknown_unit = ~Ingredients['Uom'].isin(known_units)
is_item = Ingredients['IngredientId'].str.startswith('I', na=False)

# Renumber the selected rows from 0 before removing duplicates.
Items_Nonstd = Ingredients.loc[has_unknown_unit & is_item, col_names].reset_index(drop=True)
# Keep only the first row of every IngredientId; .copy() gives an independent
# DataFrame that later cells can add columns to.
Items_Nonstd = Items_Nonstd.drop_duplicates(subset=['IngredientId']).copy()
Items_Nonstd

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-1971,0.250,ea,R-17284
1,I-15803,1.000,can,R-17284
2,I-5505,0.250,HEAD,R-18292
3,I-11706,1.000,ea,R-18292
4,I-13308,1.000,ea,R-18292
...,...,...,...,...
414,I-12339,1.000,PORT,P-3173
430,I-2586,0.005,tank,R-13392
443,I-9186,1.000,ea,R-15315
450,I-15425,1.000,PORT,R-15426


In [21]:
# Assign a Description column to Items_Nonstd by looking each IngredientId up
# in Items['ItemId'].
# First Description recorded for every ItemId, indexed by ItemId.
description_by_id = Items.drop_duplicates(subset=['ItemId']).set_index('ItemId')['Description']
ingredient_ids = Items_Nonstd['IngredientId']
# Ids without a matching item get 'Not Found'. Assigning the whole column at
# once also creates it when Items_Nonstd has no rows.
Items_Nonstd['Description'] = ingredient_ids.map(description_by_id).where(
    ingredient_ids.isin(description_by_id.index), 'Not Found')



In [22]:
# Preview the first rows of Items_Nonstd.
Items_Nonstd.head()

Unnamed: 0,IngredientId,Qty,Uom,Recipe,Description
0,I-1971,0.25,ea,R-17284,LIMES
1,I-15803,1.0,can,R-17284,Red Bull Watermelon
2,I-5505,0.25,HEAD,R-18292,Lettuce - Romaine
3,I-11706,1.0,ea,R-18292,Glry Side Fries 2023
4,I-13308,1.0,ea,R-18292,"TORTILLA 12"" FLOUR PRESSED"


In [23]:
# Write Items_Nonstd to data/cleaning/AMS_data/Items_Nonstd.csv
# (header row included, index column omitted).
cleaning_dir = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data")
path = os.path.join(cleaning_dir, "Items_Nonstd.csv")
Items_Nonstd.to_csv(path, header=True, index=False)

***
## Clean Preps Units

In [24]:
# Show the Preps table for inspection.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N
1,P-14560,2022 Caesar Wrap prep,1.0,ea,N
2,P-9003,2022 Gallery Burger prep,1.0,ea,N
3,P-17305,2022 Hummus prep,1600.0,g,N
4,P-17358,2022 Poutine Prep,1.0,PORT,N
...,...,...,...,...,...
68,P-18329,Truffle Cream prep 2023,310.0,g,Y
69,P-18330,Tuscan Prep 2023,1.0,PORT,N
70,P-15013,Wings Prep 2023,1.0,PORT,Y
71,P-18313,Yam Fries prep 2023,1.0,PORT,N


In [25]:
# Add two placeholder columns, StdQty and StdUom, to the Preps DataFrame.
# Both start out as NaN and are meant to hold the standardized quantity and
# unit of measurement.
for std_col in ('StdQty', 'StdUom'):
    Preps[std_col] = np.nan

In [26]:
# Show the Preps table for inspection.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N,,
1,P-14560,2022 Caesar Wrap prep,1.0,ea,N,,
2,P-9003,2022 Gallery Burger prep,1.0,ea,N,,
3,P-17305,2022 Hummus prep,1600.0,g,N,,
4,P-17358,2022 Poutine Prep,1.0,PORT,N,,
...,...,...,...,...,...,...,...
68,P-18329,Truffle Cream prep 2023,310.0,g,Y,,
69,P-18330,Tuscan Prep 2023,1.0,PORT,N,,
70,P-15013,Wings Prep 2023,1.0,PORT,Y,,
71,P-18313,Yam Fries prep 2023,1.0,PORT,N,,


In [27]:
# Function to apply the conversion logic
# Lower-cased PakUOM -> (multiplier, standard unit).
_PAK_UOM_TO_STD = {
    'g': (1, 'g'),
    'grams': (1, 'g'),
    'kg': (1000, 'g'),
    'ml': (1, 'ml'),
    'l': (1000, 'ml'),
}


def convert_units(row):
    """Standardize one row's pack quantity to grams or millilitres.

    Parameters:
        row: mapping or Series with 'PakUOM' and 'PakQty' entries.

    Returns:
        (quantity, unit): the quantity expressed in 'g' or 'ml' when PakUOM
        (compared case-insensitively) is g, grams, kg, ml or l; otherwise
        (np.nan, np.nan).
    """
    pak_uom = row['PakUOM']
    # A missing unit is NaN (a float) and has no .lower(); treat it like any
    # other unit that cannot be standardized.
    if not isinstance(pak_uom, str):
        return np.nan, np.nan
    conversion = _PAK_UOM_TO_STD.get(pak_uom.lower())
    if conversion is None:
        return np.nan, np.nan
    multiplier, std_uom = conversion
    return row['PakQty'] * multiplier, std_uom

# Apply the function to each row and store the results in StdQty / StdUom.
# The two columns are built as explicit lists so the cell also works when
# Preps has no rows (unpacking zip(*...) into two names fails in that case).
converted = [convert_units(prep_row) for _, prep_row in Preps.iterrows()]
Preps['StdQty'] = [std_qty for std_qty, _ in converted]
Preps['StdUom'] = [std_uom for _, std_uom in converted]

In [28]:
# Keep only the preps that have a standardized quantity and renumber the
# remaining rows from 0.
has_std_qty = Preps["StdQty"].notna()
Preps_Cleaned = Preps[has_std_qty].reset_index(drop=True)
Preps_Cleaned

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-17305,2022 Hummus prep,1600.0,g,N,1600.0,g
1,P-16793,2022 Pulled Pork Prep,6.0,Kg,Y,6000.0,g
2,P-18380,2023 Babaganoush Prep,750.0,g,N,750.0,g
3,P-18458,2023 Fresh burger Patty prep,2.6,Kg,Y,2600.0,g
4,P-18575,2023 Wings Hot sauce prep,1650.0,g,Y,1650.0,g
5,P-18531,2023.7 Chili Garlic Prep,1000.0,g,N,1000.0,g
6,P-18530,2023.7 Sambal Chili Prep,2000.0,g,N,2000.0,g
7,P-18052,Beets prep 2023,2.0,Kg,N,2000.0,g
8,P-18336,Cooked Penne pasta prep 2023,4.0,Kg,N,4000.0,g
9,P-18381,Eggplant Prep 2023,500.0,g,N,500.0,g


In [30]:
# Write the cleaned preps to data/cleaning/AMS_data/Preps_Unit_Cleaned.csv
# (header row included, index column omitted).
cleaning_dir = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data")
path = os.path.join(cleaning_dir, "Preps_Unit_Cleaned.csv")
Preps_Cleaned.to_csv(path, header=True, index=False)

### Get Preps with Nonstandard Unit

In [31]:
# Collect the preps whose StdUom is neither 'g' nor 'ml'.
col_names = list(Preps.columns.values)

# isin() is False for NaN, so preps with an empty StdUom are selected too.
lacks_std_unit = ~Preps['StdUom'].isin(['g', 'ml'])
# Renumber the selected rows from 0.
Preps_Nonstd = Preps.loc[lacks_std_unit, col_names].reset_index(drop=True)

In [32]:
# Show the preps that have no standardized unit.
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N,,
1,P-14560,2022 Caesar Wrap prep,1.0,ea,N,,
2,P-9003,2022 Gallery Burger prep,1.0,ea,N,,
3,P-17358,2022 Poutine Prep,1.0,PORT,N,,
4,P-15006,2022 Power Punch Salad prep,1.0,PORT,N,,
5,P-16795,2022 Pulled Pork Sandwich prep,1.0,PORT,N,,
6,P-14552,2022 Vegan Pulled Pork Prep,1.0,ea,N,,
7,P-18327,2023 Appi Platter prep,1.0,PORT,N,,
8,P-18453,2023 GM Tempeh curry prep,1.0,PORT,N,,
9,P-18451,2023 Gm truffle Fries prep,1.0,PORT,N,,


In [33]:
# Write Preps_Nonstd to data/cleaning/AMS_data/Preps_NonstdUom.csv
# (header row included, index column omitted).
cleaning_dir = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data")
path = os.path.join(cleaning_dir, "Preps_NonstdUom.csv")
Preps_Nonstd.to_csv(path, header=True, index=False)

In [34]:
# NEED TO CONTINUE FROM HERE, FINISH THE UNIT UPDATES
# Load Preps_UpdateUom.csv from data/cleaning/update/AMS_data.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being the repo folder - confirm.
update_prep = pd.read_csv("data/cleaning/update/AMS_data/Preps_UpdateUom.csv")
update_prep #Manual? NOTE(review): presumably this file is filled in by hand - confirm

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14356,[PREP KAPPA MAKI,6,PORT,N,1511.82,g
1,P-14560,2022 Caesar Wrap prep,1,ea,N,433.59,g
2,P-9003,2022 Gallery Burger prep,1,ea,N,501.82,g
3,P-17358,2022 Poutine Prep,1,PORT,N,705.8,g
4,P-15006,2022 Power Punch Salad prep,1,PORT,N,416.73,g
5,P-16795,2022 Pulled Pork Sandwich prep,1,PORT,N,525.09,g
6,P-14552,2022 Vegan Pulled Pork Prep,1,ea,N,479.38,g
7,P-18327,2023 Appi Platter prep,1,PORT,N,1354.57,g
8,P-18453,2023 GM Tempeh curry prep,1,PORT,N,690.0,g
9,P-18451,2023 Gm truffle Fries prep,1,PORT,N,2756.0,g


***

## New Items

In [35]:
# Load the current items list with its assigned Emission Factors Category ID
# from data/mapping/AMS_data/Items_List_Assigned.csv and preview the first rows.
items_assigned_csv = os.path.join(os.getcwd(), "data", "mapping", "AMS_data", "Items_List_Assigned.csv")
Items_Assigned = pd.read_csv(items_assigned_csv)
Items_Assigned.head()

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1971,35,LIMES,0.25,ea,1.0,ea,N
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N
2,I-15803,31,Red Bull Watermelon,1.0,can,1.0,can,N
3,I-5505,36,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N
4,I-8667,58,ITEM GARLIC MAYO,2.0,fl oz,1.0,ml,N


In [36]:
# (number of rows, number of columns) of Items_Assigned.
Items_Assigned.shape

(456, 8)

In [37]:
# Distinct values that occur in the InventoryGroup column.
Items_Assigned["InventoryGroup"].unique()

array(['N', 'Y'], dtype=object)

In [38]:
# Distinct values that occur in the CategoryID column.
Items_Assigned["CategoryID"].unique()

array([35, 55, 31, 36, 58, 24, 16,  3, 40,  8, 38, 48, 54, 44, 18, 32, 56,
       61, 20, 39, 43, 37,  5, 26, 25,  9,  4, 22,  6, 49, 12, 50, 41, 17,
       57, 11, 34,  1, 45, 42, 46, 10, 51, 53, 21, 28,  7, 13, 30])

In [39]:
# Show the items whose CategoryID is 55.
Items_Assigned[Items_Assigned["CategoryID"] == 55]

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N
33,I-2640,55,Water - Tap,250.0,ml,2.5,ml,N
92,I-13817,55,FRENCH AUS JUS,490.0,g,5.0,g,N
102,I-15592,55,JUICE ORANGE CONC 6+1,500.0,ml,13.0,ml,N
121,I-3036,55,MIRIN 5.28GL,720.0,ml,2100.0,fl oz,N
171,I-2103,55,PEACH SLCD IN PEAR JUICE,300.0,ml,300.0,ml,N
172,I-6204,55,Juice - Lime Fresh Squeezed,30.0,g,300.0,oz,N
212,I-17946,55,JUICE orange 100% tetra,1.0,L,6.0,ml,Y
228,I-6203,55,Juice - Lemon Fresh Squeezed,50.0,g,1.2,oz,Y
270,I-17129,55,Boiling Water,3000.0,g,2000.0,g,N


### Get the List of New Items

In [40]:
# Select the items whose ItemId is not yet in Items_Assigned (the "database")
# and collect them in a DataFrame.
col_names = list(Items.columns.values)

# One vectorized membership test instead of scanning Items_Assigned per row.
is_new_item = ~Items['ItemId'].isin(Items_Assigned['ItemId'])
# Renumber the selected rows from 0.
New_Items = Items.loc[is_new_item, col_names].reset_index(drop=True)
# Same rows as a list of dicts, kept in case another cell still reads it.
New_Items_List = New_Items.to_dict('records')

In [41]:
# Add an empty CategoryID column at position 1 of New_Items.
# The guard lets this cell be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "CategoryID" not in New_Items.columns:
    New_Items.insert(1, "CategoryID", '')
New_Items

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-11706,,Glry Side Fries 2023,1.0,ea,1.0,ea,N
1,I-16780,,2022 Vegan Caesar dressing,1.5,fl oz,1.0,fl oz,N
2,I-14715,,2022 Coleslaw mix,7.0,oz,1.0,Kg,N
3,I-15427,,Vegan Chipotle Mayo,2.0,fl oz,1.0,L,N
4,I-13956,,Sushi Rice,6.0,PORT,6.0,PORT,N
...,...,...,...,...,...,...,...,...
98,I-13398,,BIB Mix Sprite,1.0,L,18.5,L,Y
99,I-13955,,Sushi Su,1.0,L,50.0,L,N
100,I-13736,,2019 Ginger Garlic Paste,60.0,g,500.0,Kg,N
101,I-18348,,Gremolata 2022,5.0,g,1.0,g,N


In [42]:
# (number of rows, number of columns) of New_Items.
New_Items.shape

(103, 8)

In [None]:
# Save the new items to a .csv file whose name starts with today's date.
# Nothing is written when New_Items is empty.
if not New_Items.empty:
    today = datetime.now().date()
    new_items_dir = os.path.join(os.getcwd(), "data", "mapping", "AMS_data", "new items")
    path = os.path.join(new_items_dir, f"{today}_New_Items.csv")
    New_Items.to_csv(path, header=True, index=False)

In [48]:
# AMS version: read the New_Items_Added_2023-11-28 Excel workbook and write
# it out as a .csv file with the same name (index column omitted).
xlsx_path = "data/mapping/new items added/AMS_data/New_Items_2023/New_Items_Added_2023-11-28.xlsx"
csv_path = "data/mapping/new items added/AMS_data/New_Items_2023/New_Items_Added_2023-11-28.csv"
file2 = pd.read_excel(xlsx_path)
file2.to_csv(csv_path, index=False)

***
## Data Summary

In [49]:
# Summarize the shape (row count, column count) of each derived table.
summarized = {
    'New_Items': New_Items,
    'Preps_Nonstd': Preps_Nonstd,
    'Items_Nonstd': Items_Nonstd,
}
datasum = pd.DataFrame(
    [table.shape for table in summarized.values()],
    index=list(summarized),
    columns=['count', 'columns'],
)
datasum

Unnamed: 0,count,columns
New_Items,103,8
Preps_Nonstd,49,7
Items_Nonstd,134,5


In [50]:
# Print the column labels of New_Items.
print(New_Items.columns)

Index(['ItemId', 'CategoryID', 'Description', 'CaseQty', 'CaseUOM', 'PakQty',
       'PakUOM', 'InventoryGroup'],
      dtype='object')
