# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part II: Data Cleaning

## Set up and Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime

In [2]:
# RUN ONLY ONCE per kernel session: every execution moves the working directory
# up one more level, so re-running this cell changes where later cells look for data.
# os.chdir is used to change the current directory to the specified path
os.chdir("../") # Sets path to the repo folder as it is one level above where this file exists!
# Record and echo the resulting working directory as a sanity check.
# NOTE(review): `path` is reassigned by later cells that build output file paths.
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


***

## Import Preprocessed Datasets

In [3]:
"""
INPUT: a Dataframe
OUPUT: NONE
Description: Prints a description of the datasets
"""
def DescribeDataset(df):
    print(df.dtypes)
    print("\nShape: ", df.shape)
    display(df.head())

In [4]:
# Load the preprocessed items list (data/preprocessed/AMS_data/Items_List.csv),
# resolved against the current working directory, and summarise it.
items_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
Items = pd.read_csv(items_csv)
DescribeDataset(Items)

ItemId             object
Description        object
CaseQty           float64
CaseUOM            object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (218, 7)


Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1927,JALAPENO PEPPER - FRESH,20.0,g,1.25,L,Y
1,I-1958,KETCHUP VOL PAK,660.0,ml,1.25,L,Y
2,I-2025,MUSTARD DRY,22.5,g,1.25,L,Y
3,I-2323,SPICE CAYENNE SHAKER,15.0,ml,1.25,L,Y
4,I-2324,SPICE CHILI POWDER,20.0,g,1.25,L,Y


In [5]:
# Load the preprocessed ingredients list (data/preprocessed/AMS_data/Ingredients_List.csv)
# and summarise it.
ingredients_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients = pd.read_csv(ingredients_csv)
DescribeDataset(Ingredients)

IngredientId     object
Qty             float64
Uom              object
Recipe           object
dtype: object

Shape:  (805, 4)


Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,P-18275,1.0,ea,R-2654
1,P-5506,1.0,ea,R-7227
2,P-17013,2.0,Tbsp,R-7227
3,P-14560,1.0,ea,R-8990
4,P-18295,1.0,ea,R-5554


In [6]:
# Load the preprocessed preps list (data/preprocessed/AMS_data/Preps_List.csv)
# and summarise it.
preps_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preps = pd.read_csv(preps_csv)
DescribeDataset(Preps)

PrepId             object
Description        object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

Shape:  (114, 5)


Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-6068,Pro - Kansas City BBQ,1.25,L,Y
1,P-2824,2023 Alfredo Sauce Gal.,2250.0,ml,Y
2,P-1409,2023 Basmati Prep,2.0,Kg,Y
3,P-17360,2023 Beef Gravy (prep),4.5,L,Y
4,P-18275,2023 Beets Salad Prep,1.0,ea,Y


In [7]:
# Load Products_List.csv (created by the 1_data preprocessing step) and summarise it.
products_csv = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
Products = pd.read_csv(products_csv)
DescribeDataset(Products)

ProdId         object
Description    object
SalesGroup     object
dtype: object

Shape:  (38, 3)


Unnamed: 0,ProdId,Description,SalesGroup
0,R-2654,2024S Beets Salad,Y
1,R-7227,2024S Blackend Chick Carbonara,Y
2,R-8990,2024S Chicken Caesar (Wrap),Y
3,R-5554,2024S Chicken Pesto Penne,Y
4,R-10589,2024S Chicken Po'Boy,Y


***
## Converter

### Create Unit Converter

In [8]:
# Load the standard unit conversion table (data/external/standard_conversions.csv)
# into a DataFrame and preview its first rows.
std_unit_csv = os.path.join(os.getcwd(), "data", "external", "standard_conversions.csv")
Std_Unit = pd.read_csv(std_unit_csv)
Std_Unit.head()

Unnamed: 0,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,4.9289,1,tsp,4.9289,ml
1,14.787,1,Tbsp,14.787,ml
2,946.35,1,qt,946.35,ml
3,473.17625,1,pt,473.17625,ml
4,28.3495,1,oz,28.3495,g


In [9]:
# Separate the source units by the standard unit they convert to.
# Two lists are created:
#   liquid_unit - units of measurement that are converted to millilitres ('ml')
#   solid_unit  - units of measurement that are converted to grams ('g')
# tolist() turns the selected pandas Series into a plain Python list.

converts_to_ml = Std_Unit['ConvertToUom'] == 'ml'
converts_to_g = Std_Unit['ConvertToUom'] == 'g'
liquid_unit = Std_Unit.loc[converts_to_ml, 'ConvertFromUom'].tolist()
solid_unit = Std_Unit.loc[converts_to_g, 'ConvertFromUom'].tolist()

In [10]:
# Construct a standard unit converter
def std_converter(qty, uom, conversions=None):
    """
    Convert a quantity to its standard unit using a conversion table.

    Parameters
    ----------
    qty : number or numeric string
        Quantity expressed in `uom`.
    uom : str
        Unit of measurement of `qty`.
    conversions : pandas.DataFrame, optional
        Table with 'ConvertFromUom', 'Multiplier' and 'ConvertToUom' columns.
        Defaults to the notebook-level `Std_Unit` table.

    Returns
    -------
    tuple
        (Qty, Uom). When `uom` appears in the table, Qty is
        float(qty) * Multiplier and Uom is the table's 'ConvertToUom'.
        Otherwise `qty` and `uom` are returned unchanged.
    """
    table = Std_Unit if conversions is None else conversions
    matches = table.loc[table['ConvertFromUom'] == uom]
    if matches.empty:
        return (qty, uom)
    # Take the first matching row explicitly: calling float() on a Series is
    # deprecated in recent pandas and fails outright when several rows match.
    first_match = matches.iloc[0]
    Qty = float(qty) * float(first_match['Multiplier'])
    Uom = first_match['ConvertToUom']
    return (Qty, Uom)

***
## Items with Non-standard Units

In [11]:
# Display the ingredients table before filtering for non-standard units.
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,P-18275,1.0,ea,R-2654
1,P-5506,1.0,ea,R-7227
2,P-17013,2.0,Tbsp,R-7227
3,P-14560,1.0,ea,R-8990
4,P-18295,1.0,ea,R-5554
...,...,...,...,...
800,I-3005,60.0,g,P-15368
801,I-3804,18.0,g,P-15368
802,I-6865,500.0,g,P-15368
803,I-9117,1.0,L,P-15368


In [12]:
# Display the items table that descriptions are looked up from below.
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1927,JALAPENO PEPPER - FRESH,20.0,g,1.25,L,Y
1,I-1958,KETCHUP VOL PAK,660.0,ml,1.25,L,Y
2,I-2025,MUSTARD DRY,22.5,g,1.25,L,Y
3,I-2323,SPICE CAYENNE SHAKER,15.0,ml,1.25,L,Y
4,I-2324,SPICE CHILI POWDER,20.0,g,1.25,L,Y
...,...,...,...,...,...,...,...
213,I-4197,DRESSING Ranch BtrMilk,2.0,oz (fl),,,Y
214,I-4198,Lemon Pepper Seasoning,10.0,g,,,Y
215,I-4199,ITEM G21- MANGO HABANERO SAUCE,2.0,oz (fl),,,Y
216,I-4200,Fries Sweet Potato,400.0,g,,,Y


In [13]:
# Filter out the items whose unit information is unknown.
# A row is kept when ALL of the following hold:
#   - its unit is not 'g' or 'ml',
#   - its unit is not covered by the standard conversion table (liquid_unit + solid_unit),
#   - its IngredientId starts with 'I'.
col_names = list(Ingredients.columns.values)

known_units = ['g', 'ml'] + liquid_unit + solid_unit
has_unknown_unit = ~Ingredients['Uom'].isin(known_units)
# astype(str) keeps a missing/non-string IngredientId from raising; such rows
# simply do not match the 'I' prefix.
is_item = Ingredients['IngredientId'].astype(str).str.startswith('I')

# Renumber the matching rows from 0, then keep the first row per IngredientId
# so Items_Nonstd has one row for each ingredient. The trailing copy() makes
# Items_Nonstd an independent frame that later cells can add columns to.
Items_Nonstd = (
    Ingredients.loc[has_unknown_unit & is_item, col_names]
    .reset_index(drop=True)
    .drop_duplicates(subset=['IngredientId'])
    .copy()
)
Items_Nonstd

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-14127,1.0,ea,R-14296
1,I-14126,1.0,ea,R-14296
2,I-2087,0.01,bunch,R-14296
4,I-2121,1.0,slice,R-7065
5,I-2992,2.0,slice,R-7065
6,I-3498,1.0,slice,R-7065
7,I-14948,1.0,ea,R-7065
9,I-13813,0.09,LBS,R-4598
10,I-4207,1.0,ea,R-1916
11,I-4172,0.05,ea,R-7572


In [14]:
# Fill a Description column on Items_Nonstd: for each IngredientId take the
# first matching description from Items, or 'Not Found' when no ItemId matches.
for row_label, ingredient_id in Items_Nonstd['IngredientId'].items():
    matching_descriptions = Items.loc[Items['ItemId'] == ingredient_id, 'Description']
    if matching_descriptions.empty:
        Items_Nonstd.loc[row_label, 'Description'] = 'Not Found'
    else:
        Items_Nonstd.loc[row_label, 'Description'] = matching_descriptions.values[0]



In [15]:
# Preview the non-standard items, now including the Description column.
Items_Nonstd.head()

Unnamed: 0,IngredientId,Qty,Uom,Recipe,Description
0,I-14127,1.0,ea,R-14296,PITA POCKETS THIN
1,I-14126,1.0,ea,R-14296,"PITA THICK 5"""
2,I-2087,0.01,bunch,R-14296,PARSLEY
4,I-2121,1.0,slice,R-7065,PICKLE DILL SANDW LONG SLCD
5,I-2992,2.0,slice,R-7065,Bacon Pre-Ckd 30-34 ct


In [16]:
# Write the Items_Nonstd DataFrame to data/cleaning/AMS_data/Items_Nonstd.csv
# (header row included, DataFrame index omitted).
# NOTE(review): this reuses the name `path`, overwriting its previous value.
path = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data", "Items_Nonstd.csv")
Items_Nonstd.to_csv(path, index = False, header = True)

***
## Clean Preps Units

In [17]:
# Display the preps table before unit standardisation.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-6068,Pro - Kansas City BBQ,1.25,L,Y
1,P-2824,2023 Alfredo Sauce Gal.,2250.00,ml,Y
2,P-1409,2023 Basmati Prep,2.00,Kg,Y
3,P-17360,2023 Beef Gravy (prep),4.50,L,Y
4,P-18275,2023 Beets Salad Prep,1.00,ea,Y
...,...,...,...,...,...
109,P-4733,Tzatziki,4.00,Kg,Y
110,P-5741,Veggie Stock,6.00,Kg,Y
111,P-2307,Waffles,55.00,ea,Y
112,P-15368,Pro - Marinara Sauce,7.00,L,Y


In [18]:
# Creates 2 new columns called StdQty and StdUom in the Preps DataFrame, both filled with NaN.
# They are placeholders for the standardized quantities and units of measurement,
# which the conversion step further down assigns.
Preps['StdQty'] = np.nan
Preps['StdUom'] = np.nan

In [19]:
# Display Preps with the new (still empty) StdQty / StdUom columns.
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-6068,Pro - Kansas City BBQ,1.25,L,Y,,
1,P-2824,2023 Alfredo Sauce Gal.,2250.00,ml,Y,,
2,P-1409,2023 Basmati Prep,2.00,Kg,Y,,
3,P-17360,2023 Beef Gravy (prep),4.50,L,Y,,
4,P-18275,2023 Beets Salad Prep,1.00,ea,Y,,
...,...,...,...,...,...,...,...
109,P-4733,Tzatziki,4.00,Kg,Y,,
110,P-5741,Veggie Stock,6.00,Kg,Y,,
111,P-2307,Waffles,55.00,ea,Y,,
112,P-15368,Pro - Marinara Sauce,7.00,L,Y,,


In [20]:
# Function to apply the conversion logic
def convert_units(row):
    """
    Convert one row's pack quantity to a standard unit (grams or millilitres).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'PakQty' (numeric) and 'PakUOM' (unit label).

    Returns
    -------
    tuple
        (quantity, unit) with unit 'g' or 'ml' when 'PakUOM' is one of
        g / grams / kg / ml / l / lbs (case-insensitive, surrounding
        whitespace ignored). (np.nan, np.nan) for any other unit, including
        a missing or non-string 'PakUOM'.
    """
    # Lower-cased source unit -> (multiplier, standard unit).
    # 453.59 is the grams-per-pound factor used for 'lbs'.
    factors = {
        'g': (1, 'g'),
        'grams': (1, 'g'),
        'kg': (1000, 'g'),
        'ml': (1, 'ml'),
        'l': (1000, 'ml'),
        'lbs': (453.59, 'g'),
    }
    pak_uom = row['PakUOM']
    # A missing unit is read by pandas as NaN (a float), which has no .lower().
    if not isinstance(pak_uom, str):
        return np.nan, np.nan
    conversion = factors.get(pak_uom.strip().lower())
    if conversion is None:
        return np.nan, np.nan
    multiplier, std_uom = conversion
    return row['PakQty'] * multiplier, std_uom

# Apply the function to each row
# apply(axis=1) calls convert_units once per prep; zip(*...) regroups the returned
# (quantity, unit) pairs into one sequence of quantities and one of units.
# NOTE(review): the two-target unpacking presumably fails when Preps has no rows — confirm.
Preps['StdQty'], Preps['StdUom'] = zip(*Preps.apply(convert_units, axis=1))

In [21]:
# Keep only the preps that received a standard quantity, renumbered from 0.
Preps_Cleaned = Preps[Preps["StdQty"].notna()].reset_index(drop=True)
Preps_Cleaned

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-6068,Pro - Kansas City BBQ,1.25,L,Y,1250.0,ml
1,P-2824,2023 Alfredo Sauce Gal.,2250.00,ml,Y,2250.0,ml
2,P-1409,2023 Basmati Prep,2.00,Kg,Y,2000.0,g
3,P-17360,2023 Beef Gravy (prep),4.50,L,Y,4500.0,ml
4,P-16778,2023 Candied walnut,1.00,Kg,Y,1000.0,g
...,...,...,...,...,...,...,...
60,P-1750,Smoked Paprika Rub,3.00,Kg,Y,3000.0,g
61,P-4733,Tzatziki,4.00,Kg,Y,4000.0,g
62,P-5741,Veggie Stock,6.00,Kg,Y,6000.0,g
63,P-15368,Pro - Marinara Sauce,7.00,L,Y,7000.0,ml


In [22]:
# Save cleaned preps list to data/cleaning/AMS_data/Preps_Unit_Cleaned.csv
# (header row included, DataFrame index omitted).
path = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data", "Preps_Unit_Cleaned.csv")
Preps_Cleaned.to_csv(path, index = False, header = True)

### Get Preps with Nonstandard Unit

In [23]:
# Collect the preps whose standardised unit is neither 'g' nor 'ml'
# (this includes rows where StdUom is still NaN) into their own DataFrame.
col_names = list(Preps.columns.values)
nonstd_prep_rows = [
    dict(prep_row)
    for _, prep_row in Preps.iterrows()
    if prep_row['StdUom'] not in ['g', 'ml']
]

Preps_Nonstd = pd.DataFrame(nonstd_prep_rows, columns = col_names)

In [24]:
# Display the preps that still lack a standard unit.
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-18275,2023 Beets Salad Prep,1.0,ea,Y,,
1,P-5506,2023 Blackened Carbonara Prep,1.0,ea,Y,,
2,P-8990,2023 Chicken Caesar Prep,1.0,ea,Y,,
3,P-14560,2023 Chicken Caesar wrap Prep,1.0,PORT,Y,,
4,P-10589,2023 Chicken Po'Boy Prep,1.0,PORT,Y,,
5,P-18349,2023 Cooked Linguini,80.0,PORT,Y,,
6,P-11706,2023 Gallery Fries (Side),1.0,ea,Y,,
7,P-5012,2023 Gallery Nachos Prep (Sml),1.0,ea,Y,,
8,P-16794,2023 Grilled Pineapple prep,15.0,PORT,Y,,
9,P-4657,2023 Jasmine Rice (Prep),12.0,PORT,Y,,


In [25]:
# Write the non-standard preps to data/cleaning/AMS_data/Preps_NonstdUom.csv
# (header row included, DataFrame index omitted).
path = os.path.join(os.getcwd(), "data", "cleaning", "AMS_data", "Preps_NonstdUom.csv")
Preps_Nonstd.to_csv(path, index = False, header = True)

In [26]:
# TODO: NEED TO CONTINUE FROM HERE, FINISH THE UNIT UPDATES
# Load the prep unit updates from data/cleaning/update/AMS_data/Preps_UpdateUom.csv.
# NOTE(review): this file appears to be maintained manually — confirm.
update_prep_csv = os.path.join(os.getcwd(), "data", "cleaning", "update", "AMS_data", "Preps_UpdateUom.csv")
update_prep = pd.read_csv(update_prep_csv)
update_prep

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N,1511.82,g
1,P-14560,2023 Chicken Caesar wrap Prep,1.0,ea,Y,433.59,g
2,P-9003,2022 Gallery Burger prep,1.0,ea,N,501.82,g
3,P-17358,2023 Poutine Prep,1.0,ea,N,705.80,g
4,P-15006,2023 Power Punch Salad Prep,1.0,ea,N,416.73,g
...,...,...,...,...,...,...,...
86,P-18470,2023 GM Marinated Olives,1.0,PORT,Y,143.00,g
87,P-6881,Candy Cane beets Prep 2023,1.0,LBS,Y,453.59,g
88,P-9785,Greek Salad,1.0,ea,Y,303.20,g
89,P-3045,Harissa Chicken (Marinated),120.0,ea,Y,25300.00,g


***

## New Items

In [27]:
# Load the current items list that already carries an emission-factor CategoryID
# (data/mapping/AMS_data/Items_List_Assigned.csv) and preview it.
items_assigned_csv = os.path.join(os.getcwd(), "data", "mapping", "AMS_data", "Items_List_Assigned.csv")
Items_Assigned = pd.read_csv(items_assigned_csv)
Items_Assigned.head()

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
0,I-1971,35,LIMES,0.25,ea,1.0,ea,N,
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N,
2,I-15803,31,Red Bull Watermelon,1.0,can,1.0,can,N,
3,I-5505,36,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N,
4,I-8667,58,ITEM GARLIC MAYO,2.0,fl oz,1.0,ml,N,


In [28]:
# (rows, columns) of the already-assigned items list.
Items_Assigned.shape

(526, 9)

In [29]:
# Distinct InventoryGroup values present in the assigned items.
Items_Assigned["InventoryGroup"].unique()

array(['N', 'Y'], dtype=object)

In [30]:
# Distinct CategoryID values present in the assigned items.
Items_Assigned["CategoryID"].unique()

array([35, 55, 31, 36, 58, 24, 16,  3, 40,  8, 38, 48, 54, 44, 18, 32, 56,
       61, 20, 39, 43, 37,  5, 26, 25,  9,  4, 22,  6, 49, 12, 50, 41, 17,
       57, 11, 34,  1, 45, 42, 46, 10, 51, 53, 21, 28,  7, 13, 30, 63, 59,
       14])

In [31]:
# Spot-check the items assigned to CategoryID 55.
Items_Assigned[Items_Assigned["CategoryID"] == 55]

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
1,I-8228,55,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N,
33,I-2640,55,Water - Tap,250.0,ml,2.5,ml,N,
92,I-13817,55,FRENCH AUS JUS,490.0,g,5.0,g,N,
102,I-15592,55,JUICE ORANGE CONC 6+1,500.0,ml,13.0,ml,N,
121,I-3036,55,MIRIN 5.28GL,720.0,ml,2100.0,fl oz,N,
171,I-2103,55,PEACH SLCD IN PEAR JUICE,300.0,ml,300.0,ml,N,
172,I-6204,55,Juice - Lime Fresh Squeezed,30.0,g,300.0,oz,N,
212,I-17946,55,JUICE orange 100% tetra,1.0,L,6.0,ml,Y,
228,I-6203,55,Juice - Lemon Fresh Squeezed,50.0,g,1.2,oz,Y,
270,I-17129,55,Boiling Water,3000.0,g,2000.0,g,N,


### Get the List of New Items

In [32]:
# Split Items by ItemId: rows whose ItemId is absent from Items_Assigned are
# collected as new items; every other row is reported as already existing.
col_names = list(Items.columns.values)
assigned_item_ids = Items_Assigned['ItemId'].values
New_Items_List = []

for _, item_row in Items.iterrows():
    if item_row['ItemId'] not in assigned_item_ids:
        New_Items_List.append(dict(item_row))
        continue
    print("ItemID: ", item_row['ItemId'], item_row['Description'], " already exists in the database")

New_Items = pd.DataFrame(New_Items_List, columns = col_names)

ItemID:  I-1927 JALAPENO PEPPER - FRESH  already exists in the database
ItemID:  I-1958 KETCHUP VOL PAK  already exists in the database
ItemID:  I-2025 MUSTARD DRY  already exists in the database
ItemID:  I-2323 SPICE CAYENNE SHAKER   already exists in the database
ItemID:  I-2324 SPICE CHILI POWDER   already exists in the database
ItemID:  I-2329 SPICE CUMIN SEED GROUND BOX  already exists in the database
ItemID:  I-4105 SPICE GARLIC GRANULATED   already exists in the database
ItemID:  I-4106 SPICE Pepper Black Grnd   already exists in the database
ItemID:  I-2366 SUGAR GOLDEN YEL  already exists in the database
ItemID:  I-2640 Water - Tap  already exists in the database
ItemID:  I-2958 Ginger Fresh  already exists in the database
ItemID:  I-3180 Vinegar White Wine  already exists in the database
ItemID:  I-4091 SALT KOSHER COARSE  already exists in the database
ItemID:  I-5095 Onion Granulated powder  already exists in the database
ItemID:  I-1783 CREAM WHIPPING 33%  already exists i

In [33]:
# Add an empty-string CategoryID column at position 1 of New_Items.
# Guarded so that re-running this cell does not raise
# "cannot insert CategoryID, already exists".
if "CategoryID" not in New_Items.columns:
    New_Items.insert(1, "CategoryID", '')
New_Items

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4196,,ITEM SAUCE BURGER,2.0,oz (fl),,,Y
1,I-4197,,DRESSING Ranch BtrMilk,2.0,oz (fl),,,Y
2,I-4200,,Fries Sweet Potato,400.0,g,,,Y
3,I-4207,,Belgian Waffle Ind.Wrapped 70g,1.0,ea,,,Y


In [34]:
# (rows, columns) of the new items table.
New_Items.shape

(4, 8)

In [35]:
# Store the list of new items in a .csv file whose name starts with today's date.
# Nothing is written when New_Items is empty.
if not New_Items.empty:
    today = str(datetime.now().date())
    path = os.path.join(os.getcwd(), "data", "mapping", "AMS_data", "new items", today + "_New_Items.csv")
    New_Items.to_csv(path, index = False, header = True)

In [36]:
# AMS version
# Re-save the New_Items_Added_2023-11-28 Excel workbook as a CSV next to it
# (same folder and base name, index omitted).
# NOTE(review): the dated file name is hard-coded — update it when a newer file is added.
file2 = pd.read_excel("data/mapping/new items added/AMS_data/New_Items_2023/New_Items_Added_2023-11-28.xlsx")
file2.to_csv("data/mapping/new items added/AMS_data/New_Items_2023/New_Items_Added_2023-11-28.csv", index=False)

***
## Data Summary

In [37]:
# Summarise the three output tables: one row per table, holding its
# (row count, column count) taken from DataFrame.shape.
table_shapes = {
    'New_Items': New_Items.shape,
    'Preps_Nonstd': Preps_Nonstd.shape,
    'Items_Nonstd': Items_Nonstd.shape,
}
datasum = pd.DataFrame(list(table_shapes.values()),
                       columns = ['count', 'columns'],
                       index = list(table_shapes.keys()))
datasum

Unnamed: 0,count,columns
New_Items,4,8
Preps_Nonstd,49,7
Items_Nonstd,44,5


In [38]:
# List the column names of New_Items (CategoryID was inserted at position 1 above).
print(New_Items.columns)

Index(['ItemId', 'CategoryID', 'Description', 'CaseQty', 'CaseUOM', 'PakQty',
       'PakUOM', 'InventoryGroup'],
      dtype='object')
