# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part II: Data Cleaning

## Set up and Import Libraries

In [1]:
#pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import csv
from itertools import islice
from decimal import Decimal
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime

In [3]:
# RUN ONLY ONCE
# os.chdir is used to change the current directory to the specified path
os.chdir("../") # Sets path to the repo folder as it is one level above where this file exists!
path = os.getcwd()
print(path)

/Users/ankurbhardwaj/Desktop/SEEDS/CFFS_Label_2024_25


***

## Import Preprocessed Datasets

In [4]:
# Read Items_List.csv
Items = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv"))
Items.dtypes

ItemId             object
Description        object
CaseQty           float64
CaseUOM            object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

In [5]:
# Display first 5 rows
Items.head()

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-14182,CHEESE PARM*GRATED,2.0,bag,2.5,Kg,DAIRY
1,I-72089,CHICKEN BRST STRP CKD^,2.0,bag,2.0,Kg,"PACKAGED BEER, CIDER"
2,I-65802,CREAM WHIP*36% 946ML,16.0,each,946.0,ml,DAIRY
3,I-68787,FOCACCIA ORGANIC,1.0,SHEET,1.0,SHEET,BREAD
4,I-3632,GARLIC POWDER*,2.1,Kg,2100.0,g,SPICES


In [6]:
# There are 486 rows and 7 columns
Items.shape

(27, 7)

In [7]:
# Read Ingredients_List.csv
Ingredients = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv"))
Ingredients.dtypes

IngredientId     object
Qty             float64
Uom              object
Conversion      float64
InvFactor       float64
Recipe           object
dtype: object

In [8]:
# Display first 5 rows
Ingredients.head()

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3642,500.0,g,1.0,333.3333,P-18907
1,I-6026,1.0,Kg,1000.0,0.6667,P-18907
2,I-3388,250.0,ml,0.001,27.7778,P-26216
3,I-3416,4.54,Kg,2.20462,0.5044,P-26216
4,I-5983,20.0,L,1.0,2.2222,P-26216


In [9]:
# There are 3278 rows and 6 columns
Ingredients.shape

(52, 6)

In [10]:
# Read Preps_List.csv
Preps = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv"))
Preps.dtypes

PrepId             object
Description        object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

In [11]:
# Display first 5 rows
Preps.head()

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-75391,COOKED|Pasta|Cavatappi,9.0,Kg,PREP
1,P-26216,COOKED|Pasta|Penne|WW*,9.0,Kg,PREP
2,P-75390,COOKED|Pasta|Spaghetti,9.0,Kg,PREP
3,P-75443,COOKED|Prawns,800.0,g,
4,P-18907,MIX|Salt & Pepper*,1.5,Kg,


In [12]:
# There are 546 rows and 5 columns
Preps.shape

(10, 5)

In [13]:
# Read Product_List.csv that was created from 1_data preprocessing
Products = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv"))
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [14]:
# Display first 5 rows
Products.head()

Unnamed: 0,ProdId,Description,SalesGroup
0,R-75442,ADDON|Chicken,ADD ONS
1,R-75441,ADDON|Meat Balls,ADD ONS
2,R-75444,ADDON|Prawns,ADD ONS
3,R-75446,ADDON|Roasted Veg,ADD ONS
4,R-75440,ENTREE|Pasta Bowl,FOOD


In [15]:
# There are 223 rows and 3 columns
Products.shape

(5, 3)

In [16]:
# Read Coversions_List.csv that was created from 1_data preprocessing
Conversions = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv"))
Conversions.dtypes

ConversionId       object
Multiplier        float64
ConvertFromQty    float64
ConvertFromUom     object
ConvertToQty      float64
ConvertToUom       object
dtype: object

In [17]:
# Display first 5 rows
Conversions.head()

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.877193,1.0,1.14L,1.14,L
2,,0.666667,1.0,1.5L,1.5,L
3,,0.571429,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L


In [18]:
# There are 270 rows and 6 columns
Conversions.shape

(56, 6)

***
## Update Conversion List

In [19]:
# Add the specific conversion info from the newly-processed data to a unit conversion database
Update_Conv = pd.read_csv(os.path.join(os.getcwd(), "data", "cleaning", "update", "Conv_UpdateConv.csv"))
Update_Conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,I-67659,0.008818,1.00,each,113.398,g
1,I-28697,0.005181,1.00,ea,193.000,g
2,I-47441,0.005181,1.00,ea,193.000,g
3,I-1905,0.012500,1.00,CT,80.000,g
4,I-47440,0.005051,1.00,ea,198.000,g
...,...,...,...,...,...,...
661,I-72015,0.002205,0.25,ea,113.400,g
662,I-61314,0.002000,0.30,pak,150.000,g
663,I-72016,0.002597,0.20,ea,77.000,g
664,I-4677,0.001102,0.10,CT,90.720,g


In [20]:
# return dataframe with null values
# There are no null values
subset_conv = Update_Conv[Update_Conv["Multiplier"].isna()]
subset_conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [21]:
# takes a data frame and assigns a new column called Multiplier to the dataframe
# uses iterrows() to iterate through the rows.Then subset_conv will have a new column named Multiplier with the computed values.
def assign_multiplier(df):
    for ind, row in df.iterrows():
        if row["ConvertFromQty"] == 0 or row["ConvertToQty"] == 0:
            df.loc[ind, "Multiplier"] = 1
        else:
            df.loc[ind, "Multiplier"] = row["ConvertFromQty"] / row["ConvertToQty"]
        
assign_multiplier(subset_conv)

# May 8th change: We want to 
assign_multiplier(Update_Conv)
#subset_conv

In [22]:
# convert dataframe to csv file
Update_Conv = pd.concat([Update_Conv, subset_conv], axis=0)
Update_Conv.to_csv("data/cleaning/update/Conv_UpdateConv.csv", index=False)

In [23]:
for index, row in Update_Conv.iterrows():
    Id = row['ConversionId'] 
    if Id in Conversions['ConversionId'].values:
        Conversions.drop(Conversions[Conversions['ConversionId'] == Id].index, inplace=True)
    else:
        print(f"Warning: 'ConversionId' {Id} not found in Conversions DataFrame. Skipping drop operation.")



In [24]:
frames = [Conversions, Update_Conv]
Conversions = pd.concat(frames).reset_index(drop=True, inplace=False).drop_duplicates()

In [25]:
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.000000,1.00,XXX,1.00,L
1,,0.877193,1.00,1.14L,1.14,L
2,,0.666667,1.00,1.5L,1.50,L
3,,0.571429,1.00,1.75 L,1.75,L
4,,0.500000,1.00,2L,2.00,L
...,...,...,...,...,...,...
717,I-72015,0.002205,0.25,ea,113.40,g
718,I-61314,0.002000,0.30,pak,150.00,g
719,I-72016,0.002597,0.20,ea,77.00,g
720,I-4677,0.001102,0.10,CT,90.72,g


In [26]:
# Convert the DataFrame: Conversions into a csv file called Conversions_Added.csv
path = os.path.join(os.getcwd(), "data", "cleaning", "Conversions_Added.csv")
Conversions.to_csv(path, index = False, header = True)

### Create Unit Converter

In [27]:
# Import standard unit conversion information and construct a dataframe
Std_Unit = pd.read_csv(os.path.join(os.getcwd(), "data", "external", "standard_conversions.csv"))
Std_Unit.head()

Unnamed: 0,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,4.9289,1,tsp,4.9289,ml
1,14.787,1,Tbsp,14.787,ml
2,946.35,1,qt,946.35,ml
3,473.17625,1,pt,473.17625,ml
4,28.3495,1,oz,28.3495,g


In [28]:
# Seperate uoms that converted to 'ml' or 'g'
liquid_unit = Std_Unit.loc[Std_Unit['ConvertToUom'] == 'ml', 'ConvertFromUom'].tolist()
solid_unit = Std_Unit.loc[Std_Unit['ConvertToUom'] == 'g', 'ConvertFromUom'].tolist()

In [29]:
# Construct a standard unit converter
def std_converter(qty, uom):
    if uom in Std_Unit['ConvertFromUom'].tolist():
        multiplier = Std_Unit.loc[Std_Unit['ConvertFromUom'] == uom, 'Multiplier']
        Qty = float(qty)*float(multiplier)
        Uom = Std_Unit.loc[Std_Unit['ConvertFromUom'] == uom, 'ConvertToUom'].values[0]
    else:
        Qty = qty
        Uom = uom
    return (Qty, Uom)

In [30]:
# Convert pounds to grams
std_converter(0.25,'lb')

  Qty = float(qty)*float(multiplier)


(113.398, 'g')

In [31]:
# Test the std_converter
assert std_converter(0.25,'lb') == (113.398, 'g')

  Qty = float(qty)*float(multiplier)


In [32]:
# Construct a unit converter for specific ingredients

# After this line below, spc_cov contains only the non-empty values from the 'ConversionId' column of the Conversions DataFrame.
spc_cov = list(filter(None, Conversions['ConversionId'].tolist()))


# Comments for spc_converter:
# The function checks if ingredient is in the liquid_unit or solid_unit lists. If so, it calls std_converter(qty, uom) to 
# convert the quantity and UOM to a standardized unit.

# If uom is not in liquid_unit or solid_unit it checks if ingre is in spc_cov, if it is and the ConvertToUom is equal to grams 
# then the function applies the factor to the qty argument to convert it to the standardized unit, and returns the result as
# a tuple containing the converted quantity and uom. If no conversion found, then it calls std_converter(qty, uom)

# If uom not in liquid_unit or solid_unit and if ingre is not in spc_cov then the function calls std_converter(qty, uom)

def spc_converter(ingre, qty, uom):
    if uom in liquid_unit + solid_unit:
        return std_converter(qty, uom)
    elif ingre in spc_cov:
        conversion = Conversions.loc[(Conversions['ConversionId'] == ingre) & (Conversions['ConvertFromUom'] == uom)
                                    & (Conversions['ConvertToUom'].isin(["ml", "g"]))]
        multiplier = conversion['Multiplier']
        if multiplier.empty:
            return std_converter(qty, uom)
        else: 
            Qty = float(qty)/float(multiplier)
            Uom = conversion['ConvertToUom'].values[0]
            return (Qty, Uom)
    else:
        return std_converter(qty, uom)

In [33]:
# spc_cov2 is the same as spc_cov but without null values labelled as "nan" in the list
import math
spc_cov2 = [item for item in spc_cov if not(pd.isnull(item)) == True]
spc_cov2

['I-3387',
 'I-3387',
 'I-3387',
 'I-3387',
 'I-3388',
 'I-3388',
 'I-3388',
 'I-3632',
 'I-3632',
 'I-3642',
 'I-3692',
 'I-3692',
 'I-3692',
 'I-3692',
 'I-4772',
 'I-4772',
 'I-4772',
 'I-5983',
 'I-5983',
 'I-67659',
 'I-28697',
 'I-47441',
 'I-1905',
 'I-47440',
 'I-13327',
 'I-54484',
 'I-2501',
 'I-52609',
 'I-52636',
 'I-2772',
 'I-20347',
 'I-54756',
 'I-41251',
 'I-2833',
 'I-54761',
 'I-54755',
 'I-54759',
 'I-63749',
 'I-54758',
 'I-53847',
 'I-34313',
 'I-19639',
 'I-63683',
 'I-19700',
 'I-3902',
 'I-63690',
 'I-48660',
 'I-2546',
 'I-1727',
 'I-2118',
 'I-2778',
 'I-63698',
 'I-2116',
 'I-2120',
 'I-32271',
 'I-38717',
 'I-31549',
 'I-53226',
 'I-32265',
 'I-41807',
 'I-41805',
 'I-63977',
 'I-43988',
 'I-42899',
 'I-13970',
 'I-54757',
 'I-51331',
 'I-38957',
 'I-41804',
 'I-41800',
 'I-55664',
 'I-41803',
 'I-54483',
 'I-54482',
 'I-42194',
 'I-42203',
 'I-42202',
 'I-42201',
 'I-42200',
 'I-42197',
 'I-42204',
 'I-40471',
 'I-40472',
 'I-40473',
 'I-40474',
 'I-42196'

In [34]:
# Here we pass in the ingredient (I-1120) the quantity of the ingredient (1) and the unit of measurement: CT -> count
# This gives us the number of grams of that ingredient
spc_converter('I-1120', 1, 'CT')

(1, 'CT')

In [35]:
Conversions.loc[Conversions["ConversionId"] == "I-19735"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
576,I-19735,0.01,1.0,CT,100.0,g


In [36]:
c_list = Conversions["ConversionId"].unique()
"I-68700" in c_list

True

In [37]:
spc_converter("I-14190", 1, "LOAF")

  Qty = float(qty)/float(multiplier)


(500.0, 'g')

In [38]:
spc_converter('I-47530', 7, 'ea')

  Qty = float(qty)/float(multiplier)


(210.0, 'g')

***
## Items with Non-standard Units

In [39]:
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3642,500.0,g,1.0,333.3333,P-18907
1,I-6026,1.0,Kg,1000.0,0.6667,P-18907
2,I-3388,250.0,ml,0.001,27.7778,P-26216
3,I-3416,4.54,Kg,2.20462,0.5044,P-26216
4,I-5983,20.0,L,1.0,2.2222,P-26216
5,I-6026,10.0,g,1.0,1.1111,P-26216
6,I-4772,1.0,BUNCH,1.0,0.0222,P-44728
7,I-14182,400.0,g,0.001,88.8889,P-74628
8,I-3632,15.0,g,1.0,3.3333,P-74628
9,I-3642,9.0,g,1.0,2.0,P-74628


In [40]:
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-14182,CHEESE PARM*GRATED,2.0,bag,2.5,Kg,DAIRY
1,I-72089,CHICKEN BRST STRP CKD^,2.0,bag,2.0,Kg,"PACKAGED BEER, CIDER"
2,I-65802,CREAM WHIP*36% 946ML,16.0,each,946.0,ml,DAIRY
3,I-68787,FOCACCIA ORGANIC,1.0,SHEET,1.0,SHEET,BREAD
4,I-3632,GARLIC POWDER*,2.1,Kg,2100.0,g,SPICES
5,I-4772,HERB PARSLEY MX,3.0,BUNCH,1.0,BUNCH,PRODUCE
6,I-37005,MEATBALLS BEEF,4.54,Kg,1000.0,g,MEAT
7,I-3387,OIL CANOLA*OLIVE OIL,6.0,can,3.0,L,FOOD - GROCERY
8,I-3388,OIL CANOLA*SALAD 100% PURE,4.0,can,3.0,L,FOOD - GROCERY
9,I-4757,ONIONS RED,25.0,lb,1.0,lb,PRODUCE


In [41]:
# Filter out the items whose unit information is unknown 

# We find the column names
col_names = list(Ingredients.columns.values)

# Create a Items_Nonstd list
Items_Nonstd = []

# If the unit of measurement is not grams or ml and ingredient id starts with I and the ingredient is not in ConversionId column of Conversions 
# then we add it to Items_Nonstd list
for index, row in Ingredients.iterrows():
    Ingre = Ingredients.loc[index,'IngredientId']
    Uom = Ingredients.loc[index,'Uom']
    if Uom not in ['g', 'ml'] and Uom not in liquid_unit + solid_unit and Ingre.startswith('I') and Ingre not in Conversions["ConversionId"].tolist():
        Dict = {}
        Dict.update(dict(row))
        Items_Nonstd.append(Dict)

# Create a DataFrame from Items_Nonstd list
Items_Nonstd = pd.DataFrame(Items_Nonstd, columns = col_names)
# Remove duplicate ingredients of the same properties so that Items_Nonstd has only unique rows. 
Items_Nonstd.drop_duplicates(subset=['IngredientId'], inplace=True,)
Items_Nonstd

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe


In [42]:
# Assigning a Description column to the Items_Nonstd    
for index, row in Items_Nonstd.iterrows():
    idx = row['IngredientId']
    filtered_items = Items.loc[Items['ItemId'] == idx, 'Description']
    if not filtered_items.empty:
        descrp = filtered_items.values[0]
        Items_Nonstd.loc[index, 'Description'] = descrp
    else:
        pass

In [43]:
Items_Nonstd.head()

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe


In [44]:
# Convert the Items_Nonstd DataFrame to a csv file
path = os.path.join(os.getcwd(), "data", "cleaning", "Items_Nonstd.csv")
Items_Nonstd.to_csv(path, index = False, header = True)

***
## Clean Preps Units

In [45]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-75391,COOKED|Pasta|Cavatappi,9.0,Kg,PREP
1,P-26216,COOKED|Pasta|Penne|WW*,9.0,Kg,PREP
2,P-75390,COOKED|Pasta|Spaghetti,9.0,Kg,PREP
3,P-75443,COOKED|Prawns,800.0,g,
4,P-18907,MIX|Salt & Pepper*,1.5,Kg,
5,P-75445,ROASTED|Veg,3.0,Kg,
6,P-74628,SAUCE|Alfredo,4.5,L,
7,P-75383,SAUCE|Marinara,4.5,L,
8,P-75559,SAUCE|Rose,4.0,L,
9,P-44728,YEILD|Chopped Parsley*,45.0,g,


In [46]:
preps_updateuom = pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")

In [47]:
# Creates 2 new columns called StdQty and StdUom in the Preps DataFrame. These columns contain NaN values
# Preparing to fill in these columns with standardized quantities and units of measurement 
Preps['StdQty'] = np.nan
Preps['StdUom'] = np.nan

In [48]:
# Convert uom into 'g' or 'ml' for each prep using the unit converter

# Retrieve the PrepId, PakQty, and PakUOM from the current row
# Pass these values to spc_converter, then we update the StdQty and StdUom columns of the current row with the converted values.
for index in Preps.index:
    PrepId = Preps.loc[index,'PrepId']
    Qty = Preps.loc[index,'PakQty']
    Uom = Preps.loc[index,'PakUOM']
    Preps.loc[index,'StdQty'] = spc_converter(PrepId, Qty, Uom)[0]
    Preps.loc[index,'StdUom'] = spc_converter(PrepId, Qty, Uom)[1]

  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Preps.loc[index,'StdUom'] = spc_converter(PrepId, Qty, Uom)[1]
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)
  Qty = float(qty)*float(multiplier)


In [49]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-75391,COOKED|Pasta|Cavatappi,9.0,Kg,PREP,9000.0,g
1,P-26216,COOKED|Pasta|Penne|WW*,9.0,Kg,PREP,9000.0,g
2,P-75390,COOKED|Pasta|Spaghetti,9.0,Kg,PREP,9000.0,g
3,P-75443,COOKED|Prawns,800.0,g,,800.0,g
4,P-18907,MIX|Salt & Pepper*,1.5,Kg,,1500.0,g
5,P-75445,ROASTED|Veg,3.0,Kg,,3000.0,g
6,P-74628,SAUCE|Alfredo,4.5,L,,4500.0,ml
7,P-75383,SAUCE|Marinara,4.5,L,,4500.0,ml
8,P-75559,SAUCE|Rose,4.0,L,,4000.0,ml
9,P-44728,YEILD|Chopped Parsley*,45.0,g,,45.0,g


In [50]:
# Save cleaned preps list to file
path = os.path.join(os.getcwd(), "data", "cleaning", "Preps_Unit_Cleaned.csv")
Preps.to_csv(path, index = False, header = True)

In [51]:
pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-35132,MARINATED|Lemon & Herb Chx,185.0,ea,PREP,24050.000000,g
1,P-26234,BATCH|Roasted Garlic Bread,16.0,ea,PREP,1280.000000,g
2,P-26170,GRILLED|NaanBread,1.0,ea,PREP,125.000000,g
3,P-16305,YIELD|Smokie (1pc),1.0,ea,,112.000000,g
4,P-26047,BOILED|Hard Boiled Eggs FT,50.0,ea,PREP,2500.000000,g
...,...,...,...,...,...,...,...
521,P-26631,GRL|Pancake|Chocolate Chip,24.0,ea,,4125.000000,g
522,P-55093,PREP|Ajitama - Ramen Eggs,200.0,CT,,96800.000066,g
523,P-50511,TOASTED|French Toast,1.0,ea,,135.000000,g
524,P-51992,YIELD|Bread|Sourdough 5/8,36.0,slice,,650.000000,g


### Get Preps with Nonstandard Unit

In [52]:
col_names = list(Preps.columns.values)
Preps_Nonstd = []

for index, row in Preps.iterrows():
    StdUom = Preps.loc[index,'StdUom']
    if StdUom not in ['g', 'ml']:
        Dict = {}
        Dict.update(dict(row))
        Preps_Nonstd.append(Dict)

Preps_Nonstd = pd.DataFrame(Preps_Nonstd, columns = col_names)

In [53]:
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom


In [54]:
# for _,row in Preps_Nonstd.iterrows():
#     try:
#         prepID = row["PrepId"]
#         recipe = Ingredients.loc[Ingredients["Recipe"] == prepID]
#         final_weight = 0
#         if not recipe.empty:
#             for _,row  in recipe.iterrows():
#                 qty = row["Qty"]
#                 uom = row["Uom"]
#                 itemID = row["IngredientId"]
#                 converted = spc_converter(itemID, qty, uom)
#                 if converted[1] == "g" or converted[1] == "ml":
#                     final_weight += converted[0]
#                 else:
#                     if row["IngredientId"].startswith("P"):
#                         new_weight = preps_updateuom.loc[preps_updateuom["PrepId"] == row["IngredientId"]]["StdQty"]
#                         if not new_weight.empty:
#                             final_weight += new_weight
#                         else:
#                             raise Exception(f'Item with ID {row["IngredientId"]} not in standard units,it is in the unit {converted[1]}')         
#                     elif row["IngredientId"].startswith("I"):
#                         conversion = Conversions.loc[Conversions["ConversionId"] == row["IngredientId"]]
#                         print(conversion)
#                         if not conversion.empty and conversion["ConvertFromUom"] == row["Uom"]:
#                             final_weight += conversion["Multiplier"] * qty
#                         else:
#                             error_string = f'Item with ID {row["IngredientId"]} not in standard units,it is in the unit {converted[1]}'
#                             if conversion["ConvertFromUom"] != row["Uom"]:
#                                 error_string += f'Item is in UOM {conversion["ConvertFromUom"]}'
#                             else:
#                                 error_string = error_string
#                             raise Exception(error_string)
#         else:
#             raise Exception("PrepId not found in Ingredients")
#         Preps.loc[Preps["PrepId"] == prepID, "StdQty"] = final_weight
#     except Exception as e:
#         print(e)

# Using vectorized operations to avoid iterrows() for better performance
preps_updateuom_dict = preps_updateuom.set_index("PrepId")["StdQty"].to_dict()
conversions_dict = Conversions.set_index("ConversionId")[["ConvertFromUom", "Multiplier"]].to_dict()

def calculate_final_weight(prepID):
    try:
        recipe = Ingredients[Ingredients["Recipe"] == prepID]
        if recipe.empty:
            raise ValueError("PrepId not found in Ingredients")

        final_weight = 0
        for _, recipe_row in recipe.iterrows():
            qty = recipe_row["Qty"]
            uom = recipe_row["Uom"]
            itemID = recipe_row["IngredientId"]

            converted = spc_converter(itemID, qty, uom)
            if converted[1] in ["g", "ml"]:
                final_weight += converted[0]
            else:
                if itemID.startswith("P"):
                    # Handle prep item conversion using preps_updateuom
                    new_weight = preps_updateuom_dict.get(itemID)
                    if new_weight is not None:
                        final_weight += new_weight
                    else:
                        raise ValueError(f'Item with ID {itemID} not in standard units, it is in the unit {converted[1]}')
                elif itemID.startswith("I"):
                    # Handle ingredient conversion using Conversions
                    conversion_uom = conversions_dict["ConvertFromUom"].get(itemID)
                    conversion_multiplier = conversions_dict["Multiplier"].get(itemID)

                    if conversion_uom and conversion_uom == recipe_row["Uom"]:
                        final_weight += conversion_multiplier * qty
                    else:
                        error_string = f'Item with ID {itemID} not in standard units, it is in the unit {converted[1]}'
                        if conversion_uom and conversion_uom != recipe_row["Uom"]:
                            error_string += f'. Item is in UOM {conversion_uom}'
                        raise ValueError(error_string)

        return final_weight
    except Exception as e:
        # Handle or log the error (you can log the error message if needed)
        return str(e)

# Apply the function to Preps_Nonstd and update the Preps dataframe
Preps_Nonstd["StdQty"] = Preps_Nonstd["PrepId"].apply(calculate_final_weight)
Preps_Nonstd["StdUom"] = "g"
# Update the Preps dataframe with the calculated weights
for index, row in Preps_Nonstd.iterrows():
    PrepId = row['PrepId']
    StdQty = row['StdQty']
    Preps.loc[Preps['PrepId'] == PrepId, 'StdQty'] = StdQty
    Preps.loc[Preps['PrepId'] == PrepId, 'StdUom'] = 'g'

In [55]:
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom


In [56]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-75391,COOKED|Pasta|Cavatappi,9.0,Kg,PREP,9000.0,g
1,P-26216,COOKED|Pasta|Penne|WW*,9.0,Kg,PREP,9000.0,g
2,P-75390,COOKED|Pasta|Spaghetti,9.0,Kg,PREP,9000.0,g
3,P-75443,COOKED|Prawns,800.0,g,,800.0,g
4,P-18907,MIX|Salt & Pepper*,1.5,Kg,,1500.0,g
5,P-75445,ROASTED|Veg,3.0,Kg,,3000.0,g
6,P-74628,SAUCE|Alfredo,4.5,L,,4500.0,ml
7,P-75383,SAUCE|Marinara,4.5,L,,4500.0,ml
8,P-75559,SAUCE|Rose,4.0,L,,4000.0,ml
9,P-44728,YEILD|Chopped Parsley*,45.0,g,,45.0,g


In [57]:
# Filter out preps with nonstandard uom but have information already

Manual_PrepU = pd.read_csv(os.path.join(os.getcwd(), "data", "cleaning", "update", "Preps_UpdateUom.csv"))

col_names = list(Preps_Nonstd.columns.values)
Preps_Nonstd_na = []

# for index, row in Preps_Nonstd.iterrows():
#     PrepId = Preps_Nonstd.loc[index,'PrepId']
#     if PrepId not in Manual_PrepU['PrepId'].values or Preps_Nonstd.loc[index,"StdUom"] != "g":
#         Dict = {}
#         Dict.update(dict(row))
#         Preps_Nonstd_na.append(Dict)

for idx, row in Preps_Nonstd.iterrows():
    PrepID = Preps_Nonstd.loc[idx, 'PrepId']
    Manual_PakQty = Manual_PrepU.loc[Manual_PrepU['PrepId'] == PrepID, 'PakQty']
    Manual_PakUOM = Manual_PrepU.loc[Manual_PrepU['PrepId'] == PrepID, 'PakUOM']
    Nonstd_PakQty = Preps_Nonstd.loc[idx, 'PakQty']
    Nonstd_PakUOM = Preps_Nonstd.loc[idx, 'PakUOM']
    if ((Manual_PakQty.empty or Manual_PakUOM.empty) or 
        (Manual_PakQty.values[0] != Nonstd_PakQty or Manual_PakUOM.values[0] != Nonstd_PakUOM) or 
        (Preps_Nonstd.loc[idx, "StdUom"] != "g")):
        Dict = {}
        Dict.update(dict(row))
        Preps_Nonstd_na.append(Dict)

Preps_Nonstd = pd.DataFrame(Preps_Nonstd_na, columns = col_names)
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom


In [58]:
path = os.path.join(os.getcwd(), "data", "cleaning", "Preps_NonstdUom.csv")
Preps_Nonstd.to_csv(path, index = False, header = True)

In [59]:
update_prep = pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")
update_prep

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-35132,MARINATED|Lemon & Herb Chx,185.0,ea,PREP,24050.000000,g
1,P-26234,BATCH|Roasted Garlic Bread,16.0,ea,PREP,1280.000000,g
2,P-26170,GRILLED|NaanBread,1.0,ea,PREP,125.000000,g
3,P-16305,YIELD|Smokie (1pc),1.0,ea,,112.000000,g
4,P-26047,BOILED|Hard Boiled Eggs FT,50.0,ea,PREP,2500.000000,g
...,...,...,...,...,...,...,...
521,P-26631,GRL|Pancake|Chocolate Chip,24.0,ea,,4125.000000,g
522,P-55093,PREP|Ajitama - Ramen Eggs,200.0,CT,,96800.000066,g
523,P-50511,TOASTED|French Toast,1.0,ea,,135.000000,g
524,P-51992,YIELD|Bread|Sourdough 5/8,36.0,slice,,650.000000,g


***

## New Items

In [60]:
# Load current Items List with assigned Emission Factors Category ID
Items_Assigned = pd.read_csv(os.path.join(os.getcwd(), "data", "mapping", "Items_List_Assigned.csv"))
Items_Assigned.head()

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
0,I-57545,1.0,CHUCK FLAT BONELESS FZN,3.3,Kg,1.0,Kg,MEAT,
1,I-10869,1.0,BEEF STIRFRY COV FR,5.0,Kg,1.0,Kg,MEAT,
2,I-7064,1.0,BEEF OUTSIDE FLAT AAA,1.0,Kg,1.0,Kg,MEAT,
3,I-37005,1.0,BEEF MEATBALLS,4.54,Kg,1000.0,g,MEAT,
4,I-37002,1.0,BEEF INSIDE ROUND SHAVED,9.0,Kg,1000.0,g,MEAT,


In [61]:
Items_Assigned.shape

(3079, 9)

In [62]:
Items_Assigned["InventoryGroup"].unique()

array(['MEAT', 'DELI & PREPARED MEAT', 'FOOD - GROCERY', 'POULTRY',
       'PRODUCTION FOOD', 'DAIRY', 'BAKING-RAW INGREDIENTS', 'PRODUCE',
       'ICECREAM/NOVELTY/GELATO', 'SEAFOOD', 'CANDIES SNACKS', 'MISC.',
       'MM Grocery Cereal', 'BAKED GOODS', 'MM Grocery Chill', 'PREP',
       'BREAD', 'MM BAKERY', 'ZDONT USE FOOD', 'SPICES', 'FAIR TRADE',
       'MM Snack Ethnic', 'HM FROZEN SINGLE SERVE',
       'CHIPS, PRETZELS, NUTS', 'ZDONT USE FROZEN', 'BEVERAGE',
       'PACKAGED BEER, CIDER', 'LIQUOR', 'ALCOHOL - FOOD',
       'MM CANDY Chocolate ', 'PREGEL - ALL ITEMS',
       'HM SWEET & SAVOURY SNACKS', 'UCARE',
       'HM CHILL SINGLE SERVE BEVERAGE', 'JERKY, PEPPERONI', 'SUSHI',
       'MM CANDY Bulk', 'MM CANDY Gum & Mints', 'MM Snack Single Bars',
       'MERCHANDISE', 'MM Frozen Grocery', 'MM Frozen HMR', 'MM Grocery',
       'MM Grocery Coffee & Tea', 'MM Grocery Ethnic', 'MM Grocery Soup',
       'MM MERCH OTC', 'MM Snack Jerky', 'ZDONT USE CANDY - GUM & MINTS',
       'DI

In [63]:
Items_Assigned["CategoryID"].unique()

array([ 1., 21.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 16., 17., 20., 18., 19., 22., 23., 24., 59., 25., 26.,
       27., 28., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40.,
       41., 42., 43., 44., 45., 48., 49., 50., 51., 52., 53., 54., 55.,
       56., 57., 58., 60., 61., 63., 15., 62.])

In [64]:
Items_Assigned[Items_Assigned["CategoryID"] == 55]

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup,Food Category
1287,I-11794,55.0,YERBA MATE REVEL BERRY 458ML,12.0,each,1.0,each,BEVERAGE,
1288,I-1683,55.0,WATER VITAMIN ZERO XOXO,12.0,591ML,1.0,591ML,BEVERAGE,
1289,I-2955,55.0,WATER SPARKLING SAN PELLEG,24.0,PTN,1.0,PTN,BEVERAGE,
1290,I-52773,55.0,WATER SMART SPARKLING RASPB,12.0,each,1.0,each,BEVERAGE,
1291,I-5983,55.0,WATER FOR RECIPES,1.0,L,1.0,L,BEVERAGE,
...,...,...,...,...,...,...,...,...,...
2818,I-45755,55.0,WATER ACTIVE ORANGE MANGO,12.0,450ML,1.0,450ML,BEVERAGE,
2819,I-45756,55.0,WATER ACTIVE STRAWB BLK CHERRY,12.0,450ML,1.0,450ML,BEVERAGE,
2820,I-43251,55.0,WATER FRUIT 500ML PINEAPPLE,12.0,500ML,1.0,bottle,BEVERAGE,
2855,I-71665,55.0,ICE CUBES,1.0,L,1.0,L,BEVERAGE,manually adjusted


### Get the List of New Items

In [65]:
# Filter new items by itemID that are not in the database and output them in a dataframe
col_names = list(Items.columns.values)
New_Items_List = []

for index, row in Items.iterrows():
    ItemId = Items.loc[index,'ItemId']
    if ItemId not in Items_Assigned['ItemId'].values:
        Dict = {}
        Dict.update(dict(row))
        New_Items_List.append(Dict)

New_Items = pd.DataFrame(New_Items_List, columns = col_names)

In [66]:
New_Items.insert(1, "CategoryID", '')
New_Items

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup


In [67]:
New_Items.shape

(0, 8)

In [68]:
# Store the list of new items into .csv file
# If New_Items is not empty then we convert it to a csv file. 
if not New_Items.empty:
    path = os.path.join(os.getcwd(), "data", "mapping", "new items", str(datetime.date(datetime.now()))+"_New_Items.csv")
    New_Items.to_csv(path, index = False, header = True)

In [69]:
# file = pd.read_csv("data/mapping/new items/2022-11-01_New_Items.csv")
# file.to_excel("2022-11-01_New_Items.xlsx",index = None, header=True)
# if not New_Items.empty:
#     file = pd.read_csv("C:/Users/ENTER_HERE/CFFS-S23/CFFS-22-23/data/mapping/new items/"+ str(datetime.date(datetime.now()))+"_New_Items.csv")
#     file.to_excel(str(datetime.date(datetime.now()))+"_New_Items.xlsx",index = None, header=True)

In [70]:
# file2 = pd.read_excel("data/mapping/new items added/New_Items_Added_11.xlsx")
# file2.to_csv("data/mapping/new items added/New_Items_Added_11.csv", index=False)

file2 = pd.read_excel("data/mapping/new items added/New_Items_2023/New_Items_Added_2023_08_01.xlsx")
file2.to_csv("data/mapping/new items added/New_Items_2023/New_Items_Added_2023_08_01.csv", index=False)

***
## Data Summary

In [71]:
datasum = pd.DataFrame([New_Items.shape, Preps_Nonstd.shape, Items_Nonstd.shape],
                       columns = ['count', 'columns'], 
                       index = ['New_Items', 'Preps_Nonstd', 'Items_Nonstd'])
datasum

Unnamed: 0,count,columns
New_Items,0,8
Preps_Nonstd,0,7
Items_Nonstd,0,6


In [72]:
print(New_Items.columns)

Index(['ItemId', 'CategoryID', 'Description', 'CaseQty', 'CaseUOM', 'PakQty',
       'PakUOM', 'InventoryGroup'],
      dtype='object')
