![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import glob
import os
import random
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
from datetime import datetime
from rapidfuzz import process, fuzz

In [2]:
# RUN ONLY ONCE
# os.chdir is used to change the current directory to the specified path
os.chdir("../") # Sets path to the repo folder as it is one level above where this file exists!
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


****

## Load Data Files

### Set Data File Path

In [3]:
filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "BlueChip","*.csv"))
filepath_list.append(glob.glob(os.path.join(os.getcwd(), "data", "archive", "AMS","AMS_Gallery_Data","Gallery_all_years_test_new.csv"))[0])
filepath_list

['/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/raw/BlueChip/BC_2024S_Prep_Recipe_detail_2.csv',
 '/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/raw/BlueChip/BC_2024S_Prep_Recipe_detail.csv',
 '/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/raw/BlueChip/BC_2024S_Product_Recipe_detail.csv',
 '/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024/data/archive/AMS/AMS_Gallery_Data/Gallery_all_years_test_new.csv']

In [4]:
Preps_1 = pd.read_csv(filepath_list[0], skiprows = 1, header=None)
Preps_2 = pd.read_csv(filepath_list[1], skiprows = 1, header=None)
Preps = pd.concat([Preps_1, Preps_2], axis=0)
Preps.columns = ["Empty","category_1","category_2", "item_descrip","batch_yield", "total_cost", "Total Cost", "Shelf_Life", "Shelf Lifr", "recipe_uom","recipe_cost", "recipe_uom_2","recipe_cost_2","line_qty","ingre_cost","item_descrip.1","uom"]
Preps = Preps.drop(columns=["Total Cost","Shelf_Life","Shelf Lifr", "category_1", "Empty","recipe_uom","recipe_cost","recipe_uom_2","recipe_cost_2"])
Preps.to_csv("preps.csv", index=False)
Preps

Unnamed: 0,category_2,item_descrip,batch_yield,total_cost,line_qty,ingre_cost,item_descrip.1,uom
0,Commissary,Baking - Caramel Sauce,Batch Yield: 450.000 ml,$3.3514,250.00,$0.4508,SUGAR GRANULATED FINE,g
1,Commissary,Baking - Caramel Sauce,Batch Yield: 450.000 ml,$3.3514,25.00,$0.0000,Water - Tap,ml
2,Commissary,Baking - Caramel Sauce,Batch Yield: 450.000 ml,$3.3514,10.00,$0.0585,SYRUP CORN GOLDEN,ml
3,Commissary,Baking - Caramel Sauce,Batch Yield: 450.000 ml,$3.3514,150.00,$0.8640,CREAM WHIPPING 33%,ml
4,Commissary,Baking - Caramel Sauce,Batch Yield: 450.000 ml,$3.3514,150.00,$1.9742,BUTTER REG SALTED,g
...,...,...,...,...,...,...,...,...
3126,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,220.00,$0.5432,Corn Starch 50lb,g
3127,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$1.0803,Onion White Peeled,g
3128,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$0.7413,JUMBO CARROT,g
3129,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g


In [5]:
Items = pd.read_csv(filepath_list[2], skiprows = 1, header=None)
Items.columns = ["Empty","category_1","category_2","Warning","Warning.1", "item_descrip","Empty.1", "drop","drop.1","drop.2","drop.3","drop.4","drop.5","PLU","drop.6","uom","item_descrip.1"]
Items = Items.drop(columns=["category_1","category_2","Warning","Warning.1","Empty","Empty.1","drop","drop.1","drop.2","drop.3","drop.4","drop.5","drop.6"])
Items

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1
0,$garlic & onion,50.00,g,Prep Cream Cheese - Garlic
1,$PB & J ADD ON,50.00,g,PEANUT BUTTER CREAMY
2,$PB & J ADD ON,2.00,oz (fl),strawberry jam pail 14 kg
3,ADD JAM,25.00,g,strawberry jam pail 14 kg
4,CHEDDAR SLICE,1.00,slice,Cheese Cheddar Slices 21gm
...,...,...,...,...
919,20OZ PUMPK LATTE,0.75,oz (fl),Syrup Pumpkin
920,20OZ PUMPK LATTE,1.00,oz (fl),PUMPKIN PIE SAUCE
921,20OZ PUMPK LATTE,1.00,ea,CUP PPR HOT WHT 20Z SINGLE WAL
922,BAG OF ESPRESSO BEANS,1.00,1 KG,Spirit Bear Raven - Espresso


In [6]:
# Gallery
# temp = Items[Items["item_descrip"].str.contains("2024S")]
# temp2 = Items[Items["item_descrip"] == "Vegan Caesar (wrap) 2023"] # Manually Added items
# temp3 = Items[Items["item_descrip"] == "Tofu katsu 2024"] # Manually added items
# Products = pd.concat([temp,temp2,temp3])
# Blue Chip
# Create a boolean mask for rows to drop based on the conditions
mask = (
    Items["item_descrip"].str.contains("tea", case=False) |
    Items["item_descrip"].str.contains("oz", case=False) |
    Items["item_descrip"].str.contains("latte", case=False) |
    Items["item_descrip"].str.contains("TRAVEL BOX", case=False) |
    Items["item_descrip"].str.contains("OWN CUP DISC", case=False) |
    Items["item_descrip"].str.contains("AMERICANO", case=False) |
    Items["item_descrip"].str.contains("ESPRESSO", case=False) |
    Items["item_descrip"].str.contains("MACCHIATO", case=False) |
    Items["item_descrip"].str.contains("add", case=False) 
)

# Filter the DataFrame to exclude the rows that match the mask
Products = Items[~mask].reset_index(drop=True)

# Display the filtered DataFrame
Products

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1
0,$garlic & onion,50.00,g,Prep Cream Cheese - Garlic
1,CHEDDAR SLICE,1.00,slice,Cheese Cheddar Slices 21gm
2,Cucumber,0.05,ea,Cucumber 3ct
3,Lettuce,1.00,oz (wt),Lettuce Green Leaf Fillet
4,Tomato,50.00,g,ITEM SLICED TOMATO
...,...,...,...,...
275,WRAP - SALSA,1.00,ea,BCPrep - Salsa Wrap
276,WRAP - Vegan Medditerranean,1.00,ea,BCPrep - Mediterranean Wrap
277,WRAP - Vegan Medditerranean,1.00,ea,BAG PPR SAND 6x2x9
278,EXTRA SHOT,9.00,g,Spirit Bear Raven - Espresso


In [7]:
list_products = Products["item_descrip"].unique()
with open("list_products.csv", 'w') as f:
    for item in list_products:
        f.write("%s\n" % item)

In [8]:
# Get unique values from the 'item_descrip.2' column in Products
prods = Products["item_descrip.1"].unique()

# Get unique values from the 'item_descrip.1' column in Preps
preps_unique = Preps["item_descrip.1"].unique()

# Filter Preps DataFrame to keep only rows where 'item_descrip' is in 'prods' or 'preps_unique'
Preps = Preps[Preps["item_descrip"].isin(prods) | Preps["item_descrip"].isin(preps_unique)]

# Reset index of Preps DataFrame
Preps = Preps.reset_index(drop=True)
Preps

Unnamed: 0,category_2,item_descrip,batch_yield,total_cost,line_qty,ingre_cost,item_descrip.1,uom
0,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,60.00,$0.6856,Olive Oil 100 Pct,ml
1,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,300.00,$1.2183,TOMATO - 6 X 7 MED,g
2,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,14.00,$8.2988,Baking - Focacia Baguette,ea
3,Commissary,BQT - Roasted Veggie,Batch Yield: 20.000 PORT,$16.2089,500.00,$2.6825,Cauliflower 9/12 Ct case,g
4,Commissary,BQT - Roasted Veggie,Batch Yield: 20.000 PORT,$16.2089,500.00,$6.9093,Radish-Watermelon,g
...,...,...,...,...,...,...,...,...
788,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,220.00,$0.5432,Corn Starch 50lb,g
789,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$1.0803,Onion White Peeled,g
790,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$0.7413,JUMBO CARROT,g
791,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g


In [9]:
old_items = pd.read_csv(filepath_list[-1])
old_items = old_items.drop(columns=['item_num.2', 'item_num.1'])
old_items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.559100,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.553600,8228,1.00,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.920800,15803,1.00,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.466700,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.189270,8667,2.00,,ITEM GARLIC MAYO,N,fl oz
...,...,...,...,...,...,...,...,...,...,...,...,...
2153,18049,Yellow Curry Prep Gall.2023,1.0,PORT,g,0.005200,2262,1.00,,SESAME SEEDS,Y,g
2154,18049,Yellow Curry Prep Gall.2023,1.0,PORT,oz,0.003968,14434,80.00,,Potato yellow Med 5lb bag,Y,g
2155,18049,Yellow Curry Prep Gall.2023,1.0,PORT,PORT,0.200000,15477,1.00,,2022 Jasmin rice,Y,PORT
2156,18049,Yellow Curry Prep Gall.2023,1.0,PORT,g,0.005300,15637,20.00,,Whole Green Beans IQF,Y,g


In [10]:
used_ids = []
def assign_ids(df):
    # Preprocessing function
    def preprocess(description):
        description = description.lower()
        description = re.sub(r'\s+', ' ', description).strip()
        description = re.sub(r'[^\w\s]', '', description)
        return description

    # Apply preprocessing
    old_items['Normalized_Description'] = old_items['item_descrip'].apply(preprocess)
    df['Normalized_Description'] = df['item_descrip'].apply(preprocess)

    # Create a mapping from normalized description to item_num
    desc_to_item_num = {row['Normalized_Description']: row['item_num'] for idx, row in old_items.iterrows()}

    # Exact match
    df['item_num'] = df['Normalized_Description'].apply(lambda x: desc_to_item_num.get(x))

    # Fuzzy matching function
    def get_all_matches(description, choices, threshold=80):
        matches = process.extract(description, choices, scorer=fuzz.token_sort_ratio)
        return [match for match in matches if match[1] >= threshold]

    # Apply fuzzy matching where exact match failed
    df['Potential_Matches'] = df['Normalized_Description'].apply(lambda x: get_all_matches(x, desc_to_item_num.keys()))

    # Assign item_num directly if there is exactly one potential match
    def assign_item_num_or_matches(row):
        potential_matches = row['Potential_Matches']
        if potential_matches and len(potential_matches) == 1:
            # Assign the item_num of the single potential match
            return desc_to_item_num.get(potential_matches[0][0])
        else:
            # Keep as None if no match or multiple matches
            return None

    # Apply the function to assign item_num
    df['item_num'] = df.apply(lambda row: assign_item_num_or_matches(row), axis=1)

    # Add the best potential match item_num to Items
    def add_best_potential_match_item_num(row):
        potential_matches = row['Potential_Matches']
        if len(potential_matches) > 1:
            best_match = max(potential_matches, key=lambda x: x[1])
            return desc_to_item_num.get(best_match[0])
        return row['item_num']

    # Update Items to include best potential match item_num if exact match fails
    df['item_num'] = df.apply(lambda row: add_best_potential_match_item_num(row), axis=1)

    # Function to generate unique random item_num
    def generate_unique_random_item_num(existing_nums, start=1000):
        random.seed(42) #DO NOT CHANGE
        while True:
            num = random.randint(start, start + 10000)
            if num not in existing_nums and num not in used_ids:
                used_ids.append(num)
                return num

    # Get the set of used item_nums
    used_item_nums = set(old_items['item_num']).union(set(df['item_num'].dropna()))

    # Assign random item_num for rows without one, ensuring same item_descrip gets same item_num
    unassigned = df[df['item_num'].isna()]
    unique_descriptions = unassigned['Normalized_Description'].unique()

    for desc in unique_descriptions:
        random_num = generate_unique_random_item_num(used_item_nums)
        used_item_nums.add(random_num)
        df.loc[df['Normalized_Description'] == desc, 'item_num'] = random_num

    df["item_num"] = df["item_num"].astype(int)
    df.to_csv("df.csv")
    # Optionally, remove the Potential_Matches column if no longer needed
    df.drop(columns=['Potential_Matches',"Normalized_Description"], inplace=True)
    # Print the result
    return df


In [11]:
Preps = assign_ids(Preps)
Products = assign_ids(Products)

In [12]:
#Ensure 'line_item_num' column exists in Items
Preps['line_item_num'] = 0

# Create a dictionary for quick lookup from old_items
old_items_dict = old_items.set_index('item_descrip.1')['line_item_num'].to_dict()
old_ids = list(set(old_items["line_item_num"].unique()))
ID = old_ids[0]

# Iterate through Items and update line_item_num
for index, row in Preps.iterrows():
    description = row['item_descrip.1']
    if description in old_items_dict:
        Preps.at[index, 'line_item_num'] = old_items_dict[description]
    else:
        while ID in old_ids:
            ID += 1
        old_items_dict[description] = ID
        old_ids.append(ID)
        Preps.at[index, "line_item_num"] = ID

# Output the updated DataFrame
display(Preps)

Unnamed: 0,category_2,item_descrip,batch_yield,total_cost,line_qty,ingre_cost,item_descrip.1,uom,item_num,line_item_num
0,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,60.00,$0.6856,Olive Oil 100 Pct,ml,2824,4099
1,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,300.00,$1.2183,TOMATO - 6 X 7 MED,g,2824,2402
2,Commissary,BC - Tomato Focacia,Batch Yield: 14.000 ea,$10.2026,14.00,$8.2988,Baking - Focacia Baguette,ea,2824,4100
3,Commissary,BQT - Roasted Veggie,Batch Yield: 20.000 PORT,$16.2089,500.00,$2.6825,Cauliflower 9/12 Ct case,g,1409,4101
4,Commissary,BQT - Roasted Veggie,Batch Yield: 20.000 PORT,$16.2089,500.00,$6.9093,Radish-Watermelon,g,1409,12877
...,...,...,...,...,...,...,...,...,...,...
788,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,220.00,$0.5432,Corn Starch 50lb,g,6930,4172
789,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$1.0803,Onion White Peeled,g,6930,6865
790,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$0.7413,JUMBO CARROT,g,6930,13499
791,Prepared Frz,Pro - Vegan Meatloaf,Batch Yield: 48.000 ea,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g,6930,4174


In [13]:
def extract_yield_and_uom(batch_yield):
    match = re.search(r'Batch\s*Yield:\s*([\d.]+)\s*(\w+)', batch_yield)
    if match:
        physical_yield = float(match.group(1))
        uom = match.group(2)
        return physical_yield, uom
    else:
        print(f"Error parsing batch_yield '{batch_yield}': pattern not found")
        return None, None

# Apply the function and create new columns
Preps['pak_physical_yield'], Preps['pak_uom'] = zip(*Preps['batch_yield'].apply(extract_yield_and_uom))
Preps = Preps.drop(columns=["batch_yield"])
Preps

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,item_num,line_item_num,pak_physical_yield,pak_uom
0,Commissary,BC - Tomato Focacia,$10.2026,60.00,$0.6856,Olive Oil 100 Pct,ml,2824,4099,14.0,ea
1,Commissary,BC - Tomato Focacia,$10.2026,300.00,$1.2183,TOMATO - 6 X 7 MED,g,2824,2402,14.0,ea
2,Commissary,BC - Tomato Focacia,$10.2026,14.00,$8.2988,Baking - Focacia Baguette,ea,2824,4100,14.0,ea
3,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$2.6825,Cauliflower 9/12 Ct case,g,1409,4101,20.0,PORT
4,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$6.9093,Radish-Watermelon,g,1409,12877,20.0,PORT
...,...,...,...,...,...,...,...,...,...,...,...
788,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,220.00,$0.5432,Corn Starch 50lb,g,6930,4172,48.0,ea
789,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$1.0803,Onion White Peeled,g,6930,6865,48.0,ea
790,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$0.7413,JUMBO CARROT,g,6930,13499,48.0,ea
791,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g,6930,4174,48.0,ea


In [14]:
Products["inv_flag"] = "Y"
Preps["inv_flag"] = "Y"

In [15]:
Preps.to_csv("Preps.csv", index=False)
Products.to_csv("Products.csv", index=False)

### Import Items List

In [110]:
Preps = pd.read_csv("Preps.csv")
Products = pd.read_csv("Products.csv")

In [111]:
Preps = Preps[(Preps["item_descrip.1"] != "ITEM - misc") & (Preps["item_descrip.1"] != "BAG PPR SAND 6x2x9")]
Preps.reset_index(drop=True, inplace=True)
Preps

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,item_num,line_item_num,pak_physical_yield,pak_uom,inv_flag
0,Commissary,BC - Tomato Focacia,$10.2026,60.00,$0.6856,Olive Oil 100 Pct,ml,2824,4099,14.0,ea,Y
1,Commissary,BC - Tomato Focacia,$10.2026,300.00,$1.2183,TOMATO - 6 X 7 MED,g,2824,2402,14.0,ea,Y
2,Commissary,BC - Tomato Focacia,$10.2026,14.00,$8.2988,Baking - Focacia Baguette,ea,2824,4100,14.0,ea,Y
3,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$2.6825,Cauliflower 9/12 Ct case,g,1409,4101,20.0,PORT,Y
4,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$6.9093,Radish-Watermelon,g,1409,12877,20.0,PORT,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
762,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,220.00,$0.5432,Corn Starch 50lb,g,6930,4172,48.0,ea,Y
763,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$1.0803,Onion White Peeled,g,6930,6865,48.0,ea,Y
764,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$0.7413,JUMBO CARROT,g,6930,13499,48.0,ea,Y
765,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g,6930,4174,48.0,ea,Y


In [112]:
# Ensure 'item_num' and 'line_item_num' are strings
Preps = Preps.astype({"item_num": str, "line_item_num": str})

# Get unique descriptions from 'item_descrip'
unique_descriptions = Preps["item_descrip"].unique()

# Iterate over rows to update 'item_num' and 'line_item_num'
for idx, row in Preps.iterrows():
    # Prepend 'P-' if not already present in 'item_num'
    if not row["item_num"].startswith("P-"):
        Preps.at[idx, "item_num"] = "P-" + row["item_num"]
    
for idx, row in Preps.iterrows(): 
    # Update 'line_item_num' based on 'item_descrip'
    if row["item_descrip.1"] in unique_descriptions:
        matching_item_num = Preps.loc[Preps["item_descrip"] == row["item_descrip.1"], "item_num"].values[0]
        print(f"Matching item_num: {matching_item_num}")
        Preps.at[idx, "line_item_num"] = matching_item_num
    else:
        # Prepend 'I-' if not already present in 'line_item_num'
        if not row["line_item_num"].startswith("I-"):
            Preps.at[idx, "line_item_num"] = "I-" + row["line_item_num"]

# Optional: Verify the changes
Preps.head(20)


Matching item_num: P-9935
Matching item_num: P-5506
Matching item_num: P-7912
Matching item_num: P-5012
Matching item_num: P-2679
Matching item_num: P-15427
Matching item_num: P-7912
Matching item_num: P-15427
Matching item_num: P-2679
Matching item_num: P-3547
Matching item_num: P-9935
Matching item_num: P-3045
Matching item_num: P-5333
Matching item_num: P-7482
Matching item_num: P-2169
Matching item_num: P-7482
Matching item_num: P-3664
Matching item_num: P-15368
Matching item_num: P-5506
Matching item_num: P-1106
Matching item_num: P-9935
Matching item_num: P-1916


Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,item_num,line_item_num,pak_physical_yield,pak_uom,inv_flag
0,Commissary,BC - Tomato Focacia,$10.2026,60.0,$0.6856,Olive Oil 100 Pct,ml,P-2824,I-4099,14.0,ea,Y
1,Commissary,BC - Tomato Focacia,$10.2026,300.0,$1.2183,TOMATO - 6 X 7 MED,g,P-2824,I-2402,14.0,ea,Y
2,Commissary,BC - Tomato Focacia,$10.2026,14.0,$8.2988,Baking - Focacia Baguette,ea,P-2824,P-9935,14.0,ea,Y
3,Commissary,BQT - Roasted Veggie,$16.2089,500.0,$2.6825,Cauliflower 9/12 Ct case,g,P-1409,I-4101,20.0,PORT,Y
4,Commissary,BQT - Roasted Veggie,$16.2089,500.0,$6.9093,Radish-Watermelon,g,P-1409,I-12877,20.0,PORT,Y
5,Commissary,BQT - Roasted Veggie,$16.2089,500.0,$1.4826,JUMBO CARROT,g,P-1409,I-13499,20.0,PORT,Y
6,Commissary,BQT - Roasted Veggie,$16.2089,500.0,$2.9625,Broccoli Crowns,g,P-1409,I-5941,20.0,PORT,Y
7,Commissary,BQT - Roasted Veggie,$16.2089,2.0,$2.1720,Turnips Yellow,LBS,P-1409,I-4102,20.0,PORT,Y
8,Commissary,Catr - Sauce Vierge,$9.2892,145.0,$1.6567,Olive Oil 100 Pct,ml,P-5506,I-4099,1031.0,g,Y
9,Commissary,Catr - Sauce Vierge,$9.2892,15.0,$0.0843,GARLIC WHOLE PEELED,g,P-5506,I-1874,1031.0,g,Y


In [113]:
Preps.dtypes

category_2             object
item_descrip           object
total_cost             object
line_qty               object
ingre_cost             object
item_descrip.1         object
uom                    object
item_num               object
line_item_num          object
pak_physical_yield    float64
pak_uom                object
inv_flag               object
dtype: object

In [114]:
mask_BC = ["NAPKIN DISP 1PLY INTERFOLD KRA", '"9""x6 1/8"" NAT HINGED CONTAINER"', "ECO 9x6 Kraft Hinged Container", 
           "BAG PPR SAND 6x2x9", "CUTLERY SPOON WOODEN ECO", "6x6x3 CLAMSHELL", '"Box - Pizza 10"" Kraft"', '9"x6 1/8" NAT HINGED CONTAINER']
mask_Gallery = ["To Go Cutlery 2023","Misc Extras"]
Products = Products[~Products["item_descrip.1"].isin(mask_BC)]
Products

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1,item_num,inv_flag
0,$garlic & onion,50.0,g,Prep Cream Cheese - Garlic,4593.0,Y
1,CHEDDAR SLICE,1.0,slice,Cheese Cheddar Slices 21gm,3266.0,Y
2,Cucumber,0.05,ea,Cucumber 3ct,9348.0,Y
3,Lettuce,1.0,oz (wt),Lettuce Green Leaf Fillet,9085.0,Y
4,Tomato,50.0,g,ITEM SLICED TOMATO,2489.0,Y
5,Bagel - Jalapeno Cheddar,1.0,ea,ITEM BAGEL JALAPENO & CHEESE,1771.0,Y
6,BC SMOKE SALMON,5.0,ml,CAPER CAPUCINE,2796.0,Y
7,BC SMOKE SALMON,60.0,g,CHEESE CREAM PLAIN SOFT 10KG,2796.0,Y
9,BC SMOKE SALMON,2.0,ea,bean green pickled bar,2796.0,Y
10,BC SMOKE SALMON,1.0,ea,ITEM BAGEL PLAIN,2796.0,Y


In [115]:
#Ensure 'line_item_num' column exists in Items
Products.loc[:,'line_item_num'] = ""

# Create a dictionary for quick lookup from old_items
items_dict = Preps.set_index('item_descrip.1')['line_item_num'].to_dict()
items_dict.update(Preps.set_index('item_descrip')['item_num'].to_dict())

ids = list(set(Preps["line_item_num"].unique()))
ids.extend(Preps["item_num"].unique())
ids = [int(str(id).split("-")[1]) for id in ids]
print(ids)
ID = ids[0]

# Iterate through Items and update line_item_num
for index, row in Products.iterrows():
    description = row['item_descrip.1']
    if description in items_dict:
        Products.at[index, 'line_item_num'] = items_dict[description]
    else:
        while ID in ids:
            ID += 1
        print(row["item_descrip.1"], "not in Preps")
        items_dict[description] = "I-" + str(ID)
        ids.append(ID)
        Products.at[index, "line_item_num"] = "I-" + str(ID)

# Output the updated DataFrame
display(Products)

[4131, 1783, 2951, 5506, 5941, 13706, 4137, 4193, 2071, 4118, 2986, 6090, 2209, 1859, 3547, 3180, 2335, 4183, 3147, 4199, 13711, 2378, 2323, 4112, 4164, 1917, 3258, 2167, 14424, 2169, 5220, 2133, 1106, 1782, 2025, 4227, 4107, 14442, 4108, 4140, 4126, 4171, 1649, 4163, 4211, 4232, 4104, 4186, 4149, 5575, 15221, 1789, 2967, 13842, 4202, 7168, 4220, 4209, 4148, 6203, 4141, 4169, 7801, 6865, 4231, 4122, 4213, 16803, 5012, 5333, 2676, 2402, 4143, 4195, 2099, 4135, 4178, 1982, 2993, 1916, 4175, 5008, 4119, 4194, 14979, 4173, 1958, 3664, 1963, 2443, 2349, 4153, 2320, 4200, 4165, 4192, 2257, 5899, 2087, 4176, 14423, 2958, 7775, 4210, 2498, 5264, 2102, 4120, 2347, 4127, 4167, 13499, 4151, 14429, 5075, 4197, 5773, 4113, 2333, 4203, 4218, 4132, 4235, 17946, 4216, 14484, 1660, 9117, 4207, 9935, 4109, 2203, 4222, 2464, 12764, 4206, 1927, 1821, 4116, 12877, 15427, 4146, 8418, 1695, 4139, 14511, 4110, 4188, 2442, 4159, 1635, 2068, 4166, 3804, 2339, 1924, 5106, 4147, 4201, 2366, 4129, 4217, 12361, 422

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1,item_num,inv_flag,line_item_num
0,$garlic & onion,50.0,g,Prep Cream Cheese - Garlic,4593.0,Y,P-14403
1,CHEDDAR SLICE,1.0,slice,Cheese Cheddar Slices 21gm,3266.0,Y,I-3498
2,Cucumber,0.05,ea,Cucumber 3ct,9348.0,Y,I-4195
3,Lettuce,1.0,oz (wt),Lettuce Green Leaf Fillet,9085.0,Y,I-5220
4,Tomato,50.0,g,ITEM SLICED TOMATO,2489.0,Y,I-4141
5,Bagel - Jalapeno Cheddar,1.0,ea,ITEM BAGEL JALAPENO & CHEESE,1771.0,Y,I-4134
6,BC SMOKE SALMON,5.0,ml,CAPER CAPUCINE,2796.0,Y,I-1649
7,BC SMOKE SALMON,60.0,g,CHEESE CREAM PLAIN SOFT 10KG,2796.0,Y,I-4144
9,BC SMOKE SALMON,2.0,ea,bean green pickled bar,2796.0,Y,I-4158
10,BC SMOKE SALMON,1.0,ea,ITEM BAGEL PLAIN,2796.0,Y,I-4170


### Extracting all Preps

In [116]:
# Filter the DataFrame to include only rows where 'item_num' starts with 'P-'
Preperations = Preps.copy()

# Select specific columns from the filtered DataFrame
Preperations = Preperations[['item_num', 'item_descrip', 'pak_physical_yield', 'pak_uom', 'inv_flag']]
Preperations.rename(columns={'item_num': 'PrepId', 'item_descrip': 'Description', 'pak_physical_yield': 'PakQty', 'pak_uom': 'PakUOM', 'inv_flag': 'InventoryGroup'}, inplace=True)
Preperations.drop_duplicates(subset=["PrepId"], inplace=True)
Preperations.reset_index(drop=True, inplace=True)
Preperations

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-2824,BC - Tomato Focacia,14.0,ea,Y
1,P-1409,BQT - Roasted Veggie,20.0,PORT,Y
2,P-5506,Catr - Sauce Vierge,1031.0,g,Y
3,P-5012,Catr- Roasted Chicken Breast,1.0,ea,Y
4,P-4657,Pro - Organic Quinoa,18.0,Kg,Y
...,...,...,...,...,...
116,P-8019,Pro - Tuna Salad Mix,5.0,Kg,Y
117,P-10560,Pro - Vinaigrette Sesame Lime,1.0,L,Y
118,P-7543,Pro - Wild Mix Mushroom,10.0,Kg,Y
119,P-15368,Pro - Marinara Sauce,7.0,L,Y


In [117]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preperations.to_csv(path, index = False, header = True)

### Extracting all Items

In [118]:
Preps

Unnamed: 0,category_2,item_descrip,total_cost,line_qty,ingre_cost,item_descrip.1,uom,item_num,line_item_num,pak_physical_yield,pak_uom,inv_flag
0,Commissary,BC - Tomato Focacia,$10.2026,60.00,$0.6856,Olive Oil 100 Pct,ml,P-2824,I-4099,14.0,ea,Y
1,Commissary,BC - Tomato Focacia,$10.2026,300.00,$1.2183,TOMATO - 6 X 7 MED,g,P-2824,I-2402,14.0,ea,Y
2,Commissary,BC - Tomato Focacia,$10.2026,14.00,$8.2988,Baking - Focacia Baguette,ea,P-2824,P-9935,14.0,ea,Y
3,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$2.6825,Cauliflower 9/12 Ct case,g,P-1409,I-4101,20.0,PORT,Y
4,Commissary,BQT - Roasted Veggie,$16.2089,500.00,$6.9093,Radish-Watermelon,g,P-1409,I-12877,20.0,PORT,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
762,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,220.00,$0.5432,Corn Starch 50lb,g,P-6930,I-4172,48.0,ea,Y
763,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$1.0803,Onion White Peeled,g,P-6930,I-6865,48.0,ea,Y
764,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$0.7413,JUMBO CARROT,g,P-6930,I-13499,48.0,ea,Y
765,Prepared Frz,Pro - Vegan Meatloaf,$196.1326,250.00,$2.5463,PEPPERS RED 5LB BAG,g,P-6930,I-4174,48.0,ea,Y


In [119]:
Products[~Products['line_item_num'].str.startswith('P-')].head()

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1,item_num,inv_flag,line_item_num
1,CHEDDAR SLICE,1.0,slice,Cheese Cheddar Slices 21gm,3266.0,Y,I-3498
2,Cucumber,0.05,ea,Cucumber 3ct,9348.0,Y,I-4195
3,Lettuce,1.0,oz (wt),Lettuce Green Leaf Fillet,9085.0,Y,I-5220
4,Tomato,50.0,g,ITEM SLICED TOMATO,2489.0,Y,I-4141
5,Bagel - Jalapeno Cheddar,1.0,ea,ITEM BAGEL JALAPENO & CHEESE,1771.0,Y,I-4134


In [120]:
# Filter out rows where 'line_item_num' does not start with 'P-'
temp1 = Preps[~Preps['line_item_num'].str.startswith('P-')]

# Select relevant columns
temp1 = temp1[['line_item_num', 'item_descrip.1', 'line_qty', 'uom', 'pak_physical_yield', 'pak_uom', 'inv_flag']]

temp2 = Products[~Products['line_item_num'].str.startswith('P-')]

temp2 = temp2[['line_item_num', 'item_descrip.1', 'PLU', 'uom', 'inv_flag']]
temp2.rename(columns={'PLU': 'line_qty'}, inplace=True)

items = pd.concat([temp1, temp2], ignore_index=True)

# Rename columns for clarity
items.rename(columns={
    'line_item_num': 'ItemId',
    'item_descrip.1': 'Description',
    'line_qty': 'CaseQty',
    'uom': 'CaseUOM',
    'pak_physical_yield': 'PakQty',
    'pak_uom': 'PakUOM',
    'inv_flag': 'InventoryGroup'
}, inplace=True)

# Define a function to deal with commas in numeric fields
def dealWithComma(x):
    if isinstance(x, str):
        return x.replace(",", "")
    return x

# Apply the function to the 'CaseQty' column
items['CaseQty'] = items['CaseQty'].apply(dealWithComma)

# Convert 'CaseQty' to float
items = items.astype({"CaseQty": float})

# Remove duplicates based on 'ItemId'
items.drop_duplicates(subset=["ItemId"], inplace=True)

# Reset index for the final DataFrame
items.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
items


Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4099,Olive Oil 100 Pct,60.00,ml,14.0,ea,Y
1,I-2402,TOMATO - 6 X 7 MED,300.00,g,14.0,ea,Y
2,I-4101,Cauliflower 9/12 Ct case,500.00,g,20.0,PORT,Y
3,I-12877,Radish-Watermelon,500.00,g,20.0,PORT,Y
4,I-13499,JUMBO CARROT,500.00,g,20.0,PORT,Y
...,...,...,...,...,...,...,...
257,I-4237,Bagel - Rosemary,0.25,ea,,,Y
258,I-4238,Bagel - Jalapeno Cheddar,0.25,ea,,,Y
259,I-4239,TOMATO FOCACCIA,1.00,ea,,,Y
260,I-4240,CHICK SALAD,1.00,ea,,,Y


In [121]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
items.to_csv(path, index = False, header = True)

### Extracting all Ingredients

In [122]:
# Ensure 'item_num' and 'line_item_num' columns are strings
Products = Products.astype({"item_num": str, "line_item_num": str})

# Prepend "R-" to 'item_num' column
Products['item_num'] = "R-" + Products['item_num']

# Display the modified DataFrame
Products

Unnamed: 0,item_descrip,PLU,uom,item_descrip.1,item_num,inv_flag,line_item_num
0,$garlic & onion,50.0,g,Prep Cream Cheese - Garlic,R-4593.0,Y,P-14403
1,CHEDDAR SLICE,1.0,slice,Cheese Cheddar Slices 21gm,R-3266.0,Y,I-3498
2,Cucumber,0.05,ea,Cucumber 3ct,R-9348.0,Y,I-4195
3,Lettuce,1.0,oz (wt),Lettuce Green Leaf Fillet,R-9085.0,Y,I-5220
4,Tomato,50.0,g,ITEM SLICED TOMATO,R-2489.0,Y,I-4141
5,Bagel - Jalapeno Cheddar,1.0,ea,ITEM BAGEL JALAPENO & CHEESE,R-1771.0,Y,I-4134
6,BC SMOKE SALMON,5.0,ml,CAPER CAPUCINE,R-2796.0,Y,I-1649
7,BC SMOKE SALMON,60.0,g,CHEESE CREAM PLAIN SOFT 10KG,R-2796.0,Y,I-4144
9,BC SMOKE SALMON,2.0,ea,bean green pickled bar,R-2796.0,Y,I-4158
10,BC SMOKE SALMON,1.0,ea,ITEM BAGEL PLAIN,R-2796.0,Y,I-4170


In [123]:
Ingredients = Products[['line_item_num', 'PLU', 'uom', 'item_num']].copy()

Ingredients.rename(columns={'line_item_num': 'IngredientId', 'PLU': 'Qty', 'uom': 'Uom', 'item_num': 'Recipe'}, inplace=True)

Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)

Ingredients.reset_index(drop=True, inplace=True)
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,P-14403,50.0,g,R-4593.0
1,I-3498,1.0,slice,R-3266.0
2,I-4195,0.05,ea,R-9348.0
3,I-5220,1.0,oz (wt),R-9085.0
4,I-4141,50.0,g,R-2489.0
5,I-4134,1.0,ea,R-1771.0
6,I-1649,5.0,ml,R-2796.0
7,I-4144,60.0,g,R-2796.0
8,I-4158,2.0,ea,R-2796.0
9,I-4170,1.0,ea,R-2796.0


In [124]:
temp = Preps[['line_item_num', 'item_num', 'line_qty', 'uom']].copy()
temp.rename(columns={'line_item_num': 'IngredientId', 'item_num': 'Recipe', 'line_qty': 'Qty', 'uom': 'Uom'}, inplace=True)
temp.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)
temp.reset_index(drop=True, inplace=True)
temp

Unnamed: 0,IngredientId,Recipe,Qty,Uom
0,I-4099,P-2824,60.00,ml
1,I-2402,P-2824,300.00,g
2,P-9935,P-2824,14.00,ea
3,I-4101,P-1409,500.00,g
4,I-12877,P-1409,500.00,g
...,...,...,...,...
704,I-4172,P-6930,220.00,g
705,I-6865,P-6930,250.00,g
706,I-13499,P-6930,250.00,g
707,I-4174,P-6930,250.00,g


In [125]:
df = [Ingredients, temp]
Ingredients = pd.concat(df)

Ingredients["Qty"] = Ingredients["Qty"].apply(dealWithComma)

In [126]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)

### Extracting Products List

In [127]:
Products = Products[['item_num', 'item_descrip', 'inv_flag']]

Products.rename(columns={'item_num': 'ProdId', 'item_descrip': 'Description', 'inv_flag': 'SalesGroup'}, inplace=True)
Products.drop_duplicates(inplace=True)
Products.reset_index(drop=True, inplace=True)
Products

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Products.rename(columns={'item_num': 'ProdId', 'item_descrip': 'Description', 'inv_flag': 'SalesGroup'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Products.drop_duplicates(inplace=True)


Unnamed: 0,ProdId,Description,SalesGroup
0,R-4593.0,$garlic & onion,Y
1,R-3266.0,CHEDDAR SLICE,Y
2,R-9348.0,Cucumber,Y
3,R-9085.0,Lettuce,Y
4,R-2489.0,Tomato,Y
5,R-1771.0,Bagel - Jalapeno Cheddar,Y
6,R-2796.0,BC SMOKE SALMON,Y
7,R-3621.0,Four Cheese Melt,Y
8,R-7916.0,Lite Crm Chz,Y
9,R-10771.0,MELT - TUNA,Y


In [128]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [129]:
# Read conventions.xml files in the filepath_list and construct a dataframe
ConversionId = []
Multiplier = []
ConvertFromQty = []
ConvertFromUom = []
ConvertToQty = []
ConvertToUom = []

# From the XML file for Conversions append the id into ConversionId, multiplier into Multiplier, ConvertFrom->qty into 
# ConvertFromQty,ConvertFrom->uom into ConvertFromUom, ConvertTo->qty into ConvertToQty and and ConvertTo->uom into the
# CovertToUom list. 
# Make a dataframe out of the 3 lists
# Then also drop the duplicates in the Products dataframe

filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 23-24 Sep-Dec*", "*.oc"))
for filepath in filepath_list:
    path = filepath + '/Conversions.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Conversion'):
            ConversionId.append(x.attrib['id'])
            Multiplier.append(x.attrib['multiplier'])
            ConvertFromQty.append(x.find('ConvertFrom').attrib['qty'])
            ConvertFromUom.append(x.find('ConvertFrom').attrib['uom'])
            ConvertToQty.append(x.find('ConvertTo').attrib['qty'])
            ConvertToUom.append(x.find('ConvertTo').attrib['uom'])
    
    
Conversions = pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                           'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom}
                          ).drop_duplicates()

Conversions.reset_index(drop=True, inplace=True)

In [130]:
Conversions.loc[Conversions["ConversionId"] == "I-29389"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [131]:
# all_id_list = Items["ItemId"].unique()
all_id_set = set(Preps["item_num"].unique())
all_conv_set = set(Conversions["ConversionId"].unique())

missing_conv_id = all_id_set - all_conv_set
n = len(missing_conv_id)
print(f"{n} Items in Items dataframe but not in Conversions Dataframe:\n",)
print(missing_conv_id)

121 Items in Items dataframe but not in Conversions Dataframe:

{'P-5040', 'P-8573', 'P-2535', 'P-4582', 'P-8019', 'P-6160', 'P-9935', 'P-6635', 'P-5506', 'P-18274', 'P-10674', 'P-1434', 'P-7065', 'P-15427', 'P-7224', 'P-1488', 'P-7873', 'P-6977', 'P-10195', 'P-6514', 'P-6574', 'P-3547', 'P-4814', 'P-4456', 'P-2654', 'P-4598', 'P-9785', 'P-10044', 'P-6881', 'P-2291', 'P-6313', 'P-2139', 'P-6925', 'P-6068', 'P-5803', 'P-2169', 'P-8517', 'P-5554', 'P-1106', 'P-10654', 'P-4483', 'P-15007', 'P-15008', 'P-4432', 'P-4257', 'P-3237', 'P-10560', 'P-7924', 'P-3045', 'P-5339', 'P-3286', 'P-5010', 'P-9830', 'P-5374', 'P-1525', 'P-4150', 'P-9279', 'P-2424', 'P-10459', 'P-2674', 'P-10863', 'P-4752', 'P-3803', 'P-1750', 'P-5557', 'P-5552', 'P-9751', 'P-5012', 'P-5333', 'P-14403', 'P-10891', 'P-15368', 'P-2584', 'P-14995', 'P-8359', 'P-6155', 'P-1916', 'P-8527', 'P-5814', 'P-10197', 'P-3287', 'P-3615', 'P-2307', 'P-4733', 'P-2824', 'P-3677', 'P-7543', 'P-7216', 'P-3664', 'P-5422', 'P-5741', 'P-10125'

In [132]:
Conversions.shape

(0, 6)

In [133]:
Conversions.dtypes

ConversionId      float64
Multiplier        float64
ConvertFromQty    float64
ConvertFromUom    float64
ConvertToQty      float64
ConvertToUom      float64
dtype: object

In [134]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
Conversions.to_csv(path, index = False, header = True)

In [151]:
temp_df = []
for _,row in Ingredients.iterrows():
    if row["Recipe"] in Products["ProdId"].unique():
        temp_df.append(row)
Products_Recipe = pd.DataFrame(temp_df)
Ingredients_temp = Ingredients[Ingredients["Recipe"].isin(Products_Recipe["IngredientId"].unique())]
temp = pd.concat([Ingredients_temp, Products_Recipe])
Ingredients_temp[Ingredients_temp["IngredientId"] == "P-9935"]
# Ingredients = temp

Unnamed: 0,IngredientId,Qty,Uom,Recipe
2,P-9935,14.0,ea,P-2824


In [109]:
items = items[items["ItemId"].isin(Ingredients["IngredientId"].unique())]
Preperations = Preperations[Preperations["PrepId"].isin(Ingredients["Recipe"].unique())]

In [82]:
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
items.to_csv(path, index = False, header = True)
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preperations.to_csv(path, index = False, header = True)

***
## Data Summary

In [54]:
# Summary of raw data imported for evaluation
# Here we have a summary of the number of items, preps, ingredients, products, conversions

datasum = pd.DataFrame([items.shape, Preperations.shape, Ingredients.shape, Products.shape, Conversions.shape],
                       columns = ['count', 'columns'], 
                       index = ['Items', 'Preps', 'Ingredients', 'Products', 'Conversions'])
datasum

Unnamed: 0,count,columns
Items,60,7
Preps,14,5
Ingredients,122,4
Products,32,3
Conversions,0,6
