# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Category ID for Assigned Items

## Imports and setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse

In [2]:
# RUN ONLY ONCE
# os.chdir is used to change the current directory to the specified path
os.chdir("../../") # Sets path to the repo folder as it is one level above where this file exists!
path = os.getcwd()
print(path)

/Users/vivaanwadhwa/Documents/GitHub/CFFS_sharon_2024


***
## Load Data Files

In [3]:
consolidated_df = pd.read_csv(os.getcwd() + "/data/Misc/Consolidated_purchases/OK_consol_purchases_march1_april21.csv")
consolidated_df.head()

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,prod_group_descrip,order_code,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount
0,252033,M123033004,-172.0,69025,-59.0,-123.9,-123.9,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
1,252099,0252099,153.3,69025,15.0,31.5,31.5,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
2,252100,0252099,-153.3,69025,-15.0,-31.5,-31.5,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
3,252099,0252099,153.3,68933,14.0,29.4,29.4,each,2.1,CANDIED COCONUT ALMO,...,CANDIES SNACKS,KK051093,N,1.0,1.0,each,7,Candy,N,0
4,252100,0252099,-153.3,68933,-14.0,-29.4,-29.4,each,2.1,CANDIED COCONUT ALMO,...,CANDIES SNACKS,KK051093,N,1.0,1.0,each,7,Candy,N,0


In [4]:
categories_df = pd.read_csv(os.getcwd() + "/data/Misc/GHG_categories/ghge_factors.csv")
categories_df

Unnamed: 0,Category ID,Food Category,Active Total Supply Chain Emissions (kg CO2 / kg food)
0,1,beef & buffalo meat,41.3463
1,2,lamb/mutton & goat meat,41.6211
2,3,pork (pig meat),9.8315
3,4,"poultry (chicken, turkey)",4.3996
4,5,butter,11.4316
...,...,...,...
58,59,manually adjusted,0.0000
59,60,human labor,0.0000
60,61,kitchen supplies,0.0000
61,62,mushrooms,1.5000


In [5]:
items_list = pd.read_csv(os.getcwd() + '/data/mapping/Items_List_Assigned.csv')
items_list

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-57545,1,CHUCK FLAT BONELESS FZN,3.30,Kg,1.0,Kg,MEAT
1,I-10869,1,BEEF STIRFRY COV FR,5.00,Kg,1.0,Kg,MEAT
2,I-7064,1,BEEF OUTSIDE FLAT AAA,1.00,Kg,1.0,Kg,MEAT
3,I-37005,1,BEEF MEATBALLS,4.54,Kg,1000.0,g,MEAT
4,I-37002,1,BEEF INSIDE ROUND SHAVED,9.00,Kg,1000.0,g,MEAT
...,...,...,...,...,...,...,...,...
2816,I-17622,24,WAFFLES BLUBRRY - GF/WF,12.00,box,6.0,ea,FOOD - GROCERY
2817,I-45754,55,WATER ACTIVE LEMON LIME,12.00,450ML,1.0,450ML,BEVERAGE
2818,I-45755,55,WATER ACTIVE ORANGE MANGO,12.00,450ML,1.0,450ML,BEVERAGE
2819,I-45756,55,WATER ACTIVE STRAWB BLK CHERRY,12.00,450ML,1.0,450ML,BEVERAGE


In [6]:
consolidated_df.columns

Index(['invoice_counter', 'invoice_num', 'invoice_total', 'item_num', 'qty',
       'line_amount', 'adj_line_amount', 'uom', 'unit_cost', 'item_descrip',
       'invoice_date', 'supplier_name', 'store_num', 'type_bill_credit',
       'cost_over_ride', 'line_amount_qty', 'case_uom', 'supplier_num',
       'prod_group_num', 'prod_group_descrip', 'order_code', 'compute_0022',
       'pak_qty', 'case_qty', 'purchase_uom', 'cat_num', 'cat_descrip',
       'has_credit', 'credit_pak_amount'],
      dtype='object')

In [7]:
# if item_num == items_list ItemId then match the category with it
consolidated_df["item_num"] = consolidated_df["item_num"].astype(str) #Just to avoid warnings
for index, row in consolidated_df.iterrows():
    cos_id = "I-" + str(row['item_num'])
    consolidated_df.loc[index, 'item_num'] = cos_id

In [8]:
consolidated_df.head()

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,prod_group_descrip,order_code,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount
0,252033,M123033004,-172.0,I-69025,-59.0,-123.9,-123.9,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
1,252099,0252099,153.3,I-69025,15.0,31.5,31.5,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
2,252100,0252099,-153.3,I-69025,-15.0,-31.5,-31.5,each,2.1,CANDIED CINNAMON ALMOND,...,CANDIES SNACKS,KK051094,N,1.0,1.0,each,7,Candy,N,0
3,252099,0252099,153.3,I-68933,14.0,29.4,29.4,each,2.1,CANDIED COCONUT ALMO,...,CANDIES SNACKS,KK051093,N,1.0,1.0,each,7,Candy,N,0
4,252100,0252099,-153.3,I-68933,-14.0,-29.4,-29.4,each,2.1,CANDIED COCONUT ALMO,...,CANDIES SNACKS,KK051093,N,1.0,1.0,each,7,Candy,N,0


In [9]:
# Create an empty list to store the matching CategoryID values
category_ids = []

# Iterate over each row in consolidated_df
for index, row in consolidated_df.iterrows():
    item_num = row['item_num']
    
    if item_num in items_list['ItemId'].values:
        category_id = items_list.loc[items_list['ItemId'] == item_num, 'CategoryID'].values[0]
        category_ids.append(category_id)
    else:
        category_ids.append(None) 

# Add the category_ids list as a new column to consolidated_df
consolidated_df['Items_assign_category_ID'] = category_ids

In [10]:
consolidated_df.head()

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,order_code,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID
0,252033,M123033004,-172.0,I-69025,-59.0,-123.9,-123.9,each,2.1,CANDIED CINNAMON ALMOND,...,KK051094,N,1.0,1.0,each,7,Candy,N,0,
1,252099,0252099,153.3,I-69025,15.0,31.5,31.5,each,2.1,CANDIED CINNAMON ALMOND,...,KK051094,N,1.0,1.0,each,7,Candy,N,0,
2,252100,0252099,-153.3,I-69025,-15.0,-31.5,-31.5,each,2.1,CANDIED CINNAMON ALMOND,...,KK051094,N,1.0,1.0,each,7,Candy,N,0,
3,252099,0252099,153.3,I-68933,14.0,29.4,29.4,each,2.1,CANDIED COCONUT ALMO,...,KK051093,N,1.0,1.0,each,7,Candy,N,0,
4,252100,0252099,-153.3,I-68933,-14.0,-29.4,-29.4,each,2.1,CANDIED COCONUT ALMO,...,KK051093,N,1.0,1.0,each,7,Candy,N,0,


In [11]:
# Sort the dataframe by 'item_num' and 'qty'
consolidated_df.sort_values(by=['item_num', 'qty'], inplace=True)

# Find the rows to keep
rows_to_keep = []
prev_item_num = None

for index, row in consolidated_df.iterrows():
    if row['item_num'] != prev_item_num:
        rows_to_keep.append(index)
        prev_item_num = row['item_num']
    elif row['qty'] < 0:
        rows_to_keep.pop()
        rows_to_keep.append(index)

# Filter the dataframe to keep the selected rows
unique_rows = consolidated_df.loc[rows_to_keep]

# Display the unique rows
unique_rows

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,order_code,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID
11476,247768,1061818515,216.40,I-10117,1.0,6.90,6.90,pak,6.90,HERB MINT BC 4OZ,...,680804,N,1.0,4.0,pak,2,Food,Y,0,40.0
287,247973,9001094224,8190.64,I-1016,1.0,69.53,69.53,cs,69.53,BROWNIE ROCKY ROAD 12X16IN TFC,...,5801205,N,1.0,2.0,cs,2,Food,N,0,24.0
73,247973,9001094224,8190.64,I-1020,1.0,57.18,57.18,cs,57.18,BAR NANAIMO SCORED 12X16IN,...,1174120,N,1.0,2.0,cs,2,Food,N,0,24.0
215,248086,9001070978,520.60,I-1028,1.0,42.24,42.24,cs,42.24,BREAD NAAN ORIGINAL TEARDROP,...,1290728,N,1.0,48.0,cs,2,Food,N,0,24.0
3381,251703,9001385802,1271.19,I-1033,1.0,51.20,51.20,cs,51.20,PITA GREEK THK 7IN TFC,...,1346507,N,1.0,120.0,cs,2,Food,N,0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,252584,311232,125.84,I-9641,1.0,18.00,18.00,SHEET,18.00,FOCACCIA HERB FULL SHEET,...,3902,N,1.0,1.0,SHEET,2,Food,Y,0,24.0
7016,247853,0247852,-19.40,I-9687,-2.0,-19.40,-19.40,Kg,9.70,PASTE MISO YELLOW SHIRO,...,0412106,N,1.0,1.0,Kg,2,Food,N,0,20.0
6304,248149,9001111671,1062.80,I-9830,1.0,42.66,42.66,cs,42.66,GRAVY MIX POUTINE GF,...,1253421,N,453.0,6.0,cs,2,Food,N,0,58.0
13405,248299,1061818077,1640.75,I-9831,2.0,8.60,8.60,bag,4.30,RADISH BUNCHED 48CT,...,0258025,N,1.0,3.0,bag,2,Food,N,0,36.0


In [12]:
# Filter rows with no NaN values in "Items_assign_category_ID" from unique_rows DataFrame
filtered_df = unique_rows[~pd.isnull(unique_rows['Items_assign_category_ID'])]

# Print the filtered DataFrame
filtered_df


Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,order_code,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID
11476,247768,1061818515,216.40,I-10117,1.0,6.90,6.90,pak,6.90,HERB MINT BC 4OZ,...,680804,N,1.0,4.0,pak,2,Food,Y,0,40.0
287,247973,9001094224,8190.64,I-1016,1.0,69.53,69.53,cs,69.53,BROWNIE ROCKY ROAD 12X16IN TFC,...,5801205,N,1.0,2.0,cs,2,Food,N,0,24.0
73,247973,9001094224,8190.64,I-1020,1.0,57.18,57.18,cs,57.18,BAR NANAIMO SCORED 12X16IN,...,1174120,N,1.0,2.0,cs,2,Food,N,0,24.0
215,248086,9001070978,520.60,I-1028,1.0,42.24,42.24,cs,42.24,BREAD NAAN ORIGINAL TEARDROP,...,1290728,N,1.0,48.0,cs,2,Food,N,0,24.0
3381,251703,9001385802,1271.19,I-1033,1.0,51.20,51.20,cs,51.20,PITA GREEK THK 7IN TFC,...,1346507,N,1.0,120.0,cs,2,Food,N,0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,252584,311232,125.84,I-9641,1.0,18.00,18.00,SHEET,18.00,FOCACCIA HERB FULL SHEET,...,3902,N,1.0,1.0,SHEET,2,Food,Y,0,24.0
7016,247853,0247852,-19.40,I-9687,-2.0,-19.40,-19.40,Kg,9.70,PASTE MISO YELLOW SHIRO,...,0412106,N,1.0,1.0,Kg,2,Food,N,0,20.0
6304,248149,9001111671,1062.80,I-9830,1.0,42.66,42.66,cs,42.66,GRAVY MIX POUTINE GF,...,1253421,N,453.0,6.0,cs,2,Food,N,0,58.0
13405,248299,1061818077,1640.75,I-9831,2.0,8.60,8.60,bag,4.30,RADISH BUNCHED 48CT,...,0258025,N,1.0,3.0,bag,2,Food,N,0,36.0


In [13]:
assigned_data = pd.read_csv(os.getcwd() + '/data/Misc/GHG_categories/GHG_assigned_data.csv')
assigned_data

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Food Category,Category ID
0,247973,9001094224,8190.64,1016,1.0,69.53,69.53,cs,69.53,brownie rocky road 12x16in tfc,...,N,1.0,2.0,cs,2,Food,N,0,"wheat/rye (bread, pasta, baked goods)",24.0
1,248381,9001112003,5367.66,1016,1.0,69.53,69.53,cs,69.53,brownie rocky road 12x16in tfc,...,N,1.0,2.0,cs,2,Food,N,0,"wheat/rye (bread, pasta, baked goods)",24.0
2,248383,9001132129,4406.91,1016,1.0,69.53,69.53,cs,69.53,brownie rocky road 12x16in tfc,...,N,1.0,2.0,cs,2,Food,N,0,"wheat/rye (bread, pasta, baked goods)",24.0
3,248663,9001144510,9116.11,1016,1.0,69.53,69.53,cs,69.53,brownie rocky road 12x16in tfc,...,N,1.0,2.0,cs,2,Food,N,0,"wheat/rye (bread, pasta, baked goods)",24.0
4,248997,9001185341,7931.74,1016,1.0,69.53,69.53,cs,69.53,brownie rocky road 12x16in tfc,...,N,1.0,2.0,cs,2,Food,N,0,"wheat/rye (bread, pasta, baked goods)",24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18381,249360,5200356,213.43,70875,2.0,87.36,87.36,BX,43.68,beef medium grd,...,N,1.0,5.0,BX,2,Food,N,0,beef & buffalo meat,1.0
18382,251688,5204180,265.48,70875,2.0,89.56,89.56,BX,44.78,beef medium grd,...,N,1.0,5.0,BX,2,Food,Y,0,beef & buffalo meat,1.0
18383,252734,5206230,265.48,70875,2.0,89.56,89.56,BX,44.78,beef medium grd,...,N,1.0,5.0,BX,2,Food,Y,0,beef & buffalo meat,1.0
18384,248077,05197159,1042.32,70875,18.0,796.32,796.32,BX,44.24,beef medium grd,...,N,1.0,5.0,BX,2,Food,Y,0,beef & buffalo meat,1.0


In [14]:
# filtered_df.rename(columns={'Items_assign_category_ID': 'Category ID'}, inplace=True)
assigned_data["item_num"] = assigned_data["item_num"].astype(str) #Just to avoid warnings
for index, row in assigned_data.iterrows():
    cos_id = "I-" + str(row['item_num'])
    assigned_data.loc[index, 'item_num'] = cos_id

In [15]:
for index, row in filtered_df.iterrows():
    get_r = assigned_data.loc[assigned_data['item_num'] == row['item_num']]
    if not get_r.empty:
        Category_ID = get_r['Category ID'].values[0]
        filtered_df.loc[index, 'Category ID'] = Category_ID

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[index, 'Category ID'] = Category_ID


In [16]:
filtered_df

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID,Category ID
11476,247768,1061818515,216.40,I-10117,1.0,6.90,6.90,pak,6.90,HERB MINT BC 4OZ,...,N,1.0,4.0,pak,2,Food,Y,0,40.0,63.0
287,247973,9001094224,8190.64,I-1016,1.0,69.53,69.53,cs,69.53,BROWNIE ROCKY ROAD 12X16IN TFC,...,N,1.0,2.0,cs,2,Food,N,0,24.0,24.0
73,247973,9001094224,8190.64,I-1020,1.0,57.18,57.18,cs,57.18,BAR NANAIMO SCORED 12X16IN,...,N,1.0,2.0,cs,2,Food,N,0,24.0,24.0
215,248086,9001070978,520.60,I-1028,1.0,42.24,42.24,cs,42.24,BREAD NAAN ORIGINAL TEARDROP,...,N,1.0,48.0,cs,2,Food,N,0,24.0,24.0
3381,251703,9001385802,1271.19,I-1033,1.0,51.20,51.20,cs,51.20,PITA GREEK THK 7IN TFC,...,N,1.0,120.0,cs,2,Food,N,0,24.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,252584,311232,125.84,I-9641,1.0,18.00,18.00,SHEET,18.00,FOCACCIA HERB FULL SHEET,...,N,1.0,1.0,SHEET,2,Food,Y,0,24.0,24.0
7016,247853,0247852,-19.40,I-9687,-2.0,-19.40,-19.40,Kg,9.70,PASTE MISO YELLOW SHIRO,...,N,1.0,1.0,Kg,2,Food,N,0,20.0,58.0
6304,248149,9001111671,1062.80,I-9830,1.0,42.66,42.66,cs,42.66,GRAVY MIX POUTINE GF,...,N,453.0,6.0,cs,2,Food,N,0,58.0,54.0
13405,248299,1061818077,1640.75,I-9831,2.0,8.60,8.60,bag,4.30,RADISH BUNCHED 48CT,...,N,1.0,3.0,bag,2,Food,N,0,36.0,


In [17]:
#Change by Vivaan on 28th May
# Create a DataFrame containing only rows where 'Items_assign_category_ID' is not equal to 'Category ID'
need_to_manually_check_data = filtered_df[filtered_df['Items_assign_category_ID'] != filtered_df['Category ID']]


In [18]:
# check to see if automated approach is correctly assigning items to the right category
need_to_manually_check_data

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,compute_0022,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID,Category ID
11476,247768,1061818515,216.40,I-10117,1.0,6.90,6.90,pak,6.90,HERB MINT BC 4OZ,...,N,1.0,4.0,pak,2,Food,Y,0,40.0,63.0
8865,247877,9001094408,7899.82,I-10503,4.0,344.56,344.56,cs,86.14,HOT DOG PLANT BASED WIENER FZN,...,N,4.0,1.0,cs,2,Food,N,0,1.0,
8420,249278,1061820702,2206.80,I-10611,1.0,49.25,49.25,cs,49.25,WON TON WRAPPER,...,N,1.0,1.0,lb,2,Food,N,0,24.0,
553,247972,9001070965,4973.26,I-1064,1.0,111.33,111.33,cs,111.33,CAKE STRAWBERRY CREAM TFC,...,N,30.0,2.0,cs,2,Food,N,0,24.0,8.0
11514,248312,1061818519,4630.10,I-10758,1.0,11.30,11.30,BUNCH,11.30,HERB TARRAGON BC 4oz,...,N,4.0,1.0,BUNCH,2,Food,Y,0,58.0,63.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8354,248354,9001070972,5360.83,I-9382,1.0,19.29,19.29,cs,19.29,VINEGAR APPLE CIDER PURE,...,N,5.0,2.0,cs,2,Food,N,0,57.0,32.0
7016,247853,0247852,-19.40,I-9687,-2.0,-19.40,-19.40,Kg,9.70,PASTE MISO YELLOW SHIRO,...,N,1.0,1.0,Kg,2,Food,N,0,20.0,58.0
6304,248149,9001111671,1062.80,I-9830,1.0,42.66,42.66,cs,42.66,GRAVY MIX POUTINE GF,...,N,453.0,6.0,cs,2,Food,N,0,58.0,54.0
13405,248299,1061818077,1640.75,I-9831,2.0,8.60,8.60,bag,4.30,RADISH BUNCHED 48CT,...,N,1.0,3.0,bag,2,Food,N,0,36.0,


In [19]:
# Unfilter rows with NaN values in "Items_assign_category_ID" and unique item_num
# unfiltered_df = consolidated_df[pd.isnull(consolidated_df['Items_assign_category_ID']) & ~consolidated_df['item_num'].duplicated(keep=False)]
unfiltered_df = unique_rows[pd.isnull(unique_rows['Items_assign_category_ID'])]

for index, row in unfiltered_df.iterrows():
    get_r = assigned_data.loc[assigned_data['item_num'] == row['item_num']]
    if not get_r.empty:
        Category_ID = get_r['Category ID'].values[0]
        unfiltered_df.loc[index, 'Category ID'] = Category_ID

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unfiltered_df.loc[index, 'Category ID'] = Category_ID


In [20]:
# Create a DataFrame containing only rows where 'Items_assign_category_ID' is not equal to 'Category ID'
not_assigned_from_list_method = unfiltered_df[unfiltered_df['Items_assign_category_ID'] != unfiltered_df['Category ID']]

In [21]:
# Create a dictionary mapping Category ID to Food Category from categories_df
category_dict = categories_df.set_index('Category ID')['Food Category'].to_dict()

# Create a new column "Category" in unfiltered_df based on the Category ID values
unfiltered_df['Category'] = unfiltered_df['Category ID'].map(category_dict)

# Print the updated unfiltered_df DataFrame
unfiltered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unfiltered_df['Category'] = unfiltered_df['Category ID'].map(category_dict)


Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID,Category ID,Category
12585,248143,1061818523,711.90,I-10482,1.0,6.20,6.20,cs,6.20,ONIONS JUMBO 5lb,...,1.0,5.0,cs,2,Food,N,0,,39.0,onions and leeks
519,247972,9001070965,4973.26,I-1063,1.0,50.31,50.31,CA,50.31,CAKE SHEET SLAB ORANGE CITRUS,...,1.0,2.0,CA,2,Food,N,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
671,250775,9001346509,1370.36,I-1084,1.0,28.60,28.60,cs,28.60,COOKIE DBL CHOC CHIP T/S,...,1.0,72.0,cs,2,Food,N,0,,41.0,potatoes
1404,251903,9001459647,6270.89,I-1086,4.0,218.56,218.56,cs,54.64,COOKIE DOUGH CHOC CHIP N/HYDRO,...,1.0,128.0,cs,2,Food,Y,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
1356,249799,9001259668,5011.22,I-1091,2.0,116.60,116.60,cs,58.30,COOKIE DOUGH WHITE CHOC MACA,...,1.0,128.0,cs,2,Food,N,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15316,250914,00022966,357.14,I-70756,1.0,70.00,70.00,ea,70.00,CK|BULK|Seasonal Chia/ Oat 8kg,...,1.0,1.0,ea,2,Food,N,0,,26.0,tree nuts and seeds
8603,251629,0000869104,968.35,I-70806,4.0,192.00,192.00,TUB,48.00,ICE CREAM ESPRESSO 11.4L,...,11.4,1.0,TUB,2,Food,N,0,,7.0,ice cream
8612,248410,0000860921,1872.55,I-70837,4.0,192.00,192.00,TUB,48.00,ICE CREAM VANILLA 11.4L,...,11.4,1.0,TUB,2,Food,N,0,,7.0,ice cream
8750,247689,5197332,263.28,I-70875,2.0,87.36,87.36,BX,43.68,BEEF MEDIUM GRD,...,1.0,5.0,BX,2,Food,N,0,,1.0,beef & buffalo meat


In [22]:
unfiltered_df.to_csv(os.getcwd() + '/data/Misc/GHG_categories/selected_unassigned_GHG_categories.csv', index = False)

In [23]:
unfiltered_df.shape

(286, 32)

In [24]:
unfiltered_df.head(60)

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID,Category ID,Category
12585,248143,1061818523,711.9,I-10482,1.0,6.2,6.2,cs,6.2,ONIONS JUMBO 5lb,...,1.0,5.0,cs,2,Food,N,0,,39.0,onions and leeks
519,247972,9001070965,4973.26,I-1063,1.0,50.31,50.31,CA,50.31,CAKE SHEET SLAB ORANGE CITRUS,...,1.0,2.0,CA,2,Food,N,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
671,250775,9001346509,1370.36,I-1084,1.0,28.6,28.6,cs,28.6,COOKIE DBL CHOC CHIP T/S,...,1.0,72.0,cs,2,Food,N,0,,41.0,potatoes
1404,251903,9001459647,6270.89,I-1086,4.0,218.56,218.56,cs,54.64,COOKIE DOUGH CHOC CHIP N/HYDRO,...,1.0,128.0,cs,2,Food,Y,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
1356,249799,9001259668,5011.22,I-1091,2.0,116.6,116.6,cs,58.3,COOKIE DOUGH WHITE CHOC MACA,...,1.0,128.0,cs,2,Food,N,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
1409,249799,9001259668,5011.22,I-1094,2.0,106.78,106.78,cs,53.39,COOKIE DOUGH MONSTER N/HYDRO,...,1.0,128.0,cs,2,Food,N,0,,24.0,"wheat/rye (bread, pasta, baked goods)"
1608,251190,2000155272,-145.52,I-11052,-1.0,-40.64,-40.64,ea,40.64,CO2 CANISTER,...,1.0,1.0,ea,2,Food,N,0,,61.0,kitchen supplies
9373,251701,1061826032,216.25,I-11078,1.0,9.75,9.75,cs,9.75,APPLES GALA ROYAL lb,...,1.0,38.0,CA,2,Food,N,0,,32.0,apples
17487,249084,IN161559,630.22,I-11173,1.0,74.74,74.74,cs,74.74,PEPPER BLK CRACKED,...,454.0,5.0,cs,2,Food,N,0,,54.0,stimulants & spices misc.
8393,248533,2035373,4083.92,I-11575,1.0,61.11,61.11,ea,61.11,VINEGAR SUSHI SUMO,...,20.0,1.0,CT,2,Food,N,0,,57.0,vinegar


In [25]:
unfiltered_df.iloc[60:78, :]

Unnamed: 0,invoice_counter,invoice_num,invoice_total,item_num,qty,line_amount,adj_line_amount,uom,unit_cost,item_descrip,...,pak_qty,case_qty,purchase_uom,cat_num,cat_descrip,has_credit,credit_pak_amount,Items_assign_category_ID,Category ID,Category
17800,251169,9001346513,210.09,I-23898,2.0,10.72,10.72,ea,5.36,STRIP TEST PPR QUAT SANITIZER,...,1.0,1.0,ea,4,Misc.,N,0,,61.0,kitchen supplies
18078,249022,9001185335,7277.82,I-2392,1.0,12.21,12.21,cs,12.21,FIRST AID BANDAGE FABRIC BLUE,...,1.0,100.0,cs,4,Misc.,N,0,,61.0,kitchen supplies
18244,250173,9001238103,1701.47,I-2411,1.0,15.54,15.54,pak,15.54,LBL FOOD ROTATION DISSOLVE 2X3,...,250.0,1.0,pak,4,Misc.,N,0,,61.0,kitchen supplies
18265,247973,9001094224,8190.64,I-2422,1.0,78.58,78.58,cs,78.58,PAD GRIDDLE POLS 46 BLK 4X5.25,...,20.0,3.0,cs,4,Misc.,N,0,,61.0,kitchen supplies
18285,249851,9001271332,7700.61,I-2428,4.0,42.56,42.56,cs,10.64,STRING BUTCHER MED WEIGHT #4R,...,1.0,1.0,cs,4,Misc.,N,0,,61.0,kitchen supplies
11172,248383,9001132129,4406.91,I-2616,2.0,126.68,126.68,cs,63.34,HASHBROWN DCD RISE N FRY TFC,...,5.0,6.0,cs,2,Food,N,0,,41.0,potatoes
1524,247784,71990893,609.06,I-2631,6.0,190.8,190.8,PL,31.8,MUFFIN BATT CARROT TFC,...,1.0,8.0,CA,2,Food,N,0,,38.0,root vegetables
17649,249480,2036658,733.2,I-26887,1.0,62.0,62.0,ea,62.0,SAKE,...,18.0,1.0,BIB 20L,8,"Liq,Wine,Beer",N,0,,51.0,wine grapes (wine)
7983,248126,9001081628,504.98,I-2716,1.0,49.03,49.03,cs,49.03,SOUP CHOWDER CLAM BOSTON,...,1.81,3.0,cs,2,Food,N,0,,59.0,manually adjusted
8001,248593,9001111666,352.06,I-2752,1.0,44.8,44.8,cs,44.8,SOUP VEG BEEF BARLEY NO MSG,...,1.81,1.0,cs,2,Food,N,0,,1.0,beef & buffalo meat


In [26]:
consolidated_df.to_csv(os.getcwd() + '/data/Misc/GHG_categories/GHG_categories_assigned_with_Items_List.csv', index = False)

In [27]:
items_list

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-57545,1,CHUCK FLAT BONELESS FZN,3.30,Kg,1.0,Kg,MEAT
1,I-10869,1,BEEF STIRFRY COV FR,5.00,Kg,1.0,Kg,MEAT
2,I-7064,1,BEEF OUTSIDE FLAT AAA,1.00,Kg,1.0,Kg,MEAT
3,I-37005,1,BEEF MEATBALLS,4.54,Kg,1000.0,g,MEAT
4,I-37002,1,BEEF INSIDE ROUND SHAVED,9.00,Kg,1000.0,g,MEAT
...,...,...,...,...,...,...,...,...
2816,I-17622,24,WAFFLES BLUBRRY - GF/WF,12.00,box,6.0,ea,FOOD - GROCERY
2817,I-45754,55,WATER ACTIVE LEMON LIME,12.00,450ML,1.0,450ML,BEVERAGE
2818,I-45755,55,WATER ACTIVE ORANGE MANGO,12.00,450ML,1.0,450ML,BEVERAGE
2819,I-45756,55,WATER ACTIVE STRAWB BLK CHERRY,12.00,450ML,1.0,450ML,BEVERAGE


In [28]:
df_to_be_added = unfiltered_df.rename(columns={"item_num": "ItemId", "Category ID": "CategoryID", "item_descrip": "Description", "case_qty": "CaseQty", "case_uom": "CaseUOM", "pak_qty": "PakQty", "uom": "PakUOM", "prod_group_descrip": "InventoryGroup"})

In [29]:
df_to_be_added = df_to_be_added[["ItemId", "CategoryID", "Description", "CaseQty", "CaseUOM", "PakQty", "PakUOM", "InventoryGroup"]]

In [32]:
df_to_be_added

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
12585,I-10482,39.0,ONIONS JUMBO 5lb,5.0,lb,1.0,cs,PRODUCE
519,I-1063,24.0,CAKE SHEET SLAB ORANGE CITRUS,2.0,SHEET,1.0,CA,BAKED GOODS
671,I-1084,41.0,COOKIE DBL CHOC CHIP T/S,72.0,ea,1.0,cs,BAKED GOODS
1404,I-1086,24.0,COOKIE DOUGH CHOC CHIP N/HYDRO,128.0,CT,1.0,cs,BAKING-RAW INGREDIENTS
1356,I-1091,24.0,COOKIE DOUGH WHITE CHOC MACA,128.0,each,1.0,cs,BAKING-RAW INGREDIENTS
...,...,...,...,...,...,...,...,...
15316,I-70756,26.0,CK|BULK|Seasonal Chia/ Oat 8kg,1.0,ea,1.0,ea,PRODUCTION FOOD
8603,I-70806,7.0,ICE CREAM ESPRESSO 11.4L,1.0,TUB,11.4,TUB,ICECREAM/NOVELTY/GELATO
8612,I-70837,7.0,ICE CREAM VANILLA 11.4L,1.0,TUB,11.4,TUB,ICECREAM/NOVELTY/GELATO
8750,I-70875,1.0,BEEF MEDIUM GRD,5.0,Kg,1.0,BX,MEAT


In [30]:
# Append df_to_be_added to items_list

# CHECK ALL ITEMS CAREFULLY BEFORE DOING: 

# items_list = pd.concat([items_list, df_to_be_added], axis=0)
# items_list.tail(10)

In [31]:
# CHECK ALL ITEMS CAREFULLY BEFORE DOING: 

# items_list.to_csv(os.getcwd() + '/data/GHG_categories/Items_List_Assigned.csv', index = False)