In [1]:
import os
import numpy as np
import pandas as pd

In [3]:
# Output location for everything this notebook writes:
# the 'resources/groceries' directory one level above the working directory.
_save_parts = (os.pardir, 'resources', 'groceries')
save_folder = os.path.join(*_save_parts)
print(save_folder)

../resources/groceries


In [4]:
# Read the raw item table from the CSV in the working directory.
grocery_items = pd.read_csv('grocery_items.csv', sep=",", engine='python')

# Source column name -> column name used from here on.
column_map = {
    'PRODUCT_NAME': 'title',
    'PRODUCT_BRAND_NAME': 'brand',
    'PRODUCT_ID': 'index',
    'L4_CATEGORY': 'category',
    'PRICE': 'price',
}
grocery_items = grocery_items.rename(columns=column_map)

# Keep only columns whose (renamed) label is one of the mapped target names;
# everything else is discarded. Original column order is preserved.
kept_names = set(column_map.values())
extra_columns = [label for label in grocery_items.columns if label not in kept_names]
grocery_items = grocery_items.drop(columns=extra_columns)

In [5]:
# Filtering logic: discard rows lacking a title or a category, renumber the
# surviving rows, and report how many rows were removed.
old_len = len(grocery_items)
grocery_items = (
    grocery_items
    .dropna(subset=['title', 'category'])  # a missing value in either column drops the row
    .reset_index(drop=True)                # row labels become 0..n-1 again
)
# Sequential 1-based identifier stored in the 'id' column
# (separate from the 'index' column that came from the source data).
grocery_items['id'] = np.arange(1, len(grocery_items) + 1)
print(f"Filtered out {old_len - len(grocery_items)} items")
grocery_items.head(10)


Filtered out 533 items


Unnamed: 0,index,price,title,brand,category,id
0,17011460,1.634,"Pizza Sauce, New York Style",DelGrosso,Sauces,1
1,30828654,4.65,Assorted Chocolate Macarons,Specially Selected,Frozen Novelty Desserts,2
2,19750626,4.301111,Gentle Baby Lotion,Little Journey,Baby Lotions,3
3,20193327,4.493125,"Soft Spreadable Cheese, Garden Vegetable",Alouette,Cheese Dips and Spreads,4
4,20725052,3.47,Sharp Cheddar Cheese Spread,Emporium Selection,Cheese Dips and Spreads,5
5,28033040,4.680526,Non-GMO Grapeseed Oil,Simply Nature,Cooking Oils,6
6,20537082,2.334308,Peanut Butter Filled Cookies,Benton's,Packaged Baked Goods,7
7,28127404,5.29,"Italian Sausage With Roasted Potatoes, Peppers...",Bremer,Italian Sausage,8
8,25901190,0.673786,Organic Bananas,,Bananas,9
9,20363182,3.49,Mango Passion 100% Juice,Nature's Nectar,Fruit Juice Blends,10


In [8]:
# Save to save directory.
# Create the directory first (a no-op when it already exists) so the write
# below does not fail when the target directory is missing.
os.makedirs(save_folder, exist_ok=True)
grocery_items.to_feather(os.path.join(save_folder, 'grocery_items.ftr'))

In [9]:
# Placeholder item-item similarity matrix: square, all zeros,
# with one row and one column per row of grocery_items.
n_items = grocery_items.shape[0]
item_similarity = np.zeros(shape=(n_items, n_items))
# Write the matrix to item_sim.npy inside save_folder
sim_path = os.path.join(save_folder, 'item_sim.npy')
np.save(sim_path, item_similarity)

In [10]:
# Load grocery_items.ftr from save_folder into a separate DataFrame
feather_path = os.path.join(save_folder, 'grocery_items.ftr')
items = pd.read_feather(feather_path)

In [11]:
# Show the first three rows of `items`
items.iloc[:3]

Unnamed: 0,index,price,title,brand,category,id
0,17011460,1.634,"Pizza Sauce, New York Style",DelGrosso,Sauces,1
1,30828654,4.65,Assorted Chocolate Macarons,Specially Selected,Frozen Novelty Desserts,2
2,19750626,4.301111,Gentle Baby Lotion,Little Journey,Baby Lotions,3
