In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Folder that receives every artifact written by this notebook.
# It is expressed relative to the parent of the current working directory.
_save_folder_parts = (os.pardir, 'resources', 'groceries')
save_folder = os.path.join(*_save_folder_parts)
print(save_folder)

../resources/groceries


In [3]:
# Mapping from the column names found in the CSV to the names used in the
# rest of this notebook.
column_map = {
    'PRODUCT_NAME': 'title',
    'PRODUCT_BRAND_NAME': 'brand',
    'PRODUCT_ID': 'index',
    'L4_CATEGORY': 'category',
    'PRICE': 'price',
}
# Load the raw table and apply the renaming in one step.
grocery_items = pd.read_csv('table_creation.csv', sep=",", engine='python').rename(columns=column_map)
# Keep only the columns whose (renamed) label is a target of column_map;
# everything else is discarded. The original column order is preserved.
_kept_labels = set(column_map.values())
_unwanted_columns = [label for label in grocery_items.columns if label not in _kept_labels]
grocery_items = grocery_items.drop(columns=_unwanted_columns)

In [4]:
# Filtering logic
# Remember the row count before filtering so the number of removed rows
# can be reported below.
old_len = len(grocery_items)
# Keep only rows that have both a title and a category, then renumber the
# row index so it runs 0..n-1 without gaps.
grocery_items = (
    grocery_items
    .dropna(subset=['title', 'category'])
    .reset_index(drop=True)
)
# Sequential 1-based identifier column: 1, 2, ..., number of remaining rows.
grocery_items['id'] = np.arange(len(grocery_items)) + 1
print(f"Filtered out {old_len - len(grocery_items)} items")
grocery_items.head(10)


Filtered out 1012 items


Unnamed: 0,index,price,title,brand,category,id
0,29239089,14.99,Mini Casserole Set - Purple,Crofton,Baking Dishes,1
1,3230772,4.291482,Xtra Cheddar Crackers,Pepperidge Farm® Goldfish® Flavor Blasted®,Crackers and Dried Bread Snacks,2
2,18649218,3.974822,Calorie Free Stevia Sweetener,Sweet Additions,Sweeteners,3
3,49898818,3.99,USA Design 1 Sport Squeeze Bottle,Crane,Reusable Water Bottles,4
4,20060905,3.186295,Everything Skinnys Bagel,L'oven Fresh,Bagels,5
5,20269279,2.449034,Graham Crackers,Benton's,Crackers and Dried Bread Snacks,6
6,20873280,1.424644,Hamburger Buns,L'oven Fresh,Hamburger Buns,7
7,24195606,9.99,Size 6 8 Pack Children's Mine Craft Licensed U...,,Boys Underwear,8
8,20478028,1.657164,Reduced Sodium Soy Sauce,Fusia,Sauces,9
9,20863220,2.423477,Sugar Cookie Mix,Baker's Corner,Baking Mixes,10


In [5]:
# Save to save directory
# Make sure the target folder exists first, so the write below does not fail
# on a fresh checkout where save_folder has not been created yet.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(save_folder, exist_ok=True)
grocery_items.to_feather(os.path.join(save_folder, 'grocery_items.ftr'))

In [9]:
# Placeholder item-to-item similarity matrix: square, filled with zeros,
# with one row and one column per row of grocery_items.
n_items = grocery_items.shape[0]
item_similarity = np.zeros(shape=(n_items, n_items))
# Persist the matrix as item_sim.npy inside save_folder.
_item_sim_path = os.path.join(save_folder, 'item_sim.npy')
np.save(_item_sim_path, item_similarity)

In [10]:
# Load the item table back from the Feather file stored in save_folder.
_items_path = os.path.join(save_folder, 'grocery_items.ftr')
items = pd.read_feather(_items_path)

In [11]:
# Preview the first three rows of the table that was read back.
items.head(n=3)

Unnamed: 0,index,price,title,brand,category,id
0,17011460,1.634,"Pizza Sauce, New York Style",DelGrosso,Sauces,1
1,30828654,4.65,Assorted Chocolate Macarons,Specially Selected,Frozen Novelty Desserts,2
2,19750626,4.301111,Gentle Baby Lotion,Little Journey,Baby Lotions,3
