This notebook extracts product names, brand names, and category names from the input data. We will use this data to fine-tune a spaCy NER model to identify the same entities in incoming queries.


In [1]:
import pandas as pd
import pickle


In [2]:
# Load the pre-processed product metadata from its pickle file.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# point this at a file that was produced by a trusted pipeline.

prod_meta_fn = 'proc_prod_meta_data_may22.pkl'

with open(prod_meta_fn, mode='rb') as meta_file:
    prod_meta_data_list = pickle.load(meta_file)

In [3]:
# Peek at the first record to see which fields a product entry carries.

prod_meta_data_list[0]

{'asin': '0011300000',
 'title': 'Genuine Geovision 1 Channel 3rd Party NVR IP Software with USB Dongle Onvif PSIA',
 'description': 'The following camera brands and models have been tested for compatibility with GV-Software. GeoVision  ACTi  Arecont Vision  AXIS  Bosch  Canon CNB  D-Link  EtroVision  HikVision  HUNT  IQEye JVC  LG  MOBOTIX  Panasonic  Pelco  Samsung Sanyo  Sony  UDP  Verint  VIVOTEK      Compatible Standard and Protocol GV-System also allows for integration with all other IP video devices compatible with ONVIF(V2.0), PSIA (V1.1) standards, or RTSP protocol. ONVIF  PSIA  RTSP          Note: Specifications are subject to change without notice. Every effort has been made to ensure that the information on this Web site is accurate. No liability is assumed for incidental or consequential damages arising from the use of the information or products contained herein.',
 'category': ['Electronics',
  'Camera &amp; Photo',
  'Video Surveillance',
  'Surveillance Systems',
  'Su

In [11]:
# Collect the raw (not yet de-duplicated) strings used for NER model training:
# product titles, brand names and category names.
# Every accumulator, including the record counter, is (re)initialised here so
# the cell works on a fresh kernel and gives the same result when re-run.
prod_name_list = []
brand_list = []
category_list = []
count = 0

for ind_prod in prod_meta_data_list:
    # Test for key presence instead of comparing against a 'Z' placeholder,
    # so a product or brand that is literally named 'Z' is not dropped.
    if 'title' in ind_prod:
        prod_name_list.append(ind_prod['title'])
    if 'brand' in ind_prod:
        brand_list.append(ind_prod['brand'])
    # A record with a missing, None or empty 'category' contributes nothing
    # instead of raising KeyError / TypeError and aborting the whole pass.
    category_list.extend(ind_prod.get('category') or [])
    count += 1

print ('total records processed ...    ', count)

total records processed ...     266055


In [12]:
# Report how many raw (not yet de-duplicated) values were collected per entity type.
for label, values in (
    ('prod ', prod_name_list),
    ('brand ', brand_list),
    ('category  ', category_list),
):
    print (label, len(values))

prod  266050
brand  266049
category   1191531


In [13]:
# Collapse each collection to its unique values. The names are kept as-is
# (even though they now hold sets) because the following cells refer to them.
prod_name_list, brand_list, category_list = map(
    set, (prod_name_list, brand_list, category_list)
)

In [14]:
# Number of unique values per entity type after de-duplication.

unique_labels = ('prod ', 'brand ', 'category  ')
unique_values = (prod_name_list, brand_list, category_list)

for label, values in zip(unique_labels, unique_values):
    print (label, len(values))

prod  250768
brand  20194
category   4136


In [18]:
# Persist the unique product names so the spaCy NER training step can load them.

prod_name_fn = 'unique_prod_names_may28.pkl'

with open(prod_name_fn, mode='wb') as prod_out:
    pickle.dump(prod_name_list, prod_out)

In [16]:
# Persist the unique brand names alongside the product names.
brand_name_fn = 'unique_brand_names_may28.pkl'

with open(brand_name_fn, mode='wb') as brand_out:
    pickle.dump(brand_list, brand_out)

In [17]:
# Persist the unique category names alongside the product and brand names.
cat_name_fn = 'unique_category_names_may28.pkl'

with open(cat_name_fn, mode='wb') as category_out:
    pickle.dump(category_list, category_out)