This notebook uses pre-processed data (in pickle format) and
loads the same into weaviate (and/or other) vector databases.

Initial experiments are on weaviate.

In [1]:
import pickle
import _pickle as cPickle
import numpy
import json


done


Read in data from pre-processed files. (instead of reading from local storage attached to the VM, we can also directly read from the storage bucket)

In [2]:
# load metadata file - written using pickle
prod_meta_fn = 'proc_prod_meta_data_may22.pkl'

with open(prod_meta_fn, 'rb') as f:
    prod_meta_list = pickle.load(f)

In [3]:
# load reviews data - file created using _pickle 

prod_review_fn = 'proc_review_data_may22.pkl'

with open(prod_review_fn, 'rb') as f1:
    rev_list = cPickle.load(f1)

In [4]:
print ( 'num of products ...   ', len(prod_meta_list))
print (' num of reviews ....  ', len(rev_list)) 

num of products ...    266050
 num of reviews ....   20980243


Create Weaviate Schema -- for the time being declaring only a limited number
of fields as 'text' and rest as 'string'. Only text data gets vectorized.

In [2]:
import sys
!{sys.executable} -m pip install weaviate-client

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# initialize weaviate client
import weaviate

# for sandbox 
#weaviate_url = ' https://mlopssearchtest.semi.network'
#secret = weaviate.AuthClientPassword("sridiyer@gmail.com", "xxx")

weaviate_url = 'http://34.67.249.252:8080/'
secret = weaviate.AuthClientPassword("admin", "admin")

# Initiate the client with the secret
client = weaviate.Client(weaviate_url, secret)

In [17]:
product_class_schema = {
    "class": "Product",
    "description" : "A Product class to store the product information and its reviewers",
    "properties" : [
                 {
                    "name": "asin",
                    "dataType": ["string"],
                    "description": "product id from amazon",
                },
                {
                    "name": "title",
                    "dataType": ["text"],
                    "description": "The title of the product",
                },
                {
                    "name": "description",
                    "dataType": ["text"],
                    "description": "The product description",
                },
                {                
                    "name": "category",
                    "dataType": ["string[]"],
                    "description": "The product category",
                },
                {                
                    "name": "mainCategory",
                    "dataType": ["text"],
                    "description": "The product main category",
                },
                {                
                    "name": "brand",
                    "dataType": ["string"],
                    "description": "The product brand",
                },
                {                
                    "name": "feature",
                    "dataType": ["text[]"],
                    "description": "The product features",
                },                
                {
                    "name": "price",
                    "description": "The price of the wine in dollars",
                    "dataType": ["number"]
                },
                {
                    "name": "hasReviews",
                    "dataType": ["Review"],
                    "description": "List of reviews this product listing has ",
                }                   
    ]
}

In [26]:
review_class_schema = {
    "class": "Review",
    "description": "An Review class to store the review information",
    "properties": [
                {
                    "name": "overall",
                    "dataType": ["number"],
                    "description": "overall score from user ",
                },
                {
                    "name": "reviewText",
                    "dataType": ["text"],
                    "description": "review text",
                },
                {
                    "name": "summary",
                    "dataType": ["text"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "productasin",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "reviewerid",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "reviwerName",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "ofProduct",
                    "dataType": ["Product"],
                    "description": "The product of the review",
                } 
    ]
}


In [5]:
# helper function
def prettify(json_dict): 
    print(json.dumps(json_dict, indent=2))

In [28]:
# create product class

client.schema.create_class(product_class_schema)

#prettify(client.schema.get())

UnexpectedStatusCodeException: Add properties to classes! Unexpected status code: 422, with response body: {'error': [{'message': "property 'hasReviews': invalid dataType: SingleRef class name 'Review' does not exist"}]}

In [29]:
client.schema.create_class(review_class_schema)
#prettify(client.schema.get())

In [30]:
# need to add review to product class (when it was created, hasReviews was not created)
# this is the 9 the property.

client.schema.property.create('Product', product_class_schema['properties'][8])

In [27]:
## DO NOT USE - if we make a mistake, we can delete and rerun.
schema = client.schema.get() # save schema
client.schema.delete_all() # delete all classes
prettify(client.schema.get())

{
  "classes": []
}


In [6]:
prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Product",
      "description": "A Product class to store the product information and its reviewers",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-contextionary": {
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "string"
          ],
          "description": "product id from amazon",
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "asin",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The title

**Load Data**
Schema Created - now populate individual objects into the database.

In [18]:
for ind in prod_meta_list:
    print (ind)
    break

{'asin': '0011300000', 'title': 'Genuine Geovision 1 Channel 3rd Party NVR IP Software with USB Dongle Onvif PSIA', 'description': 'The following camera brands and models have been tested for compatibility with GV-Software. GeoVision  ACTi  Arecont Vision  AXIS  Bosch  Canon CNB  D-Link  EtroVision  HikVision  HUNT  IQEye JVC  LG  MOBOTIX  Panasonic  Pelco  Samsung Sanyo  Sony  UDP  Verint  VIVOTEK      Compatible Standard and Protocol GV-System also allows for integration with all other IP video devices compatible with ONVIF(V2.0), PSIA (V1.1) standards, or RTSP protocol. ONVIF  PSIA  RTSP          Note: Specifications are subject to change without notice. Every effort has been made to ensure that the information on this Web site is accurate. No liability is assumed for incidental or consequential damages arising from the use of the information or products contained herein.', 'category': ['Electronics', 'Camera &amp; Photo', 'Video Surveillance', 'Surveillance Systems', 'Surveillance 

In [31]:
import uuid

loaded_products = {} # list of dict

test_count = 0
for ind_prod in prod_meta_list:
    prod_uuid = uuid.uuid3(uuid.NAMESPACE_URL, "http://aws.amazon.com/prod/asin/"+ind_prod['asin'])
    resp1 = client.data_object.create(
        data_object=ind_prod,
        class_name='Product',
        uuid=prod_uuid 
    )
    #print (' response to prod insert  ', resp1)
    
    loaded_products [ind_prod['asin']] = prod_uuid
    
    # for debugging purpsoe only
    test_count += 1
    if test_count == 1000:
        break
        



In [33]:
print(rev_count)

3230


In [32]:
# keep track of asins of products already loaded - from reviews
# this list may or may not be needed
loaded_reviews_asin = [] 

rev_count = 0
# this list is from product load - previous step (not rev load)
prods_already_loaded = loaded_products.keys() # list of asin ids
review_prod_xref = {} 

for ind_rev in rev_list:
    
    if not ind_rev['productasin'] in prods_already_loaded:
        continue
        
    rev_uuid = uuid.uuid3(uuid.NAMESPACE_URL, "http://google.cnn.com/news/"+str(ind_rev['summary'])+ind_rev['reviewerid'])
    

    resp2 = client.data_object.create(
        data_object=ind_rev,
        class_name='Review',
        uuid=rev_uuid 
    )
    #print ('resp to rev insert ...', resp2, ' prod asin', ind_rev['productasin'])
    
    
    productasin = ind_rev['productasin']
      # add review to product dict
    if productasin in loaded_reviews_asin:
        review_prod_xref[productasin].append(rev_uuid)
    else:
        review_prod_xref[productasin] = [ rev_uuid ]
    loaded_reviews_asin.append(productasin)
    
    rev_count += 1
    if rev_count == 50000:
        break
        

ObjectAlreadyExistsException: 3d5b6468-7a5a-3c0c-99fd-97de21a60798

In [34]:
## ISSUE 1 - product to reviews - one product has multiple reviews. reference do not
## appear to accept (or we may have the schema wrong) list of review IDs.
## i am using one review here - this needs to be debugged

for (k, v) in review_prod_xref.items():

    prod_uuid = str(loaded_products[k])

    # add references
    ## Product to Reviews
    client.data_object.reference.add(
        from_uuid=prod_uuid,
        from_property_name='hasReviews',
        to_uuid=str(v[0])
    )
    ## reviews -> product
    for irev in v:
        client.data_object.reference.add(
            from_uuid=str(irev),
            from_property_name='ofProduct',
            to_uuid=prod_uuid
      )


**SAMPLE QUERIES**


In [7]:
prettify(client.data_object.get('856dd276-bd34-35d5-9655-a05ca9cf29db', with_vector=False))

{
  "class": "Review",
  "creationTimeUnix": 1653268061425,
  "id": "856dd276-bd34-35d5-9655-a05ca9cf29db",
  "lastUpdateTimeUnix": 1653268133243545650,
  "properties": {
    "ofProduct": [
      {
        "beacon": "weaviate://localhost/b9a89291-cba2-365a-8903-1bfca1f60d7f",
        "href": "/v1/objects/b9a89291-cba2-365a-8903-1bfca1f60d7f"
      }
    ],
    "overall": 5,
    "productasin": "0594459451",
    "reviewText": "Everything as expected, timely, excellent condition.",
    "reviewerid": "AXRE3OP9KPRHH",
    "reviwerName": "Jessica Long-Zlotkowski",
    "summary": "excellent condition."
  },
  "vectorWeights": null
}


In [8]:
prettify(client.data_object.get('b9a89291-cba2-365a-8903-1bfca1f60d7f', with_vector=False))

{
  "class": "Product",
  "creationTimeUnix": 1653268046582,
  "id": "b9a89291-cba2-365a-8903-1bfca1f60d7f",
  "lastUpdateTimeUnix": 1653268132818032141,
  "properties": {
    "asin": "0594459451",
    "brand": "Barnes &amp; Noble",
    "category": [
      "Electronics",
      "eBook Readers &amp; Accessories",
      "Power Cables"
    ],
    "description": "Original Barnes &amp; Noble Nook Color or Tablet Power and Connectivity Kit charging cable for the Nook Color(BNRV200) and Tablet(BNTV250).",
    "feature": [
      "<span>\n        BUY MORE AND SAVE! Purchase 2 of this Item and SAVE 25% Buy 3 SAVE 30% Buy 4 SAVE 32%\n        \n        \n        <span class=\"a-declarative\" data-action=\"a-modal\" data-a-modal=\"{&quot;width&quot;:&quot;450&quot;,&quot;header&quot;:&quot;Promotion Details&quot;,&quot;url&quot;:&quot;/gp/promotions/details/popup/A22GMJD9STOZSV&quot;,&quot;height&quot;:&quot;600&quot;}\">\n            <a href=\"javascript:void(0)\" class=\"a-popover-trigger a-declar

In [35]:
get_reviews_query = """
  {
    Get{
      Review(
        nearText: {
          concepts: ["excellent condition"],
          certainty: 0.7
        }
      ){
        reviewText, summary, reviwerName, overall
      }
    }
  }
  """


In [36]:
query_result = client.query.raw(get_reviews_query)
print(query_result)



In [5]:
get_product_query = """
{
    Get{
      Product(
        nearText: {
          concepts: ["Nook"],
          certainty: 0.7
        }
      ){
        description, title
      }
    }
  }
  """

In [6]:
query_result = client.query.raw(get_product_query)
print(query_result)

{'data': {'Get': {'Product': [{'description': 'The Oliver Cover for the NOOK Simple Touch is designed to protect and safeguard your eBook reader. The Nook simple touch case has a smooth, comfortable synthetic leather exterior. Your Nook is held in place with a simple bar-clip, eliminating the need for straps. This Nook case provides easy access to the charging port and other control buttons. Its interior is lined with stone colored suede and the cover features an embossed logo, which acts as the power on/off button for your Nook. This lets you activate your device without opening the eBook reader cover.', 'title': 'Barnes &amp; Noble Nook 2nd Edition Oliver Cover Black/Cream'}, {'description': "HDTV Adapter Kit for NOOK HD and NOOK HD+ This handy kit enables you to stream content from your NOOK HD or NOOK HD+ to a high-definition TV, via the included adapter and High Speed HDMI Cable. The kit also includes a pass-through that allows you to charge your NOOK while streaming, so there's n

In [7]:
rev_query = """
{
    Get {
        Review(limit: 2) {
        reviewText
        overall
        ofProduct {
            ... on Product {
                title
                description
                }
            }
        }
    }
}
"""

In [8]:
query_result = client.query.raw(rev_query)
print(query_result)

{'data': {'Get': {'Review': [{'ofProduct': [{'description': 'The videosecu TV mount is a mounting solution for most 22"-47" LCD LED Plasma TV and some LED up to 55" with VESA 600x400mm (24"x16"), 400x400mm (16"x16"),600x300mm(24"x12"), 400x200mm (16"x8"),300x300mm (12"x12"),300x200mm(12"x8"),200x200mm (8"x8"),200x100mm (8"x4") mounting hole pattern .Heavy gauge steel construction provides safety loading up to 66lbs display .It can tilt 15 degree forward or backward and swivel 180 degree. The removable VESA plate can be taken off for easy installation. Post-installation level adjustment allows the TV to perfectly level. The on arm cable management ring system design, guides wires and prevent cable pinching. Standard hardware and user manual included. <br />Notice: This mount fits most 22-47" TVs (VESA 200x200mm (8x8")/200x100mm(8x4")/100x100mm(4x4") without extender, fits VESA 600x400(24x16")/400x400mm(16x16")/600x300mm(24x12")/400x200mm(16x8")/300x300mm(12x12")/300x200mm(12x8")with 4 p

In [None]:
VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH

In [9]:
txt1 = 'VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH'


In [12]:
prod_filter_query = """
{
    Get 
    {
        {
        operator: Equal
        path:["title"]
        valueString: 'VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH'
        }
    
}
"""

In [32]:
where_filter_1 = {
        'operator': 'Equal',
        'path': "title",
        'valueText': 'VideoSecu 24 Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22 to 55 LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH'
        }
        

In [33]:
query_result = client.query.get("Product", ["title", "description"]).with_where(where_filter_2).do()
print(query_result)

{'data': {'Get': {'Product': [{'description': 'The videosecu TV mount is a mounting solution for most 22"-47" LCD LED Plasma TV and some LED up to 55" with VESA 600x400mm (24"x16"), 400x400mm (16"x16"),600x300mm(24"x12"), 400x200mm (16"x8"),300x300mm (12"x12"),300x200mm(12"x8"),200x200mm (8"x8"),200x100mm (8"x4") mounting hole pattern .Heavy gauge steel construction provides safety loading up to 66lbs display .It can tilt 15 degree forward or backward and swivel 180 degree. The removable VESA plate can be taken off for easy installation. Post-installation level adjustment allows the TV to perfectly level. The on arm cable management ring system design, guides wires and prevent cable pinching. Standard hardware and user manual included. <br />Notice: This mount fits most 22-47" TVs (VESA 200x200mm (8x8")/200x100mm(8x4")/100x100mm(4x4") without extender, fits VESA 600x400(24x16")/400x400mm(16x16")/600x300mm(24x12")/400x200mm(16x8")/300x300mm(12x12")/300x200mm(12x8")with 4 plate extender)

In [34]:
where_filter_2 = {
        'operator': 'Equal',
        'path': "onProduct.title",
        'valueText': 'VideoSecu 24 Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22 to 55 LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH'
        }

In [35]:
query_result2 = client.query.get("Review", ["reviewText", "overall"]).with_where(where_filter_2).do()
print(query_result2)

{'data': {'Get': {'Review': None}}, 'errors': [{'locations': [{'column': 6, 'line': 1}], 'message': 'could not extract filters: invalid \'path\' field for filter \'{"operator":"Equal","path":["onProduct.title"],"valueText":"VideoSecu 24 Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22 to 55 LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH"}\': Expected a valid property name in \'path\' field for the filter, but got \'onProduct.title\'', 'path': ['Get', 'Review']}]}
