This notebook uses pre-processed data (in pickle format) and
loads the same into weaviate (and/or other) vector databases.

Initial experiments are on weaviate.

In [None]:
!pip install opencv-python
!pip install matplotlib
!pip install ipyplot
!pip install Pillow

In [1]:
import pickle
import _pickle as cPickle
import numpy as np
import json
import urllib.request

import matplotlib.pyplot as plt
#from PIL import Image
#from io import BytesIO

#import cv2
import uuid
import base64
#import ipyplot


Read in data from pre-processed files. (instead of reading from local storage attached to the VM, we can also directly read from the storage bucket)

In [2]:
# load metadata file - written using pickle
prod_meta_fn = 'proc_fashion_meta_data_june9.pkl'

with open(prod_meta_fn, 'rb') as f:
    fashion_meta_list = pickle.load(f)

In [3]:
# load reviews data - file created using _pickle 

prod_review_fn = 'proc_fashion_review_data_june9.pkl'

with open(prod_review_fn, 'rb') as f1:
    fashion_rev_list = cPickle.load(f1)

In [4]:
print ( 'num of products ...   ', len(fashion_meta_list))
print (' num of reviews ....  ', len(fashion_rev_list)) 

num of products ...    6797
 num of reviews ....   881898


Create Weaviate Schema -- for the time being declaring only a limited number
of fields as 'text' and rest as 'string'. Only text data gets vectorized.

In [5]:
import sys
#!{sys.executable} -m pip install weaviate
!{sys.executable} -m pip install weaviate-client

Defaulting to user installation because normal site-packages is not writeable


In [26]:
# initialize weaviate client -- FOR TEXT SEARCH
import weaviate

# for sandbox 
#weaviate_url = ' https://mlopssearchtest.semi.network'
#secret = weaviate.AuthClientPassword("sridiyer@gmail.com", "xxx")

weaviate_url = 'http://34.67.249.252:8080/'
secret = weaviate.AuthClientPassword("admin", "admin")

# Initiate the client with the secret
client = weaviate.Client(weaviate_url, secret)

**IMAGE SERARCH**

we can combine both for the time being following for image search

In [6]:
import weaviate
# IP will change
weaviate_url = 'http://34.67.249.252:8080/'
secret = weaviate.AuthClientPassword("admin", "admin")
client = weaviate.Client(weaviate_url, secret)

following schema has image data encoded in base64 (string) format
we have a second variable for path
should we also have a description of the image (for additional search terms etc)

In [60]:
product_class_schema = {
    "class": "Product",
    "description" : "A Product class to store the product information and its reviewers",
    "properties" : [
                 {
                    "name": "asin",
                    "dataType": ["string"],
                    "description": "product id from amazon",
                },
                {
                    "name": "title",
                    "dataType": ["text"],
                    "description": "The title of the product",
                },
                {
                    "name": "description",
                    "dataType": ["text"],
                    "description": "The product description",
                },
                {                
                    "name": "brand",
                    "dataType": ["string"],
                    "description": "The product brand",
                },
                {                
                    "name": "feature",
                    "dataType": ["text[]"],
                    "description": "The product features",
                },                
                {
                    "name": "hasReviews",
                    "dataType": ["Review"],
                    "description": "List of reviews this product listing has ",
                }                   
    ]
}

In [61]:
review_class_schema = {
    "class": "Review",
    "description": "An Review class to store the review information",
    "properties": [
                {
                    "name": "overall",
                    "dataType": ["number"],
                    "description": "overall score from user ",
                },
                {
                    "name": "reviewText",
                    "dataType": ["text"],
                    "description": "review text",
                },
                {
                    "name": "summary",
                    "dataType": ["text"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "productasin",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "reviewerid",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "reviwerName",
                    "dataType": ["string"],
                    "description": "reviewer provided summary of review",
                },
                {
                    "name": "ofProduct",
                    "dataType": ["Product"],
                    "description": "The product of the review",
                } 
    ]
}

In [62]:
image_class_schema = {
    "class": "Image",
    "description": "Image dataset for testing. name holds image name etc.",
    "properties": [
        {
            "name": "image",
            "description": "Image data - vector data. name included in path",
            "dataType": ["blob"]
        },
        {
            "name": "name",
            "description": "image name, path etc. to locate the image",
            "dataType": ["string"]
            
        },
        {
            "name": "ofProduct",
            "dataType": ["Product"],
            "description": "The product info of the image",
                } 
    ],
        "vectorIndexType": "hnsw",
        "moduleConfig": {
            "img2vec-neural": {
                "imageFields": [
                    "image"
                ]
            }
        },
        "vectorizer": "img2vec-neural"
    }

In [63]:
# helper function
def prettify(json_dict): 
    print(json.dumps(json_dict, indent=2))

In [74]:
# create image class
#client.schema.create_class(product_class_schema)
client.schema.create_class(review_class_schema)

client.schema.create_class(image_class_schema)

prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Product",
      "description": "A Product class to store the product information and its reviewers",
      "invertedIndexConfig": {
        "cleanupIntervalSeconds": 60
      },
      "moduleConfig": {
        "text2vec-contextionary": {
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "string"
          ],
          "description": "product id from amazon",
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "asin"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The title of the product",
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "title"
       

In [13]:
prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Product",
      "description": "A Product class to store the product information and its reviewers",
      "invertedIndexConfig": {
        "cleanupIntervalSeconds": 60
      },
      "moduleConfig": {
        "text2vec-contextionary": {
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "string"
          ],
          "description": "product id from amazon",
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "asin"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The title of the product",
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "title"
       

In [72]:
## DO NOT USE - if we make a mistake, we can delete and rerun.
schema = client.schema.get() # save schema
client.schema.delete_all() # delete all classes
prettify(client.schema.get())

{
  "classes": []
}


dir_path = '/home/sridiyer/raw_data/imagesDir/**Load Data**
Schema Created - now populate individual objects into the database.

need to get image data, convert to base64 string format and then load

dir

**SAMPLE QUERIES**


In [80]:
prettify(client.data_object.get('37fc9284-09e1-4251-ba21-b8946a4aca4c', with_vector=False))

{
  "class": "Image",
  "creationTimeUnix": 1654733346641,
  "id": "37fc9284-09e1-4251-ba21-b8946a4aca4c",
  "lastUpdateTimeUnix": 1654733346641,
  "properties": {
    "image": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAA

** Create a test image from saved data **

In [125]:
def save_test_img(idx):
    single_img = np.array(img[idx])
    single_img_reshaped = np.transpose(np.reshape(single_img,(3, 32,32)), (1,2,0))
    pil_img = Image.fromarray(single_img_reshaped)
    pil_img.save('test4.jpg', format="JPEG")
    print ('test image label ', labels[idx])
    print ('test image name  ', names[idx])
    

In [126]:
save_test_img(6050)

test image label  2
test image name   b'struthio_camelus_s_000810.png'


In [127]:
nearImage = {
        "image": 'test4.jpg',
    }


In [128]:
res = client.query.get(
        "Image", ["name", "label"]
    ).with_near_image(
        nearImage, encode=True
    ).with_limit(3).do()

In [129]:
print(res)

{'data': {'Get': {'Image': [{'label': '2', 'name': "b'bird_s_000650.png'"}, {'label': '6', 'name': "b'bufo_marinus_s_000444.png'"}, {'label': '3', 'name': "b'tabby_cat_s_001397.png'"}]}}}


In [None]:
for ind_prod in fashion_meta_list:
    img_url = ind_prod['imgUrl']
    prod_asin = ind_prod['asin']
    file_prefix = 'img-'+str(prod_asin)+'.jpg'
    save_fn = dir_dir+file_prefix
    urllib.request.urlretrieve("http://www.gunnerkrigg.com//comics/00000001.jpg", save_fn)
    
    insert_dict = {
        'asin' : prod_asin,
        'title' : ind_prod['title'],
        'description' : ind_prod['description'].replace('\n',' '),
        'brand': ind_prod['brand']
    }

In [12]:
count = 0
for ind_prod in fashion_meta_list:
    print (ind_prod)
    count += 1
    if count == 5:
        break

{'asin': '1940280001', 'title': 'Magical Things I Really Do Do Too!', 'description': 'For the professional or amateur magician.  Routines include: The Thought Transmitter (An accidental USAF experiment that became a hit at the Denver Playboy Club and continues through today!), Three Card Match (The trick that fooled Dai Vernon), Numbo-Jumbo (A stack of 85 cards containing random 2 and 4 digit numbers is apparently memorized), The Binary Box (Light the light  even rocket scientists could not figure it out!), Kryptonite! (Anyone can be a superhero and save the day!), Laundry Quandary (Where did the missing sock go?), Comedy Birthday Cake Production Routine (The funniest and most memorable part of any kids birthday party.), Dazzling Divination! (A pocket mentalism effect that is always ready to go. If you do safety magic, heres a strong effect that packs real small.) and Sweet Sixteen (A great commercial effect that really gets your customers message across.)\n\nIncluded in his latest boo

In [75]:
import uuid
dir_path = '/home/sridiyer/raw_data/amznFashionImgs/'
loaded_products = {} # list of dict
skip_count = 0
test_count = 0
for ind_prod in fashion_meta_list:
    skip_count += 1
    if skip_count < 10:
        continue
    prod_uuid = str(uuid.uuid4())
 
    img_url = ind_prod['imgUrl']
    prod_asin = ind_prod['asin']
    file_name = 'img-'+str(prod_asin)+'.jpg'
    save_fn = dir_path+file_name
    urllib.request.urlretrieve(img_url, save_fn)
    
    insert_dict = {
        'asin' : prod_asin,
        'title' : ind_prod['title'],
        'description' : ind_prod['description'].replace('\n',' '),
        'brand': ind_prod['brand']
    }
    
    resp1 = client.data_object.create(
        data_object=insert_dict,
        class_name='Product',
        uuid=prod_uuid 
    )
    

    encoded_image = weaviate.util.image_encoder_b64(f"/home/sridiyer/raw_data/amznFashionImgs/img-{prod_asin}.jpg")
    data_properties = {
        "name": file_name,
        "image": encoded_image
        
    }
    
    img_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, "Image" + dir_path+file_name)
    resp2 = client.data_object.create(
        data_object=data_properties,
        class_name='Image',
        uuid=img_uuid
    )
    
    ## image -> product
    client.data_object.reference.add(
        from_uuid=str(resp2),
        from_property_name='ofProduct',
        to_uuid=resp1
    )
    
    loaded_products [prod_asin] = resp1
    
    # for debugging purpsoe only
    test_count += 1
    if test_count == 500:
        break

ObjectAlreadyExistsException: f27e7b09-9811-545d-9362-b788f8cff9d5

In [76]:
print (test_count)


39


In [77]:
# reviews load
# keep track of asins of products already loaded - from reviews
# this list may or may not be needed
loaded_reviews_asin = [] 

rev_count = 0
# this list is from product load - previous step (not rev load)
prods_already_loaded = loaded_products.keys() # list of asin ids
review_prod_xref = {} 

for ind_rev in fashion_rev_list:
    
    if not ind_rev['productasin'] in prods_already_loaded:
        continue
        
    rev_uuid = uuid.uuid3(uuid.NAMESPACE_URL, "http://google.cnn.com/news/"+str(ind_rev['summary'])+ind_rev['reviewerid'])
    

    resp2 = client.data_object.create(
        data_object=ind_rev,
        class_name='Review',
        uuid=rev_uuid 
    )
    #print ('resp to rev insert ...', resp2, ' prod asin', ind_rev['productasin'])
    
    
    productasin = ind_rev['productasin']
      # add review to product dict
    if productasin in loaded_reviews_asin:
        review_prod_xref[productasin].append(rev_uuid)
    else:
        review_prod_xref[productasin] = [ rev_uuid ]
    loaded_reviews_asin.append(productasin)
    
    rev_count += 1
    if rev_count == 50000:
        break


ObjectAlreadyExistsException: acfe1c99-78de-33fe-a4a1-aac78d428763

In [78]:
print (rev_count)

175


In [79]:
prettify(client.data_object.get('acfe1c99-78de-33fe-a4a1-aac78d428763', with_vector=False))


{
  "class": "Review",
  "creationTimeUnix": 1654829599980,
  "id": "acfe1c99-78de-33fe-a4a1-aac78d428763",
  "lastUpdateTimeUnix": 1654829599980,
  "properties": {
    "overall": 1,
    "productasin": "B0000AWRIF",
    "reviewText": "Not only the sizes are too big according to the provided chart but the rings were all scratched up.  I called Customer Service, Amazon stated that all they can do is to email the Jewelry store and the store will get back to me in 3-4 business days.  Well, it has been a week and I still haven't received any email.  I returned the rings and have to placed another order and put out another 1K (because we were desperate.  Our wedding is the following week!)  My husband went 1 size down & I went 1  size down from the original chart.  We have to pay $12.00 for 1-2 day rush order.  The new rings come and once again...they were all SCRATCHED up and dull looking.  I am so disappointed in Amazon!  You had lost a valuable customer.",
    "reviewerid": "A3PU69IOBTN9E

In [80]:
## ISSUE 1 - product to reviews - one product has multiple reviews. reference do not
## appear to accept (or we may have the schema wrong) list of review IDs.
## i am using one review here - this needs to be debugged

for (k, v) in review_prod_xref.items():

    prod_uuid = str(loaded_products[k])

    # add references
    ## Product to Reviews
    client.data_object.reference.add(
        from_uuid=prod_uuid,
        from_property_name='hasReviews',
        to_uuid=str(v[0])
    )
    ## reviews -> product
    for irev in v:
        client.data_object.reference.add(
            from_uuid=str(irev),
            from_property_name='ofProduct',
            to_uuid=prod_uuid
      )


UnexpectedStatusCodeException: Add property reference to object! Unexpected status code: 422, with response body: {'error': [{'message': "Could not find property 'hasReviews': no such prop with name 'hasReviews' found in class 'Product' in the schema. Check your schema files for which properties in this class are available"}]}

In [85]:
pwd

'/home/sridiyer/raw_data/amznFashionImgs'

[0m[01;35mimg-1940280001.jpg[0m  [01;35mimg-B0000BUWIZ.jpg[0m  [01;35mimg-B0000EVWYO.jpg[0m  [01;35mimg-B00023JWH2.jpg[0m
[01;35mimg-9654263246.jpg[0m  [01;35mimg-B0000BUWK5.jpg[0m  [01;35mimg-B0000EVX1Q.jpg[0m  [01;35mimg-B00023JWIG.jpg[0m
[01;35mimg-B00004T3SN.jpg[0m  [01;35mimg-B0000BUXB5.jpg[0m  [01;35mimg-B0000EVXVG.jpg[0m  [01;35mimg-B00023JX9Y.jpg[0m
[01;35mimg-B00007GDFV.jpg[0m  [01;35mimg-B0000BYA3N.jpg[0m  [01;35mimg-B0000EVXVQ.jpg[0m  [01;35mimg-B00023K7J4.jpg[0m
[01;35mimg-B0000AWOBT.jpg[0m  [01;35mimg-B0000CDVVL.jpg[0m  [01;35mimg-B0000ZH45Y.jpg[0m  [01;35mimg-B0002KBQ0Q.jpg[0m
[01;35mimg-B0000AWPJE.jpg[0m  [01;35mimg-B0000EVVOU.jpg[0m  [01;35mimg-B000162XEE.jpg[0m  [01;35mimg-B0002TKBSU.jpg[0m
[01;35mimg-B0000AWRIF.jpg[0m  [01;35mimg-B0000EVVP4.jpg[0m  [01;35mimg-B00018I6YI.jpg[0m  [01;35mimg-B0002V2T6A.jpg[0m
[01;35mimg-B0000AWTWR.jpg[0m  [01;35mimg-B0000EVVPY.jpg[0m  [01;35mimg-B00018I6Z2.jpg[0m  [01;35mimg

In [92]:
nearImage = {
    "image" :'/home/sridiyer/raw_data/amznFashionImgs/img-B0000AWVWC.jpg'
}

In [93]:
res = client.query.get(
        "Image", ["name"]
    ).with_near_image(
        nearImage, encode=True
    ).with_limit(3).do()

In [94]:
print (res)

{'data': {'Get': {'Image': [{'name': 'img-B0000EVVQI.jpg'}, {'name': 'img-B0000CDVVL.jpg'}, {'name': 'img-B0000AWVWC.jpg'}]}}}
