## Imports

In [1]:
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler

from PIL import Image
from PIL.ExifTags import TAGS

import numpy as np
from pydantic import BaseModel, condecimal, AnyUrl
from typing import Optional
from requests_html import HTMLSession, HTML
import scrapy

from typing import Optional, Any

import os
import time
import random
import base64
import json
import pprint

In [3]:
# Bootstrap Django through the project's own `django_setup` helper before any
# later cell runs. NOTE(review): 'core' is presumably the Django project /
# settings package name -- confirm against django_setup.init_django.
from django_setup import init_django
init_django('core')


ModuleNotFoundError: No module named 'autogen'

## Knifekits crawling

In [None]:
def read_local_sitemap(file='scrapshop/smproducts.xml'):
    """Parse a locally stored sitemap into ``(url, last_modified)`` tuples.

    @param file: path of the sitemap XML file on disk
    @return: list pairing every ``<loc>`` text with the ``<lastmod>`` text at
             the same position; pairing is purely positional (``zip``), so it
             stops at the shorter of the two result lists
    """
    with open(file) as handle:
        document = HTML(html=handle.read())

    locations = document.xpath('//loc/text()')
    modified = document.xpath('//lastmod/text()')
    return list(zip(locations, modified))

# Module-level list of (url, last_modified) tuples read from the default
# local sitemap file; later statements index into it to pick pages to parse.
urls = read_local_sitemap()

# XPath selectors for values that occur once per product page. parse_page
# evaluates each of them with ``first=True``.
fields = dict(
    link='//*[@rel="canonical"]/@href',
    sku='//span[@itemprop="model"]/descendant-or-self::text()',
    name='//h1/descendant::span[@itemprop="name"]/text()',
    main_image='//div[@class="piGalMain"]/img/@src',
    products_id='//input[@name="products_id"]/@value',
    title="/html/head/title/text()",
    keywords='.//meta[@name="keywords"]/@content',
    short_desc='//meta[@name="description"]/@content',
    price='.//*[@itemprop="price"]/text()',
    # The next two select whole elements rather than text or attributes.
    description='//div[@itemprop="description"]',
    manufacturer='//*[@itemprop="manufacturer"]',
)

# XPath selectors for values that may occur several times per product page.
# parse_page keeps the full result list for each of them.
list_fields = dict(
    image_urls='//*[@class="thumbnail"]/@data-image',
    breadcrumbs='.//*[@class="breadcrumb"]/descendant-or-self::text()',
    discount_tiers='.//*[@class="DiscountPriceQty"]/descendant-or-self::text()',
    discount_amount='.//*[@class="DiscountPrice"]/descendant-or-self::text()',
)

def parse_page(page_response=None):
    """Extract the product fields from one product page.

    @param page_response: either a response object exposing ``.html.xpath``
                          or a URL string starting with ``http``, which is
                          fetched first with a fresh ``HTMLSession``
    @return: dict with one entry per key of the module-level ``fields``
             (first XPath match only) and ``list_fields`` (all matches), or
             ``None`` when no page was supplied
    """
    # Guard against a missing argument before touching the network.
    if page_response is None:
        print("need a page to parse...")
        return None

    # Only open a session when a URL actually has to be downloaded.
    if isinstance(page_response, str) and page_response.startswith('http'):
        page_response = HTMLSession().get(page_response)

    item = {}
    for k, v in fields.items():
        item[k] = page_response.html.xpath(v, first=True)
    for k, v in list_fields.items():
        item[k] = page_response.html.xpath(v)

    return item

# Smoke test: parse one sitemap entry (index 111) and dump every extracted
# field as "<key> - <value>".
item = parse_page(urls[111][0])
for k in item:
    print(f"{k} - {item[k]}")


In [None]:

def fetch_sitemap(url="https://knifekits.com/vcom/smproducts.xml", session=None):
    """Download the product sitemap.

    @param url: sitemap location
    @param session: object with a ``get(url)`` method to reuse; a new
                    ``HTMLSession`` is created when omitted
    @return: whatever ``session.get(url)`` returns
    """
    if session is None:
        session = HTMLSession()
    return session.get(url)


def parse_map(response=None):
    """Turn a sitemap response into ``(url, last_modified)`` tuples.

    @param response: object exposing ``.html.xpath``; when omitted the
                     sitemap is downloaded via ``fetch_sitemap()``
    @return: list pairing each ``<loc>`` text with the ``<lastmod>`` text at
             the same position
    """
    source = fetch_sitemap() if response is None else response
    locations = source.html.xpath('//loc/text()')
    modified = source.html.xpath('//lastmod/text()')
    return list(zip(locations, modified))


def fetch(url, session=None):
    """Issue a GET request for *url*.

    @param url: address to request
    @param session: object with a ``get(url)`` method to reuse; a new
                    ``HTMLSession`` is created when omitted
    @return: whatever ``get(url)`` returns
    """
    client = session if session is not None else HTMLSession()
    return client.get(url)


def cleanup_description(s):
    """ Takes a string and collapses every run of whitespace into one space.

    Newlines, carriage returns, tabs and non-breaking spaces all count as
    whitespace; leading and trailing whitespace is dropped.

    @param s: Any string (or None)
    @return: Cleaned up string, or None when s is empty or None
    """
    if not s:
        return None
    # str.split() with no separator splits on any Unicode whitespace
    # (including '\xa0') and discards empty pieces, so no regex is needed.
    return ' '.join(s.split())


def parse_page(page_response=None):
    """Extract the product fields from one product page.

    @param page_response: either a response object exposing ``.html.xpath``
                          or a URL string starting with ``http``, which is
                          fetched first
    @return: dict with one entry per key of the module-level ``fields``
             (first XPath match only) and ``list_fields`` (all matches), or
             ``None`` when no page was supplied
    """
    session = HTMLSession()
    if page_response is None:
        print("need a page to parse...")
        return None

    is_url = isinstance(page_response, str) and page_response.startswith('http')
    if is_url:
        page_response = session.get(page_response)

    document = page_response.html
    # Single-valued selectors first, then the multi-valued ones.
    item = {key: document.xpath(query, first=True) for key, query in fields.items()}
    for key, query in list_fields.items():
        item[key] = document.xpath(query)
    return item


def cleanup_item(item=None):
    """Normalise a parsed product item in place and return it.

    - ``description`` is replaced by the element's ``full_text``
    - ``main_image`` and every entry of ``image_urls`` get the shop base URL
      prepended
    - ``breadcrumbs`` keeps every second entry starting at index 4
    - ``discount_tiers`` / ``discount_amount`` entries are stripped of
      surrounding whitespace

    @param item: dict with the keys listed above, as built by ``parse_page``
    @return: the same dict object after modification, or None when no item
             was supplied
    """
    if item is None:
        print("Need iutem!")
        return None

    base_url = "https://knifekits.com/vcom/"

    # A field is None when its XPath had no match (requests-html returns None
    # for ``first=True`` without a result); keep it None instead of crashing.
    description = item['description']
    item['description'] = None if description is None else description.full_text

    main_image = item['main_image']
    item['main_image'] = None if main_image is None else base_url + main_image

    item['image_urls'] = [base_url + img for img in item['image_urls']]
    item['breadcrumbs'] = item['breadcrumbs'][4::2]

    # Strip every entry regardless of how many tiers there are, so that a
    # single-tier product is cleaned the same way as a multi-tier one.
    item['discount_tiers'] = [x.strip() for x in item['discount_tiers']]
    item['discount_amount'] = [x.strip() for x in item['discount_amount']]
    return item


In [2]:
def load_data(file="../KK.json"):
    """Load and return the JSON document stored in *file*.

    @param file: path of the JSON file to read
    @return: the decoded JSON value
    """
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding, which would garble non-ASCII text on some systems.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

# Load the two JSON datasets that the following cells clean and print.
kk, hs = load_data("../KK.json"), load_data("../HS.json")




In [3]:
def cleaner(data):
    """Replace every '\\r\\n\\t' sequence in each item's description with a newline.

    The items themselves are modified in place; the returned list is a new
    list holding those same item objects in their original order.

    @param data: iterable of dicts that each carry a string 'description'
    @return: list of the (mutated) items
    """
    def _normalise(entry):
        entry['description'] = entry['description'].replace('\r\n\t', '\n')
        return entry

    return [_normalise(entry) for entry in data]
# Clean both datasets. cleaner() mutates the items in place, so the dicts in
# x / y are the same objects as those in kk / hs.
x,y = cleaner(kk), cleaner(hs)


In [4]:
# Show the cleaned description of the first record of each dataset.
print(x[0]['description'])
print(y[0]['description'])


<div itemprop="description">
		  <p>
<span style="font-size:12px;"><span style="font-family:verdana,geneva,sans-serif;">The Lady Slipper is a 5.25 inch overall length single-bladed slipjoint folding knife design. It features a 2.20 inch blade length with a 1.80 inch cutting edge. This design features pre-mounted 410 stainless steel bolsters on the front and rear of the handle, and offers a 2.0 x 0.75 inch inlay area on each side for handle treatments. The Lady Slipper parts kit includes everything need to assemble this design, with the exception of handle material. Handle mounting pins are included. This kit model also includes 2 pieces of shim stock to aide in construction.<br>
<br>
The Lady Slipper is a USA design, and comes in an elegant gift box. This model offers plenty of builder challenge, and will produce an heirloom quality pocket knife with proper construction and finishing.<br>
<br>
<u><b>Specifications</b></u><br>
Model: Lady Slipper<br>
Type: Slipjoint Folder<br>
Overall L

## Image Utilities

#### Function that searches a folder for image files and stacks them into a single NumPy array

In [None]:
# Google crawler: one feeder thread, one parser thread and four download
# threads; downloads are stored under images/google.
google_crawler = GoogleImageCrawler(
    storage={'root_dir': 'images/google'},
    feeder_threads=1,
    parser_threads=1,
    downloader_threads=4,
)

# Search filters, referenced only by the commented-out Google crawl call.
filters = {
    'size': 'large',
    # 'color': 'orange',
    'license': 'commercial,modify',
    'date': ((2000, 1, 1), (2023, 1, 1)),
}

keywords = ""

# Bing crawler: four download threads; downloads are stored under images/bing.
bing_crawler = BingImageCrawler(
    downloader_threads=4,
    storage={'root_dir': "images/bing"},
)
# google_crawler.crawl(keyword=k,filters=filters,offset=0,max_num=25,min_size=(400,400),max_size=None, file_idx_offset=0)
# bing_crawler.crawl(keyword=k, filters=None, offset=0, max_num=25)
# baidu_crawler.crawl(keyword=k, offset=0, max_num=25, min_size=(400,400), max_size=None)

In [None]:
def create_imgs_matrix(directory, px_size=50):
    """Load every image file in *directory* into one vertically stacked array.

    Each file that ``imghdr`` recognises as an image is decoded with OpenCV,
    reduced to three channels, resized to ``px_size`` x ``px_size`` and
    stacked along axis 0. The names of the files that were loaded are stored,
    in stacking order, in the module-level ``image_files`` list (which is
    reset on every call).

    NOTE(review): this function uses ``cv2`` and ``imghdr``, neither of which
    is imported in the visible part of the file -- confirm they are imported
    elsewhere. ``imghdr`` was also removed from the standard library in
    Python 3.13.

    @param directory: folder to scan (sub-folders are skipped, not descended)
    @param px_size: edge length in pixels every image is resized to
    @return: array of shape (n * px_size, px_size, 3) for n loaded images;
             an empty (0, px_size, 3) uint8 array when nothing was loaded
    """
    global image_files
    image_files = []
    resized = []

    for filename in os.listdir(directory):
        # os.path.join works whether or not `directory` ends with a separator.
        path = os.path.join(directory, filename)

        # Skip sub-folders and anything that is not a recognised image format.
        if os.path.isdir(path) or not imghdr.what(path):
            continue

        # np.fromfile + imdecode instead of cv2.imread; imdecode returns
        # something other than an ndarray when decoding fails.
        img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
        if not isinstance(img, np.ndarray):
            continue

        if img.ndim == 2:
            # Grayscale: replicate the single channel so every image is 3-channel.
            img = np.stack((img,) * 3, axis=-1)
        else:
            # Drop any channel beyond the first three (e.g. alpha).
            img = img[..., 0:3]

        img = cv2.resize(img, dsize=(px_size, px_size), interpolation=cv2.INTER_CUBIC)
        resized.append(img)
        image_files.append(filename)

    if not resized:
        return np.empty((0, px_size, 3), dtype=np.uint8)
    # One concatenation at the end instead of re-copying the matrix per image.
    return np.concatenate(resized)