In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from utils.utils import get_product_name_from_url, load_ner_data_file
from config import URL_LIST_FILE_PATH, URL_LIST_WITH_STATUS_FILE_PATH

## Original URLs

In [3]:
# Load the original URL list and display its (rows, columns) shape.
# NOTE: `urls_df` is rebound in the "URL Functionality Check" section, so this
# frame is only used for the shape shown here.
urls_df = pd.read_csv(URL_LIST_FILE_PATH)
urls_df.shape

(704, 1)

## URL Functionality Check

In [4]:
# Load the URL list that carries a `status_code` column (this rebinds `urls_df`).
# NOTE(review): the preview further down shows an "Unnamed: 0" column, which
# suggests the CSV was written with its index -- consider index_col=0; confirm.
urls_df = pd.read_csv(URL_LIST_WITH_STATUS_FILE_PATH)

In [5]:
# Keep only the rows whose status code is 200. The column is compared against
# the *string* "200". Taking a copy lets later cells add columns to the subset
# without touching `urls_df`.
is_status_200 = urls_df["status_code"].eq("200")
good_urls_df = urls_df.loc[is_status_200].copy()
n_status_200_urls = len(good_urls_df)

print(f"Number of urls with status code 200: {n_status_200_urls}")

Number of urls with status code 200: 280


## Extracting Product Name From URL

In [6]:
# Derive a product name from every URL and store it in a new "product" column.
# Presumably the helper yields a missing value when no name can be extracted
# (the next cell drops such rows) -- verify against utils.utils.
product_names_from_urls = good_urls_df["url"].apply(get_product_name_from_url)
good_urls_df["product"] = product_names_from_urls

In [7]:
# Drop only the rows for which no product name could be extracted.
# A bare dropna() would also discard rows that have a missing value in any
# other, unrelated column, which would silently skew the extraction statistics
# reported below.
good_urls_df = good_urls_df.dropna(subset=["product"])
good_urls_df.head()

Unnamed: 0,url,status_code,product
0,https://www.factorybuys.com.au/products/euro-t...,200,euro top mattress king
1,https://dunlin.com.au/products/beadlight-cirrus,200,beadlight cirrus
2,https://themodern.net.au/products/hamar-plant-...,200,hamar plant stand ash
3,https://furniturefetish.com.au/products/oslo-o...,200,oslo office chair white
6,https://interiorsonline.com.au/products/interi...,200,interiors online gift card


In [8]:
# Extraction rate: rows that still carry a product name after dropping the
# missing ones, relative to all URLs with status code 200.
n_product_names_in_urls = len(good_urls_df)

print(f"{n_product_names_in_urls} extracted products from {n_status_200_urls} urls;")
print(f"Or {n_product_names_in_urls / n_status_200_urls:.2%} of all urls with status code 200")

173 extracted products from 280 urls;
Or 61.79% of all urls with status code 200


## Determining Element Classes
Using the product name, we can find the HTML elements that contain product information.

In [9]:
# Load the NER dataset. As used below, it is a mapping keyed by product URL
# whose values expose at least "product_classes" and "product_names".
dataset_dict = load_ner_data_file()

In [10]:
# Gather every distinct class name that appears in any entry's
# "product_classes" list across the whole dataset.
all_product_classes = {
    product_class
    for entry in dataset_dict.values()
    for product_class in entry["product_classes"]
}

In [11]:
# Report how many distinct product classes were collected across all entries.
print(f"Number of unique product classes: {len(all_product_classes)}")

Number of unique product classes: 170


In [12]:
# Inspect a single dataset entry as a worked example; the URL is named once
# and the entry is looked up a single time.
example_url = "https://dunlin.com.au/products/beadlight-cirrus"
example_entry = dataset_dict[example_url]

product_classes = example_entry["product_classes"]
product_names = example_entry["product_names"]

In [13]:
# Show the classes and product names recorded for the example entry.
print(f"Product classes: {product_classes}")
print(f"Product names: {product_names}")

Product classes: ['product__title', 'product__title__wrapper']
Product names: ['beadlight cirrus led reading light']
