In [None]:
!pip install seqeval
!pip install datasets
!pip install evaluate
!pip install optuna
!pip install tempfile
!pip install shutil
!pip install aiohttp beautifulsoup4 pandas nltk spacy

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=8043c290a927a0382b8a0a4ce02559ebb8e6379300c74909dc0795ef5290d8e8
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Download the small English spaCy pipeline; the setup cell loads it with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import necessary libraries
# Standard library
import asyncio
import json
import logging
import multiprocessing
import os
import random
import re
import shutil
import tempfile
import traceback
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool, cpu_count, set_start_method
from typing import List, Dict, Any
from typing import Tuple

# Third-party
import aiohttp
import nest_asyncio
import nltk
import numpy as np
import optuna
import pandas as pd
import spacy
import yaml
from aiohttp import ClientTimeout, ClientConnectorError, ClientResponseError
from bs4 import BeautifulSoup
from datasets import Dataset
from evaluate import load as load_metric
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split, KFold
from spacy.matcher import PhraseMatcher
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer,
    pipeline, DataCollatorWithPadding, DataCollatorForTokenClassification,
    EarlyStoppingCallback, AutoConfig, BertTokenizerFast, BertForTokenClassification, AdamW
)
from transformers import get_linear_schedule_with_warmup

# PyTorch. These imports must stay after the `datasets` and `transformers`
# imports: the names `Dataset` and `AdamW` are imported twice, and the later
# cells need them to resolve to the PyTorch classes.
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader


# Ensure the NLTK resources used below are available: the stop-word list
# (read by stopwords.words) and WordNet (data for WordNetLemmatizer).
nltk.download('stopwords', quiet=True)
nltk.download('wordnet')

# Set up logging: INFO level, with timestamp and level name in front of every message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_config(config_path: str = 'config.yaml') -> Dict[str, Any]:
    """
    Load and validate the notebook configuration from a YAML file.

    Args:
        config_path (str): Path to the configuration file.

    Returns:
        Dict[str, Any]: Configuration dictionary containing at least the required keys.

    Raises:
        FileNotFoundError: If the configuration file is not found.
        ValueError: If the configuration file is empty, or if its top-level
            YAML node is not a mapping.
        KeyError: If required configuration keys are missing.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    # An explicit encoding keeps parsing independent of the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    if config is None:
        raise ValueError(f"Failed to load configuration from {config_path}. The file might be empty or invalid.")

    # safe_load returns whatever the top-level YAML node is (list, string,
    # number, ...). Without this check a list would be reported as "missing
    # keys" and a scalar would fail with a TypeError in the membership test.
    if not isinstance(config, dict):
        raise ValueError(
            f"Invalid configuration in {config_path}: expected a mapping at the top level, "
            f"got {type(config).__name__}."
        )

    required_keys = ['csv_file', 'num_sites', 'random_seed', 'product_types', 'new_pages_csv', 'output_csv']
    missing_keys = [key for key in required_keys if key not in config]

    if missing_keys:
        raise KeyError(f"Missing required configuration keys: {', '.join(missing_keys)}")

    return config

# Load configuration from the default path ('config.yaml').
# NOTE(review): load_config() raises if that file is missing, and a later cell
# assigns a hand-written dict to the same name `config`.
config = load_config()

# Initialize stop words and lemmatizer (module-level globals read by DataCleaner)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load spaCy model (module-level global `nlp`, read by DataCleaner.lemmatize_text and label_text)
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
class WebCrawler:
    """
    A class to asynchronously crawl websites and fetch their textual content.
    """

    # Total time budget, in seconds, for one request.
    REQUEST_TIMEOUT_SECONDS = 10

    @staticmethod
    async def fetch(session: aiohttp.ClientSession, url: str) -> Dict[str, str]:
        """
        Asynchronously fetches a URL and reduces the response to its text content.

        Failures never propagate: they are logged and reported to the caller as
        an empty 'text' value, so one bad site cannot abort a whole batch.

        Args:
            session (aiohttp.ClientSession): The HTTP session to use for the request.
            url (str): The URL to crawl.

        Returns:
            Dict[str, str]: A dictionary with keys 'website' (the URL) and 'text'
            (the extracted text, or '' when the page could not be fetched).
        """
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        # aiohttp expects a ClientTimeout object; a bare number is a deprecated form.
        timeout = aiohttp.ClientTimeout(total=WebCrawler.REQUEST_TIMEOUT_SECONDS)

        try:
            async with session.get(url, headers=headers, timeout=timeout) as response:
                response.raise_for_status()  # Raises aiohttp.ClientResponseError for 4xx/5xx responses
                text = await response.text()
            soup = BeautifulSoup(text, 'html.parser')
            extracted_text = soup.get_text(separator=' ', strip=True)
            return {'website': url, 'text': extracted_text}
        except aiohttp.ClientError as e:
            logging.error(f"Error accessing {url}: {str(e)}")
        except asyncio.TimeoutError:
            # A timeout is not an aiohttp.ClientError and carries an empty message,
            # so it gets its own branch to produce a meaningful log line.
            logging.error(
                f"Error accessing {url}: request timed out after "
                f"{WebCrawler.REQUEST_TIMEOUT_SECONDS} seconds"
            )
        except Exception as e:
            logging.error(f"Unexpected error crawling {url}: {str(e)}")
        return {'website': url, 'text': ''}


class DataCleaner:
    """
    A class to clean and preprocess textual data extracted from websites.

    All methods are static. ``clean_text`` chains the individual steps for one
    text and ``clean_data`` applies it to a whole DataFrame. The class relies on
    the module-level ``nlp``, ``lemmatizer`` and ``stop_words`` objects.
    """

    @staticmethod
    def remove_header_footer(text: str) -> str:
        """
        Removes the header and footer from the text based on line counts.

        Only texts with more than 20 lines are trimmed; shorter texts are
        returned unchanged.

        Args:
            text (str): The raw text extracted from a website.

        Returns:
            str: The text after removing headers and footers.
        """
        lines = text.split('\n')
        if len(lines) > 20:
            cutoff = max(1, len(lines) // 10)  # Remove first and last 10% of lines
            return '\n'.join(lines[cutoff:-cutoff])
        return text

    @staticmethod
    def remove_contact_info(text: str) -> str:
        """
        Removes contact information such as phone numbers and email addresses from the text.

        Each match is replaced by a single space so neighbouring words stay separated.

        Args:
            text (str): The text to clean.

        Returns:
            str: The text after removing contact information.
        """
        # Remove phone numbers (formats like 123-456-7890, 1234567890)
        text = re.sub(r'\b\d{10}\b|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b', ' ', text)
        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ' ', text)
        return text

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """
        Strips HTML tags from the text more thoroughly.

        Args:
            text (str): The text containing HTML tags.

        Returns:
            str: The text after removing HTML tags and specific attributes.
        """
        # First, use BeautifulSoup to remove all HTML tags
        soup = BeautifulSoup(text, 'html.parser')

        # Remove script and style elements, including their contents
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        # Extract text from remaining tags
        text = soup.get_text(separator=' ', strip=True)

        # Remove specific attributes that might remain
        return DataCleaner.remove_html_attributes(text)

    @staticmethod
    def remove_html_attributes(text: str) -> str:
        """
        Removes specific HTML attributes that might remain after initial cleaning.

        Args:
            text (str): The text to clean.

        Returns:
            str: The text after removing specific HTML attributes.
        """
        # Remove common attributes that might remain
        patterns = [
            r'\btitle="[^"]*"',
            r'\bhref="[^"]*"',
            r'\balt="[^"]*"',
            r'\bclass="[^"]*"',
            r'\bid="[^"]*"',
            r'\bstyle="[^"]*"',
            r'\bsrc="[^"]*"',
            r'\bdata-[a-zA-Z0-9-]+="[^"]*"'
        ]
        for pattern in patterns:
            text = re.sub(pattern, '', text)

        # Remove any remaining HTML-like tags
        text = re.sub(r'<[^>]+>', '', text)

        return text

    @staticmethod
    def remove_special_characters(text: str) -> str:
        """
        Replaces every character that is not an ASCII letter, a digit or
        whitespace with a space.

        Args:
            text (str): The text to clean.

        Returns:
            str: The text after removing special characters.
        """
        return re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    @staticmethod
    def lemmatize_text(text: str) -> str:
        """
        Lemmatizes the text to reduce words to their base forms.

        The text is tokenized with the spaCy pipeline ``nlp`` and each token is
        passed through the NLTK ``lemmatizer``.

        Args:
            text (str): The text to lemmatize.

        Returns:
            str: The lemmatized tokens joined by single spaces.
        """
        doc = nlp(text)
        return ' '.join(lemmatizer.lemmatize(token.text) for token in doc)

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Performs comprehensive cleaning on the text, including removing headers/footers,
        contact info, HTML tags and attributes, special characters, lemmatization, and stop words removal.

        Args:
            text (str): The raw text to clean. Non-string values (for example
                None or NaN from a CSV round trip) are treated as empty.

        Returns:
            str: The cleaned text ('' for non-string input).
        """
        # Nothing to clean; returning '' lets the length filter in clean_data
        # drop the row instead of crashing the whole run.
        if not isinstance(text, str):
            return ''

        # Remove header and footer
        text = DataCleaner.remove_header_footer(text)

        # Remove contact information
        text = DataCleaner.remove_contact_info(text)

        # Remove HTML tags and attributes
        text = DataCleaner.remove_html_tags(text)

        # Remove special characters
        text = DataCleaner.remove_special_characters(text)

        # Convert to lowercase
        text = text.lower()

        # Remove stop words
        text = ' '.join(word for word in text.split() if word not in stop_words)

        # Lemmatize text
        text = DataCleaner.lemmatize_text(text)

        return text.strip()

    @staticmethod
    def clean_data(df: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the data by removing duplicates, applying text cleaning, and filtering based on text length.

        Args:
            df (pd.DataFrame): The DataFrame containing raw website data; must
                have a 'text' column.

        Returns:
            pd.DataFrame: The cleaned DataFrame with an additional 'cleaned_text'
            column and a fresh 0..n-1 index.
        """
        # Drop duplicate texts
        df = df.drop_duplicates(subset=['text']).copy()

        # Apply text cleaning
        df['cleaned_text'] = df['text'].apply(DataCleaner.clean_text)

        # Keep only entries whose cleaned text is longer than 50 characters
        cleaned_df = df[df['cleaned_text'].str.len() > 50]

        # Drop entries whose cleaned text is one unbroken run of letters:
        # str.isalpha() is False as soon as the text contains a space or digit,
        # so only single "words" of more than 50 letters are removed here.
        single_run_of_letters = cleaned_df['cleaned_text'].str.isalpha().fillna(False).astype(bool)
        cleaned_df = cleaned_df[~single_run_of_letters]

        # Reset the index after the last filter so the result has no gaps.
        return cleaned_df.reset_index(drop=True)


async def process_batch(session: aiohttp.ClientSession, urls: List[str]) -> List[Dict[str, str]]:
    """
    Fetches every URL of one batch concurrently.

    Args:
        session (aiohttp.ClientSession): The HTTP session shared by all requests of the batch.
        urls (List[str]): The URLs to fetch.

    Returns:
        List[Dict[str, str]]: One result dictionary per URL (website URL and extracted
        text), in the same order as ``urls``.
    """
    pending_fetches = []
    for url in urls:
        # Calling the coroutine function only creates the coroutine; gather() runs them.
        pending_fetches.append(WebCrawler.fetch(session, url))
    return await asyncio.gather(*pending_fetches)


async def process_all_websites(csv_file: str, batch_size: int = 50) -> pd.DataFrame:
    """
    Processes all websites listed in the CSV file in batches.

    Args:
        csv_file (str): The path to the CSV file containing website URLs in a
            'website' column.
        batch_size (int, optional): The number of websites to process in each batch. Defaults to 50.

    Returns:
        pd.DataFrame: A DataFrame with columns 'website' and 'text' containing the
        crawled data from all websites (empty, but with both columns, when the
        CSV lists no URLs).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Read the CSV file containing website URLs
    df = pd.read_csv(csv_file)
    all_sites = df['website'].dropna().tolist()  # Ensure no NaN URLs
    results = []

    # Ceiling division: `len // batch_size + 1` over-counts by one whenever the
    # number of sites is an exact multiple of the batch size.
    total_batches = (len(all_sites) + batch_size - 1) // batch_size

    async with aiohttp.ClientSession() as session:
        for batch_number, start in enumerate(range(0, len(all_sites), batch_size), start=1):
            batch = all_sites[start:start + batch_size]
            batch_results = await process_batch(session, batch)
            results.extend(batch_results)
            logging.info(f"Processed batch {batch_number} of {total_batches}")

    # Naming the columns keeps them present even when nothing was crawled, so
    # downstream code can still select 'text'.
    return pd.DataFrame(results, columns=['website', 'text'])


def validate_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Runs the validation checks on a cleaned DataFrame and returns the surviving rows.

    Three filters are applied in order: rows with missing values in the
    essential columns are dropped, then rows whose URL does not match the URL
    pattern, then rows whose cleaned text is 50 characters or shorter.

    Args:
        df (pd.DataFrame): The cleaned DataFrame to validate.

    Returns:
        pd.DataFrame: The validated DataFrame with a fresh 0..n-1 index.
    """
    rows_before = len(df)
    logging.info(f"Initial data count: {rows_before}")

    # 1) Essential columns must all be present in a row.
    essential_columns = ['website', 'text', 'cleaned_text']
    df_clean = df.dropna(subset=essential_columns).copy()
    rows_with_gaps = rows_before - len(df_clean)
    if rows_with_gaps > 0:
        logging.warning(f"Dropped {rows_with_gaps} records due to missing values.")

    # 2) The URL must match the pattern below.
    url_pattern = re.compile(
        r'^(https?:\/\/)?' # http:// or https://
        r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;&=+\$,\w]+@)?[A-Za-z0-9.-]+|' # domain...
        r'(?:www\.|[-;&=+\$,\w]+@)[A-Za-z0-9.-]+)' # ...with or without www
        r'((?:\/[\+~%\/.\w\-]*)?\??(?:[-+=&;%@.\w_]*)#?(?:[\w]*))?)'
        r'$'
    )

    def _looks_like_url(site):
        return bool(url_pattern.match(site))

    url_is_valid = df_clean['website'].apply(_looks_like_url)
    rejected = df_clean[~url_is_valid]
    if not rejected.empty:
        logging.warning(f"Dropped {len(rejected)} records due to invalid URLs.")
        df_clean = df_clean[url_is_valid].copy()

    # 3) The cleaned text must be longer than 50 characters.
    long_enough = df_clean['cleaned_text'].str.len() > 50
    df_clean = df_clean[long_enough].reset_index(drop=True)
    logging.info(f"Validated data count: {len(df_clean)}")

    return df_clean


async def main(config: Dict[str, Any]) -> None:
    """
    The main asynchronous function to orchestrate web crawling and data cleaning.

    Crawls every URL of the input CSV, cleans and validates the text, draws a
    reproducible random sample and writes two files:
    'cleaned_furniture_data.csv' (the sample) and 'new_furniture_websites.csv'
    (all validated rows that are not part of the sample).

    Args:
        config (Dict[str, Any]): Configuration dictionary containing 'csv_file'
            and 'random_seed'. The optional 'num_sites' sets the sample size
            (defaults to 100).
    """
    # Process all websites and fetch their content
    all_crawled_df = await process_all_websites(config['csv_file'], batch_size=50)
    logging.info(f"Total websites crawled: {len(all_crawled_df)}")

    # Clean the fetched data
    all_cleaned_df = DataCleaner.clean_data(all_crawled_df)
    logging.info(f"Total cleaned records: {len(all_cleaned_df)}")

    # Perform data validation
    validated_df = validate_data(all_cleaned_df)
    logging.info(f"Total validated records: {len(validated_df)}")

    # Randomly select the configured number of cleaned results. The size comes
    # from the configuration instead of a hard-coded 100 and is capped at the
    # number of rows that are available.
    random.seed(config['random_seed'])
    sample_size = min(config.get('num_sites', 100), len(validated_df))
    cleaned_df = validated_df.sample(n=sample_size, random_state=config['random_seed']).reset_index(drop=True)
    logging.info(f"Number of cleaned results selected: {len(cleaned_df)}")

    # Create unused_df with every validated row whose website is not in the sample
    used_websites = set(cleaned_df['website'])
    unused_df = validated_df[~validated_df['website'].isin(used_websites)].reset_index(drop=True)
    logging.info(f"Number of unused websites: {len(unused_df)}")

    # Save the cleaned and unused data to CSV files
    cleaned_df.to_csv('cleaned_furniture_data.csv', columns=['website', 'text', 'cleaned_text'], index=False)
    logging.info("Saved cleaned data to 'cleaned_furniture_data.csv'.")

    unused_df.to_csv('new_furniture_websites.csv', index=False)
    logging.info("Saved unused websites to 'new_furniture_websites.csv'.")

    # Print summary
    print(f"Number of cleaned results: {len(cleaned_df)}")
    print(f"Number of unused websites: {len(unused_df)}")

# Configuration dictionary
# NOTE(review): this assignment rebinds the module-level `config` that the
# setup cell filled from load_config(). From here on only the three keys below
# exist; the other keys load_config() requires ('product_types',
# 'new_pages_csv', 'output_csv') are no longer available.
config = {
    'csv_file': 'URL_list.csv',  # Path to your input CSV file containing website URLs
    'random_seed': 22,
    'num_sites': 100
}

In [None]:
# Apply nest_asyncio so that asyncio.run() below can be used from inside the notebook.
nest_asyncio.apply()

# Run the crawl -> clean -> validate -> sample pipeline with the `config` dict
# defined in the previous cell.
if __name__ == "__main__":
    asyncio.run(main(config))

ERROR:root:Error accessing https://home-buy.com.au/products/bridger-pendant-larger-lamp-metal-brass: Cannot connect to host home-buy.com.au:443 ssl:default [Name or service not known]
ERROR:root:Error accessing https://beckurbanfurniture.com.au/products/page/2/: Cannot connect to host beckurbanfurniture.com.au:443 ssl:default [Name or service not known]
ERROR:root:Error accessing https://hemisphereliving.com.au/products/: 520, message='', url='https://hemisphereliving.com.au/products/'
ERROR:root:Error accessing https://furnish123watertown.com/products/: Cannot connect to host furnish123watertown.com:443 ssl:True [SSLCertVerificationError: (1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'furnish123watertown.com'. (_ssl.c:1007)")]
ERROR:root:Error accessing https://edenliving.online/collections/summerloving/products/nice-lounge-1: 404, message='Not Found', url='https://edenliving.online/collections/summerloving/products/ni

Number of cleaned results: 100
Number of unused websites: 157


In [None]:
# Product nouns. label_text() tags a match from this list as B-PRODUCT followed
# by I-PRODUCT for any further tokens of the same term.
# NOTE(review): label_text() is applied to `cleaned_text`, in which
# DataCleaner.remove_special_characters has replaced every character outside
# [a-zA-Z0-9\s] with a space, so 'étagère' cannot occur verbatim in that text.
# 'sectional' also appears in furniture_descriptions.
furniture_keywords = [
    'sofa', 'armchair', 'chair', 'table', 'bed', 'bench', 'stool',
    'cabinet', 'desk', 'couch', 'ottoman', 'shelf', 'sideboard',
    'dresser', 'nightstand', 'lounger', 'recliner', 'bookcase',
    'wardrobe', 'chaise longue', 'futon', 'daybed', 'loveseat',
    'sectional', 'coffee table', 'end table', 'console table',
    'dining table', 'buffet', 'hutch', 'credenza', 'vanity',
    'chest of drawers', 'armoire', 'entertainment center',
    'bar stool', 'dining chair', 'office chair', 'bean bag',
    'pouf', 'footstool', 'hammock', 'rocking chair', 'glider',
    'folding chair', 'bunk bed', 'loft bed', 'trundle bed',
    'canopy bed', 'platform bed', 'sleigh bed', 'murphy bed',
    'bookshelf', 'étagère', 'coat rack', 'shoe rack', 'TV stand',
    'media console', 'kitchen island', 'bar cart', 'file cabinet',
    'curio cabinet', 'china cabinet', 'display case', 'trunk',
    'storage bench', 'hall tree', 'room divider', 'screen',
    'mirror', 'headboard', 'footboard', 'dressing table',
    'writing desk', 'computer desk', 'standing desk', 'drafting table',
    'workbench', 'kitchen cabinet', 'pantry', 'wine rack'
]

# Descriptive terms (materials, colors, styles, ...). label_text() tags every
# token of a match from this list as B-PRODUCT.
# NOTE(review): several entries are listed twice ('bronze', 'copper',
# 'distressed', 'quilted') and 'sectional' is also a furniture keyword.
# Hyphenated entries such as 'mid-century modern' or 'L-shaped' cannot occur
# verbatim in `cleaned_text`, because DataCleaner.remove_special_characters
# replaces '-' with a space before labeling.
furniture_descriptions = [
    # Materials
    'oak', 'pine', 'maple', 'birch', 'walnut', 'mahogany', 'cherry',
    'teak', 'bamboo', 'rattan', 'wicker', 'plywood', 'MDF',
    'particleboard', 'glass', 'tempered glass', 'steel', 'stainless steel',
    'wrought iron', 'aluminum', 'brass', 'copper', 'bronze',
    'leather', 'faux leather', 'suede', 'velvet', 'linen', 'cotton',
    'wool', 'polyester', 'nylon', 'acrylic', 'microfiber',
    'marble', 'granite', 'quartz', 'slate', 'concrete', 'terrazzo',
    'resin', 'plastic', 'polypropylene', 'polycarbonate',

    # Colors
    'white', 'black', 'gray', 'beige', 'brown', 'red', 'blue', 'green',
    'yellow', 'orange', 'purple', 'pink', 'turquoise', 'navy',
    'cream', 'ivory', 'tan', 'gold', 'silver', 'bronze', 'copper',
    'charcoal', 'taupe', 'sage', 'olive', 'mustard', 'burgundy',
    'teal', 'aqua', 'lavender', 'mauve', 'coral', 'rust',

    # Styles
    'modern', 'contemporary', 'traditional', 'rustic', 'industrial',
    'minimalist', 'scandinavian', 'mid-century modern', 'art deco',
    'bohemian', 'farmhouse', 'coastal', 'shabby chic', 'eclectic',
    'transitional', 'victorian', 'colonial', 'french country',
    'mediterranean', 'japanese', 'moroccan', 'retro', 'vintage',
    'antique', 'baroque', 'gothic', 'neoclassical', 'art nouveau',

    # Types and Features
    'upholstered', 'tufted', 'quilted', 'slipcovered', 'distressed',
    'reclaimed', 'recycled', 'upcycled', 'handcrafted', 'custom',
    'modular', 'sectional', 'convertible', 'expandable', 'folding',
    'stackable', 'nested', 'adjustable', 'swivel', 'rocking',
    'reclining', 'lift', 'storage', 'floating', 'wall-mounted',
    'freestanding', 'built-in', 'corner', 'L-shaped', 'U-shaped',
    'curved', 'ergonomic', 'orthopedic', 'outdoor', 'indoor-outdoor',
    'weatherproof', 'waterproof', 'stain-resistant', 'pet-friendly',

    # Finishes and Treatments
    'painted', 'stained', 'varnished', 'lacquered', 'oiled',
    'waxed', 'brushed', 'polished', 'matte', 'glossy', 'satin',
    'distressed', 'weathered', 'antiqued', 'patinated', 'powder-coated',
    'anodized', 'galvanized', 'chrome-plated', 'nickel-plated',

    # Patterns and Textures
    'striped', 'plaid', 'checkered', 'floral', 'geometric',
    'abstract', 'solid', 'textured', 'smooth', 'rough', 'embossed',
    'embroidered', 'perforated', 'woven', 'knitted', 'quilted',

    # Size and Scale
    'oversized', 'compact', 'space-saving', 'miniature', 'large-scale',
    'petite', 'tall', 'low', 'wide', 'narrow', 'deep', 'shallow'
]


In [None]:
# Combine keywords and descriptions, and sort by length (descending).
# label_text() stops at the first term that matches, so longest-first ordering
# makes multi-word terms ('coffee table') win over their shorter parts ('table').
all_furniture_terms = sorted(furniture_keywords + furniture_descriptions, key=len, reverse=True)

def _token_span_end(doc, start: int, char_end: int) -> int:
    """
    Find the token span that starts at token ``start`` and ends exactly at
    character offset ``char_end``.

    Returns:
        int: The index one past the last token of the span, or -1 when
        ``char_end`` falls inside a token (i.e. the match is only a word prefix).
    """
    stop = start
    while stop < len(doc) and doc[stop].idx < char_end:
        stop += 1
    last = doc[stop - 1]
    if last.idx + len(last.text) != char_end:
        return -1
    return stop


def label_text(text: str) -> List[Tuple[str, str]]:
    """
    Label the input text with furniture-related tags.

    Terms from ``all_furniture_terms`` are matched case-insensitively at token
    starts, longest term first. A match only counts when it ends on a token
    boundary, so 'bed' does not match inside 'bedroom'. The returned tokens are
    always the document's own tokens, never the text of the matched term.

    Labels: a matched furniture keyword yields "B-PRODUCT" for its first token
    and "I-PRODUCT" for the rest; a matched description term yields "B-PRODUCT"
    for each of its tokens; everything else is "O".

    Args:
        text (str): The input text to be labeled.

    Returns:
        List[Tuple[str, str]]: A list of tuples containing tokens and their corresponding labels.
    """
    doc = nlp(text)

    # Prepared once per call instead of once per token/term pair.
    lowered_terms = [(term.lower(), term) for term in all_furniture_terms if term]
    keyword_set = set(furniture_keywords)

    labeled_tokens = []
    i = 0

    while i < len(doc):
        token = doc[i]
        if token.is_space or token.is_punct:
            labeled_tokens.append((token.text, "O"))
            i += 1
            continue

        # Check for (multi-word) furniture terms starting at this token
        matched_term = None
        span_end = -1
        for lowered_term, term in lowered_terms:
            # Compare a bounded slice only; lower-casing the whole remaining
            # text for every term made labeling quadratic in the text length.
            candidate = text[token.idx:token.idx + len(lowered_term)]
            if candidate.lower() != lowered_term:
                continue
            span_end = _token_span_end(doc, i, token.idx + len(lowered_term))
            if span_end != -1:
                matched_term = term
                break

        if matched_term is None:
            labeled_tokens.append((token.text, "O"))
            i += 1
            continue

        span_tokens = [doc[j].text for j in range(i, span_end)]
        if matched_term in keyword_set:
            labeled_tokens.append((span_tokens[0], "B-PRODUCT"))
            labeled_tokens.extend((piece, "I-PRODUCT") for piece in span_tokens[1:])
        else:
            labeled_tokens.extend((piece, "B-PRODUCT") for piece in span_tokens)
        i = span_end

    return labeled_tokens

# Read the CSV file
df = pd.read_csv('cleaned_furniture_data.csv')

# Missing values come back from the CSV as NaN (a float), which label_text
# cannot process, so they are skipped.
texts_to_label = df['cleaned_text'].dropna()

# Process each row and label the text
labeled_data = [label_text(text) for text in tqdm(texts_to_label, desc="Labeling data")]

# Convert labeled data to BERT input format: one (tokens, labels) pair per text
bert_input = [
    ([token for token, _ in sentence], [label for _, label in sentence])
    for sentence in labeled_data
]

# Save the labeled data in a format suitable for BERT fine-tuning
# (`json` is already imported in the notebook's import cell)
with open('labeled_furniture_data.json', 'w', encoding='utf-8') as f:
    json.dump(bert_input, f)

print("Labeling complete. Data saved to 'labeled_furniture_data.json'")

Labeling data:   0%|          | 0/100 [00:00<?, ?it/s]

Labeling complete. Data saved to 'labeled_furniture_data.json'


In [None]:
# Load the labeled data: a list with one [tokens, labels] pair per text, as
# written by the labeling cell.
with open('labeled_furniture_data.json', 'r') as f:
    labeled_data = json.load(f)

# Define the label map (label string -> class id) and its inverse
# (class id -> label string), which train_model uses to decode predictions.
label_map = {"O": 0, "B-PRODUCT": 1, "I-PRODUCT": 2}
id2label = {i: label for label, i in label_map.items()}

class FurnitureDataset(Dataset):
    """
    Custom Dataset for furniture NER task.

    Each item of ``data`` is a ``(tokens, labels)`` pair with one label string
    per word. ``__getitem__`` tokenizes the words into subword pieces and
    aligns the word-level labels to those pieces.
    """
    def __init__(self, data, tokenizer, max_length):
        """
        Args:
            data: Sequence of (tokens, labels) pairs.
            tokenizer: A fast tokenizer whose encodings provide ``word_ids()``.
            max_length: Length every example is padded or truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns:
            dict: 'input_ids', 'attention_mask' and 'labels' tensors, all of
            length ``max_length``. Positions that do not belong to a word
            (special tokens, padding) carry the label -100.
        """
        tokens, labels = self.data[idx]
        encoding = self.tokenizer(tokens,
                                  is_split_into_words=True,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True)

        # word_ids() maps every subword position to the index of the word it
        # came from (None for special tokens and padding). The label must be
        # looked up by that word index, not by the subword position; otherwise
        # the labels drift away from the tokens they describe.
        label_ids = []
        previous_word_id = None
        for word_id in encoding.word_ids():
            if word_id is None or word_id >= len(labels):
                label_ids.append(-100)
            else:
                label = labels[word_id]
                # A further piece of the same word continues the entity rather
                # than starting a new one, so B-X becomes I-X when that label exists.
                if word_id == previous_word_id and label.startswith("B-"):
                    inside_label = "I-" + label[2:]
                    if inside_label in label_map:
                        label = inside_label
                label_ids.append(label_map[label])
            previous_word_id = word_id

        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(label_ids)
        }

def train_model(model, train_dataloader, val_dataloader, optimizer, device, num_epochs):
    """
    Train the model and perform validation after every epoch.

    After each epoch the mean validation loss and a seqeval classification
    report are printed. Positions labeled -100 are excluded from the report.

    Args:
        model: The BERT model for token classification.
        train_dataloader: DataLoader for training data.
        val_dataloader: DataLoader for validation data.
        optimizer: The optimizer for training.
        device: The device to run the model on.
        num_epochs: Number of training epochs.

    Returns:
        float: The mean validation loss of the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 or ``val_dataloader``
            contains no batches.
    """
    # Fail early with a clear message: without an epoch there is no validation
    # loss to return, and an empty validation loader would divide by zero.
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")
    if len(val_dataloader) == 0:
        raise ValueError("val_dataloader must contain at least one batch")

    model.to(device)
    val_loss = 0.0
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=2)
                # Decode per sequence, keeping only positions with a real label.
                for pred_row, label_row in zip(preds.tolist(), labels.tolist()):
                    kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
                    predictions.append([id2label[p] for p, _ in kept])
                    true_labels.append([id2label[l] for _, l in kept])

        val_loss /= len(val_dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}")
        print(classification_report(true_labels, predictions))

    return val_loss

def objective(trial):
    """
    Objective function for Optuna hyperparameter optimization.

    Samples a learning rate, a batch size and a number of epochs, fine-tunes a
    fresh 'bert-base-cased' token-classification model with them and reports
    the resulting validation loss. Reads the module-level ``train_data``,
    ``val_data``, ``tokenizer``, ``label_map`` and ``device``.

    Args:
        trial: An Optuna trial object.

    Returns:
        float: The validation loss to be minimized.
    """
    # Hyperparameters to optimize.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    num_epochs = trial.suggest_int('num_epochs', 2, 5)

    # Create datasets and dataloaders
    train_dataset = FurnitureDataset(train_data, tokenizer, max_length=128)
    val_dataset = FurnitureDataset(val_data, tokenizer, max_length=128)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model and optimizer (a new model per trial, so trials do not share weights)
    model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_map))
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Train and evaluate
    return train_model(model, train_dataloader, val_dataloader, optimizer, device, num_epochs)

# Set up device: GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split the data: 80% training, 20% validation, fixed seed for a reproducible split
train_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

# Set up tokenizer
# NOTE(review): the labeled texts were lower-cased during cleaning while this
# tokenizer is the cased variant — confirm that this combination is intended.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Hyperparameter optimization: 20 trials, each fine-tuning a fresh model inside objective()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

print("Best trial:")
trial = study.best_trial
print(f"Value: {trial.value}")
print("Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Train final model with best hyperparameters
best_params = study.best_params
final_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_map))
final_optimizer = AdamW(final_model.parameters(), lr=best_params['learning_rate'])

# Every example is padded/truncated to 128 subword tokens (see FurnitureDataset),
# so only the beginning of longer texts is used.
train_dataset = FurnitureDataset(train_data, tokenizer, max_length=128)
val_dataset = FurnitureDataset(val_data, tokenizer, max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=best_params['batch_size'])

# NOTE(review): the final model is validated on the same val_data that was used
# to select the hyperparameters; there is no separate held-out test set.
train_model(final_model, train_dataloader, val_dataloader, final_optimizer, device, best_params['num_epochs'])

# Save the final model and the tokenizer into the same directory so both can be
# reloaded from one path
final_model.save_pretrained('./furniture_ner_model')
tokenizer.save_pretrained('./furniture_ner_model')

In [None]:
# Load the fine-tuned model and tokenizer
model_path = './furniture_ner_model'
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)
model.eval()  # inference mode

# Define the label map (class id -> label string), read by extract_furniture_names.
# NOTE(review): this rebinds `label_map` in the opposite direction of the
# training cell (label string -> class id). FurnitureDataset looks labels up by
# string, so re-running the training code after this cell raises a KeyError.
label_map = {0: "O", 1: "B-PRODUCT", 2: "I-PRODUCT"}

def extract_furniture_names(text, tokenizer, model):
    """
    Extract product names from a text with the fine-tuned token classifier.

    The text is tokenized (truncated to 512 subword tokens, so anything beyond
    that is not analysed), every token is classified, and consecutive
    B-PRODUCT / I-PRODUCT tokens are grouped into names. Class ids are decoded
    with the module-level ``label_map`` (class id -> label string).

    Args:
        text: The text to analyse. Non-string or blank values yield no names.
        tokenizer: The tokenizer belonging to ``model``.
        model: The token-classification model.

    Returns:
        list: The extracted names in order of appearance (may contain duplicates).
    """
    # Missing values read from a CSV arrive as NaN (a float); nothing to extract.
    if not isinstance(text, str) or not text.strip():
        return []

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # Markers such as [CLS], [SEP] and [PAD] must never become part of a name.
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    furniture_names = []
    current_name = []

    for token, prediction in zip(tokens, predictions):
        if token in special_tokens:
            if current_name:
                furniture_names.append(" ".join(current_name))
                current_name = []
            continue

        if token.startswith("##"):
            # WordPiece continuation: it belongs to the previous piece's word,
            # so glue it on instead of joining with a space ("so" + "##fa" ->
            # "sofa"). If that word is not part of a name, the piece is dropped.
            if current_name:
                current_name[-1] += token[2:]
            continue

        label = label_map[prediction]
        if label == "B-PRODUCT":
            if current_name:
                furniture_names.append(" ".join(current_name))
            current_name = [token]
        elif label == "I-PRODUCT":
            current_name.append(token)
        elif current_name:
            furniture_names.append(" ".join(current_name))
            current_name = []

    if current_name:
        furniture_names.append(" ".join(current_name))

    return [name.strip() for name in furniture_names if name.strip()]

# Load the CSV file
df = pd.read_csv('new_furniture_websites.csv')

# Rows without a URL or without cleaned text cannot be processed.
pages = df.dropna(subset=['website', 'cleaned_text'])

# Process each website and extract furniture names
results = {}

for website, cleaned_text in tqdm(zip(pages['website'], pages['cleaned_text']),
                                  total=len(pages), desc="Processing websites"):
    furniture_names = extract_furniture_names(cleaned_text, tokenizer, model)

    if furniture_names:
        # Remove duplicates while keeping first-seen order. A set would make
        # the order, and therefore the JSON file, differ from run to run.
        results[website] = list(dict.fromkeys(furniture_names))

# Save the results to a JSON file
with open('furniture_products.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Furniture names extraction complete. Results saved to 'furniture_products.json'")

Processing websites:   0%|          | 0/157 [00:00<?, ?it/s]

Furniture names extraction complete. Results saved to 'furniture_products.json'


In [None]:
def clean_product_name(name: str) -> str:
    """
    Clean a product name by removing non-letter characters and extra spaces.

    Only ASCII letters and whitespace are kept; runs of whitespace are
    collapsed to a single space and the result has no leading or trailing
    whitespace.

    Args:
        name (str): The original product name.

    Returns:
        str: The cleaned product name (may be empty).
    """
    # Remove any non-letter characters (whitespace is kept as a separator)
    name = re.sub(r'[^a-zA-Z\s]', '', name)
    # Collapse runs of whitespace into a single space
    name = re.sub(r'\s+', ' ', name)
    # Trim last: dropping characters above can expose new leading/trailing
    # spaces (e.g. "- oak table" or "Sofa 2000").
    return name.strip()

# Load the JSON file
with open('furniture_products.json', 'r') as f:
    data = json.load(f)

# Clean the data
cleaned_data = {}

for website, products in data.items():
    cleaned_products = []
    for product in products:
        cleaned_product = clean_product_name(product)
        # Check if the cleaned product name is valid (length > 2 and contains only letters)
        if len(cleaned_product) > 2 and cleaned_product.replace(' ', '').isalpha():
            cleaned_products.append(cleaned_product)

    # Different raw names can clean to the same string (e.g. "sofa!" and
    # "sofa"), so de-duplicate again here while keeping first-seen order.
    cleaned_products = list(dict.fromkeys(cleaned_products))

    # Only add the website to the cleaned data if it has valid products
    if cleaned_products:
        cleaned_data[website] = cleaned_products

# Save the cleaned data back to a JSON file
with open('cleaned_furniture_products.json', 'w') as f:
    json.dump(cleaned_data, f, indent=2)

print("Data cleaning complete. Results saved to 'cleaned_furniture_products.json'")

Data cleaning complete. Results saved to 'cleaned_furniture_products.json'


In [None]:
from flask import Flask, render_template_string, request
from IPython.display import HTML
import threading


# Let asynchronous operations run inside the notebook via nest_asyncio
nest_asyncio.apply()

app = Flask(__name__)

# Read the cleaned furniture products (JSON) that the web app will serve
with open('cleaned_furniture_products.json', 'r') as json_file:
    product_data = json.load(json_file)

def normalize_url(url: str) -> str:
    """
    Normalize a URL by removing protocol, 'www', and trailing slash.

    Surrounding whitespace is ignored, the protocol and 'www.' prefix are
    matched case-insensitively, and the host part is lower-cased so that
    e.g. 'HTTPS://WWW.Example.com/' and 'example.com' compare equal. The
    path after the host keeps its original case.

    Args:
        url (str): The URL to normalize.

    Returns:
        str: The normalized URL.
    """
    url = url.strip()
    # Remove protocol (http:// or https://), whatever its letter case
    url = re.sub(r'^https?://', '', url, flags=re.IGNORECASE)
    # Remove www. if present
    url = re.sub(r'^www\.', '', url, flags=re.IGNORECASE)
    # Remove trailing slash
    url = url.rstrip('/')
    # Host names are case-insensitive; paths are not, so only fold the host
    host, slash, path = url.partition('/')
    return host.lower() + slash + path

# HTML template as a string (rendered with render_template_string).
# Template variables used below:
#   search_url - the submitted URL text, echoed back into the form field and
#                into the results heading
#   products   - sequence of product names to list; when it is empty but
#                search_url is set, the "No products found" message is shown
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Furniture Product Search</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        h1 {
            text-align: center;
        }
        form {
            display: flex;
            margin-bottom: 20px;
        }
        input[type="text"] {
            flex-grow: 1;
            padding: 10px;
            font-size: 16px;
        }
        input[type="submit"] {
            padding: 10px 20px;
            font-size: 16px;
            background-color: #4CAF50;
            color: white;
            border: none;
            cursor: pointer;
        }
        ul {
            list-style-type: none;
            padding: 0;
        }
        li {
            background-color: #f1f1f1;
            margin-bottom: 10px;
            padding: 10px;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <h1>Furniture Product Search</h1>
    <form method="POST">
        <input type="text" name="search_url" placeholder="Enter website URL" value="{{ search_url }}" required>
        <input type="submit" value="Search">
    </form>
    {% if products %}
        <h2>Products found for: {{ search_url }}</h2>
        <ul>
            {% for product in products %}
                <li>{{ product }}</li>
            {% endfor %}
        </ul>
    {% elif search_url %}
        <p>No products found for the given URL.</p>
    {% endif %}
</body>
</html>
"""

@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Serve the search page and, on form submission, look up a site's products.

    A GET request renders the empty form. A POST request reads the
    'search_url' form field, normalizes it, and shows the product list of
    the first stored website whose normalized URL is equal to it.

    Returns:
        str: Rendered HTML template with search results.
    """
    # Plain page load: nothing to search for yet
    if request.method != 'POST':
        return render_template_string(html_template, products=[], search_url='')

    search_url = request.form['search_url']
    wanted = normalize_url(search_url)

    # First stored website matching the query, or an empty list if none does
    matches = (
        site_products
        for site_url, site_products in product_data.items()
        if normalize_url(site_url) == wanted
    )
    products = next(matches, [])

    return render_template_string(html_template, products=products, search_url=search_url)

def run_flask(port: int = 5000):
    """
    Run the Flask application; blocks until the server stops.

    Args:
        port (int): TCP port to listen on. Defaults to 5000.
    """
    # This function is started from a worker thread. The auto-reloader needs
    # the main thread (it registers signal handlers), so keep it off
    # explicitly even if debug mode gets switched on through the environment.
    app.run(port=port, use_reloader=False)

# Start Flask in a separate thread.
# - daemon=True: the server thread never keeps the kernel process alive on
#   shutdown.
# - The thread is named and only started when no live thread with that name
#   exists, so re-running this cell does not try to bind port 5000 a second
#   time. Restart the kernel to serve a redefined `app`.
FLASK_THREAD_NAME = "flask-server"
if not any(t.name == FLASK_THREAD_NAME and t.is_alive() for t in threading.enumerate()):
    threading.Thread(target=run_flask, name=FLASK_THREAD_NAME, daemon=True).start()

# Display a link to the Flask app
HTML('<a href="http://localhost:5000" target="_blank">Click here to open the Flask app</a>')

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
