In [46]:
# !pip install flair beautifulsoup4 pdfkit
# !sudo apt-get install wkhtmltopdf

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from flair.data import Sentence 
from flair.models import SequenceTagger
from tqdm.auto import tqdm
from segtok.segmenter import split_single
import logging
import numpy as np
import ast


tagger = SequenceTagger.load('ner-ontonotes-large')  # ner-ontonotes-large

2022-01-24 19:07:41,254 --------------------------------------------------------------------------------
2022-01-24 19:07:41,255 The model key 'ner-ontonotes-large' now maps to 'https://huggingface.co/flair/ner-english-ontonotes-large' on the HuggingFace ModelHub
2022-01-24 19:07:41,255  - The most current version of the model is automatically downloaded from there.
2022-01-24 19:07:41,255 --------------------------------------------------------------------------------
2022-01-24 19:07:41,834 loading file /home/gabriel/.flair/models/ner-english-ontonotes-large/2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1


In [29]:
def build_dataset(urls, char_limit=100000, page_limit=None):

    urls = pd.read_csv('furniture stores pages.csv').values.squeeze().tolist()[:page_limit]
    pages = []
    
    for url in urls:

        try:

            data = requests.get(url)
            soup = BeautifulSoup(data.content, 'html.parser')
            tags = soup.find_all()

            text = []
            blacklist = [
                '[document]',
                'noscript',
                'header',
                'html',
                'meta',
                'head',
                'input',
                'script',
                'style',
                'title'
            ]

            for t in tags:
                if t.name not in blacklist and t.parent.name not in blacklist:
                    clean_text = " ".join(t.text.split())
                    if len(clean_text) > 1:
                        text.append(clean_text)

#             if len(clean_text) > char_limit:
#                 print(clean_text)
#                 print(len(clean_text))
#                 clean_text = clean_text[:char_limit]
    
            pages.append(text)

        except Exception as e:
            logging.exception(str(e))
            pages.append(np.nan)

    df = pd.DataFrame(columns=['raw_text'])
    df['raw_text'] = pages
    # df = df.replace(np.nan, '', regex=True)
    return df

df = build_dataset('furniture stores pages.csv', page_limit=5)
df.to_csv('pages_content.csv')  

In [4]:
def batch(iterable, n=16):
    l = len(iterable)
    for ndx in tqdm(range(0, l // 3, n), leave=False, desc='Sentences Progress'):
        yield iterable[ndx:min(ndx + n, l)]

pages = pd.read_csv('pages_content.csv')['raw_text'].values.tolist()

for page in tqdm(pages, desc='Pages Progress'):
    page = ast.literal_eval(page)
    sentences = [Sentence(sent, use_tokenizer=True) for line in page for sent in split_single(line)
                 if sent != '' and sent != ' ' and sent != '\n' and sent != '\t' and len(sent) <= 512]
    
    for sent_batch in batch(sentences):        
        tagger.predict(sent_batch)

    for sent in sentences:
        for entity in sent.get_spans('ner'):
            if entity.tag == 'PRODUCT' and entity.score > 0.9:
                print(entity)


Pages Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Sentences Progress:   0%|          | 0/20 [00:00<?, ?it/s]

Sentences Progress:   0%|          | 0/12 [00:00<?, ?it/s]

Span [3,4,5,6]: "Cirrus LED Reading Light"   [− Labels: PRODUCT (0.9991)]
Span [13]: "Cirrus"   [− Labels: PRODUCT (0.9999)]
Span [12]: "Beadlight"   [− Labels: PRODUCT (0.9097)]


Sentences Progress:   0%|          | 0/19 [00:00<?, ?it/s]

Sentences Progress:   0%|          | 0/11 [00:00<?, ?it/s]

Span [14,15,16]: "Waterfall Seat Edge"   [− Labels: PRODUCT (0.9779)]
Span [14,15,16]: "Waterfall Seat Edge"   [− Labels: PRODUCT (0.9779)]
Span [14,15,16]: "Waterfall Seat Edge"   [− Labels: PRODUCT (0.9779)]
Span [16]: "Edge"   [− Labels: PRODUCT (0.9225)]


Sentences Progress:   0%|          | 0/6 [00:00<?, ?it/s]