In [14]:
from bs4 import BeautifulSoup
import re

In [30]:
class ParsedItemLocators:
    """
    CSS selectors used to find each piece of an item in the HTML page.

    Keeping every selector in one place shows at a glance what the parsing
    code looks at, and gives a single spot to edit when the page markup
    turns out to be different.
    """
    # The anchor inside the item's <h3> heading. The name and the link are
    # located with the same selector.
    NAME_LOCATOR = "article.product_pod h3 a"
    LINK_LOCATOR = "article.product_pod h3 a"

    # Paragraph elements carrying the price text and the star-rating classes.
    PRICE_LOCATOR = "article.product_pod p.price_color"
    RATING_LOCATOR = "article.product_pod p.star-rating"

    # An <img> nested in a link inside a <div> of an <article>. Unlike the
    # selectors above, this one is not restricted to the product_pod class.
    IMAGE_LOCATOR = "article div a img"

In [26]:
class ParsedItem:
    """
    Take in an HTML page (or a part of it) and find properties of an item in it.

    Every property selects one element with a CSS selector taken from
    ``ParsedItemLocators`` and extracts a single value from that element.
    When no element matches a selector, ``select_one`` returns ``None`` and
    the property fails with ``AttributeError``.
    """

    # The decimal point is escaped on purpose: an unescaped '.' matches any
    # character, which would accept text such as '£51x77' as a price.
    _PRICE_PATTERN = re.compile(r'£([0-9]+\.[0-9]+)')

    def __init__(self, page):
        """Parse ``page`` (HTML markup) using the 'html.parser' backend."""
        self.soup = BeautifulSoup(page, 'html.parser')

    def _select(self, locator):
        """Return the first element matching the CSS selector ``locator`` (or None)."""
        return self.soup.select_one(locator)

    def _attribute(self, locator, attribute):
        """Return the ``attribute`` value of the first element matching ``locator``.

        Raises:
            AttributeError: if no element matches ``locator``.
            KeyError: if the matched element has no such attribute.
        """
        return self._select(locator).attrs[attribute]

    @staticmethod
    def _parse_price(price_text):
        """Extract the amount following a '£' sign in ``price_text`` as a float.

        Raises:
            ValueError: if ``price_text`` contains no amount of the form
                '£<digits>.<digits>'.
        """
        match = ParsedItem._PRICE_PATTERN.search(price_text)
        if match is None:
            raise ValueError(f'No price of the form £0.00 found in {price_text!r}')
        return float(match.group(1))

    @property
    def name(self):
        """The item's name, read from the ``title`` attribute of its heading link."""
        return self._attribute(ParsedItemLocators.NAME_LOCATOR, 'title')

    @property
    def link(self):
        """The ``href`` of the item's heading link, exactly as written in the page."""
        return self._attribute(ParsedItemLocators.LINK_LOCATOR, 'href')

    @property
    def image_source(self):
        """The ``src`` of the item's image, exactly as written in the page."""
        return self._attribute(ParsedItemLocators.IMAGE_LOCATOR, 'src')

    @property
    def price(self):
        """The item's price as a float, e.g. 51.77 for the text '£51.77'.

        Raises:
            ValueError: if the price text holds no '£<digits>.<digits>' amount.
        """
        price_text = self._select(ParsedItemLocators.PRICE_LOCATOR).string
        return self._parse_price(price_text)

    @property
    def rating(self):
        """The item's rating: the first class on the rating element other than 'star-rating'.

        For ``<p class="star-rating Three">`` this is ``'Three'``.

        Raises:
            IndexError: if the element carries no class besides 'star-rating'.
        """
        classes = self._select(ParsedItemLocators.RATING_LOCATOR).attrs['class']
        # Filter instead of indexing a fixed position: the order of the
        # classes in the markup is not guaranteed.
        rating_classes = [name for name in classes if name != 'star-rating']
        return rating_classes[0]


In [17]:
# Sample markup for a single product entry, used below as input to ParsedItem.
# It contains one of each element the locators select: a linked thumbnail
# image, a star-rating paragraph, a titled heading link and a price paragraph.
ITEM_HTML = '''<html><head></head><body>
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
            </div>
            <p class="star-rating Three">
                <i class="icon-star"></i>
                <i class="icon-star"></i>
                <i class="icon-star"></i>
                <i class="icon-star"></i>
                <i class="icon-star"></i>
            </p>
            <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
            <div class="product_price">
                <p class="price_color">£51.77</p>
                <p class="instock availability">
                    <i class="icon-ok"></i>

                        In stock

                </p>
                 <form>
                    <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
                </form>
            </div>
    </article>
</li>

</body></html>
'''

In [18]:
# Parse the sample markup and print the src attribute of the item's image.
item = ParsedItem(ITEM_HTML)
print(item.image_source)

NameError: name 'IMAGE_LOCATOR' is not defined