### Web Crawler ###

In [1]:
import logging
import sys
import requests
import re
import csv
from html import unescape

def get_category_list(content):
    """get_category_list() takes content of home page and returns
    a list of categories and their urls
    """
    return category_pat.findall(content)

def get_book_list(content):
    """get_book_list() takes content of a book list page and returns
    a list of books (name and url)
    """
    content = content.replace("\n", " ")
    return book_list_pat.findall(content)

def get_product_details(content):
    """get_product_details() takes content of a product page, parses
    the page and returns details about a product
    """
    image_base = "http://books.toscrape.com/"
    result = img_pat.findall(content)
    if len(result) == 0:
        logging.warn("Image url not found!")
        image_url = ""
    else:
        img_url = result[0]
        img_url = img_url.replace("../../", "")
        image_url = image_base + img_url
    result = desc_pat.findall(content)
    if len(result) == 0:
        logging.warn("Description not found!")
        description = ""
    else:
        description = unescape(result[0])
    result = upc_pat.findall(content)
    if len(result) == 0:
        logging.warn("UPC not found!")
        upc = ""
    else:
        upc = result[0]
    result = price_pat.findall(content)
    if len(result) == 0:
        logging.warn("Price not found!")
        price = ""
    else:
        price = result[0]
    result = avail_pat.findall(content)
    if len(result) == 0:
        logging.warn("Availability not found!")
        availability = ""
    else:
        availability = result[0]
    return upc, price, image_url, availability, description

def get_page_content(url):
    """get_page_content() takes a url and returns the content of the page
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException as e:
        logging.error(e)
    if response.ok:
        return response.text
    logging.error("Can not get content from URL:" + url)
    return ""

def get_next_page(url, content):
    """get_next_page() checks the content of a book list page and
    returns link of  the next page, returns None, if there is no
    more next page
    """
    result = next_page_pat.findall(content)
    if len(result) == 0:
        return None
    i = url.rfind("/")
    return url[0:i + 1] + result[0]

def scrape_book_info(book_info, category_name):
    """scrape_book_info() gets the content of a book details page,
    and parses different components and stores the info """

    book_url, book_name = book_info
    book_name = unescape(book_name)
    book_dict = {"Name": book_name, "Category": category_name}
    book_url = book_url.replace("../../../", "")
    book_url = "http://books.toscrape.com/catalogue/" + book_url
    book_dict["URL"] = book_url
    print("Scraping book", book_name)
    logging.info("Scraping : " + book_url)
    content = get_page_content(book_url)
    content = content.replace("\n", " ")
    upc, price, image_url, availability, desc = get_product_details(content)
    book_dict["UPC"] = upc
    book_dict["Price"] = price
    book_dict["ImageURL"] = image_url
    book_dict["Availability"] = availability
    book_dict["Description"] = desc
    csv_writer.writerow(book_dict)

def crawl_category(category_name, category_url):
    """crawl_category() crawls a particular category of books
    """
    while True:
        content = get_page_content(category_url)
        book_list = get_book_list(content)
        for book in book_list:
            scrape_book_info(book, category_name)
        next_page = get_next_page(category_url, content)
        if next_page is None:
            break
        category_url = next_page

def crawl_website():
    """crawl_website() is the main function that coordinates the whole crawling task
    """
    url = "http://books.toscrape.com/index.html"
    host_name = "books.toscrape.com"
    content = get_page_content(url)
    if content == "":
        logging.critical("Got empty content from " + url)
        sys.exit(1)
    category_list = get_category_list(content)
    for category in category_list:
        category_url, category_name = category
        category_url = "http://" + host_name + "/" + category_url
        crawl_category(category_name, category_url)

if __name__ == "__main__":
    # Compile different regular expression patterns
    category_pat = re.compile(r'<li>\s*<a href="(catalogue/category/books/.*?)">\s*(\w+[\s\w]+\w)\s*?<',re.M | re.DOTALL)
    next_page_pat = re.compile(r'<li class="next"><a href="(.*?)">next</a></li>')
    book_list_pat = re.compile(r'<h3><a href="(.*?)" title="(.*?)">')
    img_pat = re.compile(r'<div class="item active">\s*<img src="(.*?)"')
    desc_pat = re.compile(r'<div id="product_description" class="sub-header">.*?<p>(.*?)</p>')
    upc_pat = re.compile(r'<th>UPC</th>\s*<td>(.*?)</td>')
    price_pat = re.compile(r'<th>Price \(incl. tax\)</th>\s*<td>\D+([\d.]+?)</td>')
    avail_pat = re.compile(r'<th>Availability</th>\s*<td>(.*?)</td>')
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',filename="bookstore_crawler.log", level=logging.DEBUG)
    header_fields = ["Name", "Category", "UPC", "URL", "ImageURL","Price", "Availability", "Description"]

    with open("book_list.csv", "w", encoding="ISO-8859-1") as csvf:
        csv_writer = csv.DictWriter(csvf, fieldnames=header_fields)
        csv_writer.writeheader()
        crawl_website()
        print("Crawling Done!")

Scraping book It's Only the Himalayas
Scraping book Full Moon over Noahâs Ark: An Odyssey to Mount Ararat and Beyond
Scraping book See America: A Celebration of Our National Parks & Treasured Sites
Scraping book Vagabonding: An Uncommon Guide to the Art of Long-Term World Travel
Scraping book Under the Tuscan Sun
Scraping book A Summer In Europe
Scraping book The Great Railway Bazaar
Scraping book A Year in Provence (Provence #1)
Scraping book The Road to Little Dribbling: Adventures of an American in Britain (Notes From a Small Island #2)
Scraping book Neither Here nor There: Travels in Europe
Scraping book 1,000 Places to See Before You Die
Scraping book Sharp Objects
Scraping book In a Dark, Dark Wood
Scraping book The Past Never Ends
Scraping book A Murder in Time
Scraping book The Murder of Roger Ackroyd (Hercule Poirot #4)
Scraping book The Last Mile (Amos Decker #2)
Scraping book That Darkness (Gardiner and Renner #1)
Scraping book Tastes Like Fear (DI Marnie Rome #3)
Scraping

Scraping book Wuthering Heights
Scraping book The Picture of Dorian Gray
Scraping book The Complete Stories and Poems (The Works of Edgar Allan Poe [Cameo Edition])
Scraping book Beowulf
Scraping book And Then There Were None
Scraping book The Story of Hong Gildong
Scraping book The Little Prince
Scraping book Sense and Sensibility
Scraping book Of Mice and Men
Scraping book Emma
Scraping book Alice in Wonderland (Alice's Adventures in Wonderland #1)


  logging.warn("Description not found!")


Scraping book Sophie's World
Scraping book The Death of Humanity: and the Case for Life
Scraping book The Stranger
Scraping book Proofs of God: Classical Arguments from Tertullian to Barth
Scraping book Kierkegaard: A Christian Missionary to Christians
Scraping book At The Existentialist CafÃ©: Freedom, Being, and apricot cocktails with: Jean-Paul Sartre, Simone de Beauvoir, Albert Camus, Martin Heidegger, Edmund Husserl, Karl Jaspers, Maurice Merleau-Ponty and others
Scraping book Critique of Pure Reason
Scraping book Run, Spot, Run: The Ethics of Keeping Pets
Scraping book The Nicomachean Ethics
Scraping book Meditations
Scraping book Beyond Good and Evil
Scraping book Chase Me (Paris Nights #2)
Scraping book Black Dust
Scraping book Her Backup Boyfriend (The Sorensen Family #1)
Scraping book First and First (Five Boroughs #3)
Scraping book Fifty Shades Darker (Fifty Shades #2)
Scraping book The Wedding Dress
Scraping book Suddenly in Love (Lake Haven #1)
Scraping book Something More

Scraping book Algorithms to Live By: The Computer Science of Human Decisions
Scraping book The Power of Now: A Guide to Spiritual Enlightenment
Scraping book The Omnivore's Dilemma: A Natural History of Four Meals
Scraping book The Genius of Birds
Scraping book The Artist's Way: A Spiritual Path to Higher Creativity
Scraping book So You've Been Publicly Shamed
Scraping book Daring Greatly: How the Courage to Be Vulnerable Transforms the Way We Live, Love, Parent, and Lead
Scraping book Big Magic: Creative Living Beyond Fear
Scraping book Becoming Wise: An Inquiry into the Mystery and Art of Living
Scraping book Agnostic: A Spirited Manifesto
Scraping book Whole Lotta Creativity Going On: 60 Fun and Unusual Exercises to Awaken and Strengthen Your Creativity
Scraping book What's It Like in Space?: Stories from Astronauts Who've Been There
Scraping book The Year of Magical Thinking
Scraping book The Literature Book (Big Ideas Simply Explained)
Scraping book The Bad-Ass Librarians of Timbu

Scraping book The Psychopath Test: A Journey Through the Madness Industry
Scraping book The Kite Runner
Scraping book The Girl on the Train
Scraping book The Emerald Mystery
Scraping book The Bridge to Consciousness: I'm Writing the Bridge Between Science and Our Old and New Beliefs.
Scraping book The Art of War
Scraping book Secrets and Lace (Fatal Hearts #1)
Scraping book Romero and Juliet: A Tragic Tale of Love and Zombies
Scraping book Poses for Artists Volume 1 - Dynamic and Sitting Poses: An Essential Reference for Figure Drawing and the Human Form
Scraping book Miss Peregrineâs Home for Peculiar Children (Miss Peregrineâs Peculiar Children #1)
Scraping book Large Print Heart of the Pride
Scraping book Grumbles
Scraping book First Steps for New Christians (Print Edition)
Scraping book Eureka Trivia 6.0
Scraping book Drive: The Surprising Truth About What Motivates Us
Scraping book Done Rubbed Out (Reightman & Bailey #1)
Scraping book Beauty Restored (Riley Family Legacy Novel

Scraping book The Art Forger
Scraping book On a Midnight Clear
Scraping book Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)
Scraping book Shobu Samurai, Project Aryoku (#3)
Scraping book Modern Romance
Scraping book The White Queen (The Cousins' War #1)
Scraping book The Song of Achilles
Scraping book The Immortal Life of Henrietta Lacks
Scraping book The Dovekeepers
Scraping book More Than Music (Chasing the Dream #1)
Scraping book Code Name Verity (Code Name Verity #1)
Scraping book Cell
Scraping book Angels Walking (Angels Walking #1)
Scraping book A Series of Catastrophes and Miracles: A True Story of Love, Science, and Cancer
Scraping book A People's History of the United States
Scraping book A Brush of Wings (Angels Walking #3)
Scraping book Rook
Scraping book The Midnight Watch: A Novel of the Titanic and the Californian
Scraping book The Gray Rhino: How to Recognize and Act on the Obvious Dangers We Ignore
Scraping book The Children
Scraping book One with

Scraping book The Most Perfect Thing: Inside (and Outside) a Bird's Egg
Scraping book Immunity: How Elie Metchnikoff Changed the Course of Modern Medicine
Scraping book Sorting the Beef from the Bull: The Science of Food Fraud Forensics
Scraping book Tipping Point for Planet Earth: How Close Are We to the Edge?
Scraping book The Fabric of the Cosmos: Space, Time, and the Texture of Reality
Scraping book Diary of a Citizen Scientist: Chasing Tiger Beetles and Other New Ways of Engaging the World
Scraping book The Origin of Species
Scraping book The Grand Design
Scraping book Peak: Secrets from the New Science of Expertise
Scraping book The Elegant Universe: Superstrings, Hidden Dimensions, and the Quest for the Ultimate Theory
Scraping book The Disappearing Spoon: And Other True Tales of Madness, Love, and the History of the World from the Periodic Table of the Elements
Scraping book Surely You're Joking, Mr. Feynman!: Adventures of a Curious Character
Scraping book Seven Brief Lessons 

Scraping book The Art of Simple Food: Notes, Lessons, and Recipes from a Delicious Revolution
Scraping book Hungry Girl Clean & Hungry: Easy All-Natural Recipes for Healthy Eating in the Real World
Scraping book Redeeming Love
Scraping book Close to You
Scraping book Shadows of the Past (Logan Point #1)
Scraping book Like Never Before (Walker Family #2)
Scraping book Counted With the Stars (Out from Egypt #1)
Scraping book If I Run (If I Run #1)
Scraping book The Dirty Little Secrets of Getting Your Dream Job
Scraping book The Third Wave: An Entrepreneurâs Vision of the Future
Scraping book The 10% Entrepreneur: Live Your Startup Dream Without Quitting Your Day Job
Scraping book Shoe Dog: A Memoir by the Creator of NIKE
Scraping book Made to Stick: Why Some Ideas Survive and Others Die
Scraping book Quench Your Own Thirst: Business Lessons Learned Over a Beer or Two
Scraping book The Art of Startup Fundraising
Scraping book Born for This: How to Find the Work You Were Meant to Do
Scr