In [15]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import logging
import csv
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException

In [16]:
def _clean_text(node, trailing=' \r\n'):
    """Return node.get_text() with spaces/CR/LF removed from the left and `trailing` chars from the right."""
    return node.get_text().lstrip(' \r\n').rstrip(trailing)


def extract_data(url):
    """Download one product page and collect the book's fields into a dict.

    Parameters
    ----------
    url : str
        Absolute URL of the product page.

    Returns
    -------
    dict
        Keys, in insertion order: 'title', 'old-price', 'new-price',
        'main-subject', 'sub-subject', 'writer', 'translator', 'producer',
        'seen'. The two price fields fall back to the string 'NOT FOUND'
        when their element is absent.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the server answers with a 4xx/5xx status.
    IndexError
        If the title, one of the detail links or the view counter is missing
        from the page.
    """
    response = requests.get(url, timeout=20)
    # Without this check an error page would be parsed as if it were a product
    # page; scrape() performs the same check on listing pages.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    result = {}

    # Title and new price only lose trailing spaces (not CR/LF) on the right.
    result['title'] = _clean_text(soup.select('.product-name')[0], trailing=' ')

    # The old price is stored exactly as get_text() returns it (no trimming).
    try:
        result['old-price'] = soup.select('.uk-width-1-1 h4')[0].get_text()
    except IndexError:
        result['old-price'] = 'NOT FOUND'

    # The price is read from the second matching <h2>, hence index 1.
    try:
        result['new-price'] = _clean_text(soup.select('.uk-width-1-1 h2')[1], trailing=' ')
    except IndexError:
        result['new-price'] = 'NOT FOUND'

    # Positions of each field inside the '.uk-list li a' matches.
    # NOTE(review): these offsets are tied to the current page layout — confirm
    # they hold for every product type.
    detail_box = soup.select('.uk-list li a')
    detail_positions = (
        ('main-subject', 3),
        ('sub-subject', 4),
        ('writer', 5),
        ('translator', 6),
        ('producer', 7),
    )
    for field, position in detail_positions:
        result[field] = _clean_text(detail_box[position])

    # NOTE(review): index 12 presumably points at the view counter — confirm.
    result['seen'] = _clean_text(soup.select('.uk-width-auto span')[12])
    return result

In [17]:
def scrape(url, logger, max_books=10):
    """Scrape one listing page and return the data of the books it links to.

    Parameters
    ----------
    url : str
        URL of the listing page.
    logger : logging.Logger
        Receives progress, warning and error messages.
    max_books : int or None, optional
        Upper bound on the number of product links followed (default 10, the
        previously hard-coded value). ``None`` follows every link found.

    Returns
    -------
    list of dict
        One dict per successfully scraped book, as produced by
        ``extract_data``. Empty if the listing page itself could not be
        fetched. A book whose page fails to scrape is logged and skipped.
    """
    logger.info('Starting to scrape the page [{}]'.format(url))
    results = []

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except HTTPError as http_err:
        logger.error('HTTP error occurred: {}'.format(http_err))
        return results
    except ConnectionError as conn_err:
        logger.error('Connection error occurred: {}'.format(conn_err))
        return results
    except Timeout as timeout_err:
        logger.error('Timeout error occurred: {}'.format(timeout_err))
        return results
    except RequestException as req_err:
        logger.error('Request error occurred: {}'.format(req_err))
        return results

    print(f'response = {response.status_code} ')
    soup = BeautifulSoup(response.text, 'html.parser')
    new_links = soup.select('.uk-margin-medium-bottom div div a')

    # Slice instead of indexing range(10): a page with fewer links no longer
    # produces one bogus "failed" warning per missing slot.
    for link in new_links[:max_books]:
        book_url = link.get('href')
        if not book_url:
            # Without an href the product URL cannot be built; skip the
            # request instead of fetching '...comNone'.
            logger.warning('Skipping a link without href on [{}]'.format(url))
            continue
        try:
            results.append(extract_data(f'https://www.30book.com{book_url}'))
            logger.info('scraped successfully.')
        except Exception as exc:
            # Best effort: one broken product page must not abort the whole
            # listing, but the cause is recorded. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            logger.warning('Failed to extract data from [{}]: {}'.format(book_url, exc))

    return results

In [20]:
# Set up logging: records go to 30book.log, which is overwritten (filemode='w').
logging.basicConfig(filename='30book.log', filemode='w', format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
all_results = []

# Walk the listing pages and accumulate every scraped book in all_results.
# NOTE(review): range(20) requests pg=0..19 — confirm whether the site numbers
# its pages from 0 or from 1.
for page_num in range(20):
    # The f prefix is required: without it the literal text "{page_num}" is sent
    # and every iteration requests the same URL.
    url = f'https://www.30book.com/category/bts-1/%DA%A9%D8%AA%D8%A7%D8%A8-%D8%B9%D9%85%D9%88%D9%85%DB%8C?cQ=False&st=7&stO=True&pg={page_num}'
    all_results.extend(scrape(url, logger))
    print(f'scraped page:{page_num}')

response = 200 
scraped page:0
response = 200 
scraped page:1
response = 200 
scraped page:2
response = 200 
scraped page:3
response = 200 
scraped page:4
response = 200 
scraped page:5
response = 200 
scraped page:6
response = 200 
scraped page:7
response = 200 
scraped page:8
response = 200 
scraped page:9
response = 200 
scraped page:10
response = 200 
scraped page:11
response = 200 
scraped page:12
response = 200 
scraped page:13
response = 200 
scraped page:14
response = 200 
scraped page:15
response = 200 
scraped page:16
response = 200 
scraped page:17
response = 200 
scraped page:18
response = 200 
scraped page:19


In [21]:
# Export every scraped record to 30book.csv, one row per book under a header row.
CSV_COLUMNS = [
    'title', 'old-price', 'new-price',
    'main-subject', 'sub-subject', 'writer',
    'translator', 'producer', 'seen',
]

# newline='' hands line-ending control to the csv module.
with open('30book.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=CSV_COLUMNS)
    csv_writer.writeheader()
    csv_writer.writerows(all_results)