In [4]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
import re

In [47]:
class DevirScrapper():
    """Crawl the Devir Chile board-game category and collect product data.

    Starting at ``base_url`` the scraper walks the paginated category listing,
    opens every product link found on each listing page and stores one dict per
    product (keys ``image``, ``title``, ``price``, ``description``, ``url``) in
    ``self.board_game_list``.
    """

    # Inline formatting tags blanked out of a product page before text is read.
    _INLINE_TAGS = ('strong', 'b', 'i', 'em')
    # A description paragraph containing any of these tags is skipped.
    _SKIPPED_PARAGRAPH_TAGS = ['span', 'a', 'iframe']

    def __init__(self,base_url='https://devir.cl/categoria/juegos-de-mesa/'):
        """Store the category URL to start from and an empty result list."""
        self.base_url = base_url
        self.board_game_list = list()

    @staticmethod
    def _fetch_soup(page_url):
        """Download ``page_url`` and return it parsed with ``html.parser``.

        The HTTP response is used as a context manager so the connection is
        closed as soon as parsing is done instead of being left open.
        """
        with urllib.request.urlopen("{}".format(page_url)) as response:
            return BeautifulSoup(response,'html.parser')

    def get_next_page_url(self,page_url='https://devir.cl/categoria/juegos-de-mesa/'):
        """Return the ``href`` of the "next page" link on a listing page.

        Returns ``None`` when the page has no such link (last page) or when the
        page could not be fetched; the latter case also prints the error.
        """
        try:
            soup = self._fetch_soup(page_url)
            next_link = soup.find(
                "a",
                {"class":"next page-numbers"}
            )
            # No pagination link means this is the last page.
            if next_link is None:
                return None
            return next_link.get('href')
        except Exception as error:
            print(
                "Error al obtener la siguiente página: {}".format(
                    repr(error)
                )
            )
            return None

    def get_products_url(self,page_url):
        """Return the unique product URLs linked from a listing page.

        Every element whose ``href`` contains ``/producto/`` is considered a
        product link. Duplicates are removed while keeping the order in which
        links appear on the page. Returns ``None`` (after printing the error)
        when the page cannot be fetched or parsed.
        """
        try:
            soup = self._fetch_soup(page_url)
            links = [
                link.get('href')
                for link in soup.find_all(href=re.compile("/producto/"))
            ]
            # dict.fromkeys de-duplicates but, unlike set(), keeps page order.
            return list(dict.fromkeys(links))
        except Exception as error:
            print(
                "Error en get_products_url: {}".format(
                    repr(error)
                )
            )
            return None

    def get_product_info(self,page_url):
        """Scrape one product page and append its data to ``board_game_list``.

        On any failure (network error, missing element) the product is skipped
        and the URL plus the error are printed; nothing is raised.
        """
        try:
            soup = self._fetch_soup(page_url)

            # Blank out inline formatting tags. They are replaced by an empty
            # string (not removed) so the positions in ``.contents`` that the
            # lookups below rely on stay the same.
            for tag_name in self._INLINE_TAGS:
                for tag in soup.find_all(tag_name):
                    tag.replace_with('')

            product_dict = dict()

            # Image: first <img> inside the product slider.
            image_div = soup.find('div',{'class':'product-image-slider owl-carousel show-nav-hover has-ccols ccols-1'})
            product_dict['image'] = image_div.img.get('src')

            # Title and price. str() turns the NavigableString into a plain
            # string so the stored value does not keep the parse tree alive.
            title_tag = soup.find('h2',{'class':'product_title entry-title show-product-nav'})
            product_dict['title'] = str(title_tag.contents[0])

            # contents[1] is the node right after the first child of the
            # price span.
            price_tag = soup.find('span',{'class':'woocommerce-Price-amount amount'})
            product_dict['price'] = str(price_tag.contents[1])

            # Description: first text node of every plain paragraph.
            description_div = soup.find(
                'div',
                {'class':'description woocommerce-product-details__short-description'}
            )

            if description_div is not None:
                description_p = [
                    str(paragraph.contents[0])
                    for paragraph in description_div.find_all('p')
                    if paragraph.find(self._SKIPPED_PARAGRAPH_TAGS) is None
                    # Skip empty paragraphs and ones starting with a tag
                    # (e.g. <br>); they used to abort the whole product.
                    and paragraph.contents
                    and isinstance(paragraph.contents[0], str)
                ]
                product_dict['description'] = '\n '.join(description_p)
            else:
                product_dict['description'] = 'Descripción no proporcionada.'
            product_dict['url'] = page_url
            self.board_game_list.append(product_dict)

        except Exception as error:
            print(page_url)
            print(error)

    def recursive_scrapper(self,page_url):
        """Scrape ``page_url`` and every following listing page.

        Implemented as a loop rather than recursion, so the number of pages is
        not bounded by the interpreter's recursion limit. A listing page whose
        product links cannot be read is skipped instead of aborting the crawl,
        and a page that was already visited ends the crawl so a circular
        "next" link cannot loop forever. Returns ``None``.
        """
        visited = set()
        while page_url and page_url not in visited:
            visited.add(page_url)
            # get_products_url returns None on error; treat that as no products.
            for product_url in self.get_products_url(page_url) or []:
                self.get_product_info(product_url)
            page_url = self.get_next_page_url(page_url)
            if page_url:
                print("Next Page: {}".format(page_url))

    def get_contents(self):
        """Run the full crawl from ``base_url`` and return it as a DataFrame.

        Returns ``None`` (after printing the error) if the crawl raises.
        Products are accumulated in ``self.board_game_list``, so rows scraped
        before an interruption remain available there, and calling this method
        again on the same instance appends to the earlier results.
        """
        try:
            self.recursive_scrapper(self.base_url)
            df = pd.DataFrame(self.board_game_list)
            return df
        except Exception as error:
            print(
                "Error en get_context: {}".format(
                    repr(error)
                )
            )
            return None
            

In [48]:
# Build the Devir scraper; with no argument it starts from the default
# category URL declared in DevirScrapper.__init__.
devir_scrapper = DevirScrapper()

In [49]:
# Crawl every listing page and open every product page found on it, so this
# cell performs many HTTP requests (the recorded run below ended in a
# KeyboardInterrupt). The result is a DataFrame, or None if the crawl failed.
df_devir = devir_scrapper.get_contents()

Next Page: https://devir.cl/categoria/juegos-de-mesa/page/2/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/3/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/4/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/5/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/6/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/7/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/8/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/9/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/10/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/11/
Next Page: https://devir.cl/categoria/juegos-de-mesa/page/12/


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the scraped Devir data.
# NOTE(review): this needs the crawl cell above to have completed; df_devir is
# None when get_contents() hit an error — confirm before relying on this cell.
df_devir.head()

In [None]:
class SkyshipScrapper:
    """Crawl the Skyship board-game category and collect product data.

    Starting at ``base_url`` the scraper walks the paginated category listing,
    opens every product link found on each listing page and stores one dict per
    product (keys ``image``, ``title``, ``price``, ``description``, ``url``) in
    ``self.board_game_list``.
    """

    # Inline formatting tags blanked out of a product page before text is read.
    _INLINE_TAGS = ('strong', 'b', 'i', 'em')
    # A description paragraph containing any of these tags is skipped.
    _SKIPPED_PARAGRAPH_TAGS = ['span', 'a', 'iframe']

    def __init__(self,base_url='https://www.skyship.cl/categoria-producto/juegos-de-tablero/'):
        """Store the category URL to start from and an empty result list."""
        self.base_url = base_url
        self.board_game_list = list()

    @staticmethod
    def _fetch_soup(page_url):
        """Download ``page_url`` and return it parsed with ``html.parser``.

        The HTTP response is used as a context manager so the connection is
        closed as soon as parsing is done instead of being left open.
        """
        with urllib.request.urlopen("{}".format(page_url)) as response:
            return BeautifulSoup(response,'html.parser')

    def get_next_page_url(self,page_url='https://www.skyship.cl/categoria-producto/juegos-de-tablero/'):
        """Return the ``href`` of the "next page" link on a listing page.

        Returns ``None`` when the page has no such link (last page) or when the
        page could not be fetched; the latter case also prints the error.
        """
        try:
            soup = self._fetch_soup(page_url)
            next_link = soup.find(
                "a",
                {"class":"next page-numbers"}
            )
            # No pagination link means this is the last page.
            if next_link is None:
                return None
            return next_link.get('href')
        except Exception as error:
            print(
                "Error al obtener la siguiente página: {}".format(
                    repr(error)
                )
            )
            return None

    def get_products_url(self,page_url):
        """Return the unique product URLs linked from a listing page.

        Every element whose ``href`` contains ``/producto/`` is considered a
        product link. Duplicates are removed while keeping the order in which
        links appear on the page. Returns ``None`` (after printing the error)
        when the page cannot be fetched or parsed.
        """
        try:
            soup = self._fetch_soup(page_url)
            links = [
                link.get('href')
                for link in soup.find_all(href=re.compile("/producto/"))
            ]
            # dict.fromkeys de-duplicates but, unlike set(), keeps page order.
            return list(dict.fromkeys(links))
        except Exception as error:
            print(
                "Error en get_products_url: {}".format(
                    repr(error)
                )
            )
            return None

    def get_product_info(self,page_url):
        """Scrape one product page and append its data to ``board_game_list``.

        On any failure (network error, missing element) the product is skipped
        and the URL plus the error are printed; nothing is raised.
        """
        try:
            soup = self._fetch_soup(page_url)

            # Blank out inline formatting tags. They are replaced by an empty
            # string (not removed) so the positions in ``.contents`` that the
            # lookups below rely on stay the same.
            for tag_name in self._INLINE_TAGS:
                for tag in soup.find_all(tag_name):
                    tag.replace_with('')

            product_dict = dict()

            # Image: the full-size URL is read from the ``data-large_image``
            # attribute of the first child of the gallery link.
            image_div = soup.find('div',{'class':'woocommerce-product-gallery__image'})
            product_dict['image'] = image_div.a.contents[0].get('data-large_image')

            # Title and price. str() turns the NavigableString into a plain
            # string so the stored value does not keep the parse tree alive.
            summary_div = soup.find('div',{'class':'summary entry-summary'})
            product_dict['title'] = str(summary_div.h1.contents[0])
            # The price is read from the second <p class="pvrp"> of the summary.
            product_dict['price'] = str(
                summary_div.find_all('p',{'class':'pvrp'})[1].contents[0]
            )

            # Description: first text node of every plain paragraph.
            description_div = soup.find(
                'div',
                {'id':'tab-description'}
            )

            if description_div is not None:
                description_p = [
                    str(paragraph.contents[0])
                    for paragraph in description_div.find_all('p')
                    if paragraph.find(self._SKIPPED_PARAGRAPH_TAGS) is None
                    # Skip empty paragraphs and ones starting with a tag
                    # (e.g. <br>); they used to abort the whole product.
                    and paragraph.contents
                    and isinstance(paragraph.contents[0], str)
                ]
                product_dict['description'] = '\n '.join(description_p)
            else:
                product_dict['description'] = 'Descripción no proporcionada.'
            product_dict['url'] = page_url

            self.board_game_list.append(product_dict)
        except Exception as error:
            print(page_url)
            print(error)

    def recursive_scrapper(self,page_url):
        """Scrape ``page_url`` and every following listing page.

        Implemented as a loop rather than recursion, so the number of pages is
        not bounded by the interpreter's recursion limit. A listing page whose
        product links cannot be read is skipped instead of aborting the crawl,
        and a page that was already visited ends the crawl so a circular
        "next" link cannot loop forever. Returns ``None``.
        """
        visited = set()
        while page_url and page_url not in visited:
            visited.add(page_url)
            # get_products_url returns None on error; treat that as no products.
            for product_url in self.get_products_url(page_url) or []:
                self.get_product_info(product_url)
            page_url = self.get_next_page_url(page_url)
            if page_url:
                print("Next Page: {}".format(page_url))

    def get_contents(self):
        """Run the full crawl from ``base_url`` and return it as a DataFrame.

        Returns ``None`` (after printing the error) if the crawl raises.
        Products are accumulated in ``self.board_game_list``, so rows scraped
        before an interruption remain available there, and calling this method
        again on the same instance appends to the earlier results.
        """
        try:
            self.recursive_scrapper(self.base_url)
            df = pd.DataFrame(self.board_game_list)
            return df
        except Exception as error:
            print(
                "Error en get_context: {}".format(
                    repr(error)
                )
            )
            return None
        
        

In [291]:
# Build the Skyship scraper; with no argument it starts from the default
# category URL declared in SkyshipScrapper.__init__.
skyship_scrapper = SkyshipScrapper()

In [292]:
# Crawl every listing page and open every product page found on it, so this
# cell performs many HTTP requests. The result is a DataFrame, or None if the
# crawl failed.
df_skyship = skyship_scrapper.get_contents()

Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/2/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/3/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/4/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/5/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/6/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/7/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/8/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/9/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/10/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/11/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/12/
Next Page: https://www.skyship.cl/categoria-producto/juegos-de-tablero/page/13/
Next Page: https://www.skyship.cl/categoria-prod

In [293]:
# Preview the first rows of the scraped Skyship data.
# NOTE(review): df_skyship is None when get_contents() hit an error — confirm
# the crawl cell above completed before relying on this cell.
df_skyship.head()

Unnamed: 0,description,image,price,title,url
0,"Ha llegado la hora de la verdad, ya que en un...",https://www.skyship.cl/wp-content/uploads/2017...,Valor: $ 24.990 *,13 Días: La crisis de los Misiles en Cuba,https://www.skyship.cl/producto/13-dias-la-cri...
1,"Londres, 1899: La ciudad es sacudida por atroc...",https://www.skyship.cl/wp-content/uploads/2018...,Valor: $21.990*,13 Pistas,https://www.skyship.cl/producto/13-pistas/
2,Todo el mundo tiene secretos.\n ¡Vuestro desaf...,https://www.skyship.cl/wp-content/uploads/2018...,Valor: $12.990 *,3 Secretos,https://www.skyship.cl/producto/3-secretos/
3,"En “4 Gods” serás un dios, encargado de crear ...",https://www.skyship.cl/wp-content/uploads/2017...,Valor: $ 35.990 *,4 Gods,https://www.skyship.cl/producto/4-gods-2/
4,Gobierna una de las siete grandes ciudades del...,https://www.skyship.cl/wp-content/uploads/2016...,Valor: $ 37.990 *,7 Wonders,https://www.skyship.cl/producto/7-wonders/
