In [1]:
import numpy as np
import datetime
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
import re

In [116]:
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

USER_DATA = {
    'user': 'postgres',
    'password': 'admin'
}

BD_DATA = {
    'host': 'localhost',
    'database': 'tablegames',
    'port': '5432'
}

def get_psycopg2_connection(user_data, db_data):
    """
    Entrega una conexión a la base de datos presente en db_data,
    considerando el usuario en user_data

    :param user_data: Diccionario con los datos de conexión del usuario
    :param db_data: Diccionario con los datos de conexión de la base de datos
    :return: psycopg2 connection
    """
    try:
        conn = psycopg2.connect(user=user_data['user'],
                                password=user_data['password'],
                                host=db_data['host'],
                                port=db_data['port'],
                                database=db_data['database'])
        return conn
    except Exception as error:
        print("Error al conectarse a la BD \n {}".format(error))
        return None
    
def do_sql_upsert(user_data, db_data, upsert_query):
    """

    :param user_data: Diccionario con los datos de conexión del usuario
    :param db_data: Diccionario con los datos de conexión de la bd
    :param upsert_query: Upsert Query
    :return:
    """
    try:
        conn = get_psycopg2_connection(user_data, db_data)
        cur = conn.cursor()
        cur.execute(upsert_query)
        conn.commit()
        return True
    except (Exception, psycopg2.Error) as error:
        print(
            "Error en do_sql_upsert: {}".format(
                repr(error)
            )
        )
        return None
    finally:
        cur.close()
        conn.close()

def get_sql_alchemy_engine(user_data, db_data):
    """
        Retorna una engine de SqlAlchemy

    :param user_data: Diccionario con los datos del usuario
    :param db_data: Diccionario con los datos de la base de datos
    :return: Sqlalchemy Engine
    """
    try:
        engine = create_engine(
            URL(
                'postgresql+psycopg2',
                username=user_data['user'],
                password=user_data['password'],
                host=db_data['host'],
                port=db_data['port'],
                database=db_data['database']
            )
        )
        return engine
    except Exception as error:
        print("Error al generar un engine: {}".format(repr(error)))
        return None

In [163]:
GAMES_UPSERT = '''

INSERT INTO public.games(
        permalink,
        title,
        price,
        image,
        url,
        description)
    VALUES {} 
    ON CONFLICT ON CONSTRAINT games_pkey 
        DO UPDATE 
        SET price = EXCLUDED.price,
            url = EXCLUDED.url,
            description = EXCLUDED.description
'''

PRICES_INSERT = '''
INSERT INTO public.price(
        permalink,
        date,
        price)
        VALUES {}
'''

In [164]:
class DataLoader:
    def __init__(self,df):
        self.df = df
        
    def upsert_games(self,upsert_query):
        try:
            df_aux = self.df[['permalink','title','price','image','url','description']]
            values = [
                # reemplazar " por ' para casos con ' en el email
                "{}".format(value).replace(r'"', r"'")
                if value else 'NULL'
                for value in df_aux.itertuples(
                    index=False,
                    name=None)
            ]
            if do_sql_upsert(
                USER_DATA,
                BD_DATA,
                upsert_query.format(
                    ','.join(values).replace('None', 'NULL')
                )  # Es necesario reemplazar None por NULL en la consulta
            ):
                print(
                    "Juegos insertados y/o actualizados")
            else:
                raise Exception('Error en do_sql_upsert')
        except Exception as error:
            print(
                "Error en upload_data: {}".format(
                    repr(error)
                )
            )
            
    def insert_price(self,insert_query):
        try:
            self.df.date = self.df.date.astype(str)
            df_aux = self.df[['permalink','date','price']]
            
            values = [
                # reemplazar " por ' para casos con ' en el email
                "{}".format(value).replace(r'"', r"'")
                if value else 'NULL'
                for value in df_aux.itertuples(
                    index=False,
                    name=None)
            ]
            
            if do_sql_upsert(
                USER_DATA,
                BD_DATA,
                insert_query.format(
                    ','.join(values).replace('None', 'NULL')
                )):
                print(
                    "Precios insertados")
            else:
                raise Exception('Error en do_sql_upsert')
        except Exception as error:
            print('Error en insert_price')

In [165]:
devir_dataloader = DataLoader(df_devir_clean)
devir_dataloader.upsert_games(GAMES_UPSERT)
devir_dataloader.insert_price(PRICES_INSERT)

Juegos insertados y/o actualizados
Precios insertados


In [73]:
class SkyshipScrapper:
    def __init__(self,base_url='https://tienda.skyship.cl/22-juegos-de-tablero'):
        self.base_url = base_url
        self.board_game_list = list()
    
    def get_next_page_url(self,page_url='https://tienda.skyship.cl/22-juegos-de-tablero'):
        try:
            if page_url == self.base_url:
                url = urllib.request.urlopen(
                    "{}".format(
                        self.base_url
                    )
                )
            else:
                url = urllib.request.urlopen(
                    "{}".format(page_url)
                )
            soup = BeautifulSoup(url,'html.parser')
            next_url = soup.find(
                "a",
                {"class":"next page-numbers"}
            ).get('href')
            return next_url
        except AttributeError as error:
            print('hola')
            return None
        except Exception as error:
            print(
                "Error al obtener la siguiente página: {}".format(
                    repr(error)
                )
            )
            return None
    
    def get_products_url(self,page_url):
        try:
            url = urllib.request.urlopen(
                "{}".format(page_url)
            )
            soup = BeautifulSoup(url,'html.parser')
            products_url = [url.get('href') for url in soup.find_all(href=re.compile("/producto/"))]
            return list(set(products_url))
        except Exception as error:
            print(
                "Error en get_products_url: {}".format(
                    repr(error)
                )
            )
            return None
    
    def get_product_info(self,page_url):
        try:
            url = urllib.request.urlopen(
                '{}'.format(
                    page_url
                )
            )
            soup = BeautifulSoup(url,'html.parser')
            
            #delete strong tags
            for tag in soup.find_all('strong'):
                tag.replaceWith('')
            for tag in soup.find_all('b'):
                tag.replaceWith('')
            for tag in soup.find_all('i'):
                tag.replaceWith('')
            for tag in soup.find_all('em'):
                tag.replaceWith('')
                
            product_dict = dict()
            
            #get image
            image_div = soup.find('div',{'class':'woocommerce-product-gallery__image'})
            product_dict['image'] = image_div.a.contents[0].get('data-large_image')
            
            #get title and price
            summary_div = soup.find('div',{'class':'summary entry-summary'})
            product_dict['title'] = summary_div.h1.contents[0]
            product_dict['price'] = summary_div.findAll('p',{'class':'pvrp'})[1].contents[0]
            
            #get description
            description_div = soup.find(
                'div',
                {'id':'tab-description'}
            )
            
            if description_div != None:
                description_p = [
                    description.contents[0] 
                    for description in description_div.findAll('p') 
                    if (not description.find('span')) and (not description.find('a')) and (not description.find('iframe'))
                ] 


                product_dict['description'] = '\n '.join(description_p)
            else:
                product_dict['description'] = 'Descripción no proporcionada.'
            product_dict['url'] = page_url
            
            self.board_game_list.append(product_dict)
        except Exception as error:
            print(page_url)
            print(error)
            
    def recursive_scrapper(self,page_url):
        urls = self.get_products_url(page_url)
        for product_url in urls:
            self.get_product_info(product_url)
        next_url = self.get_next_page_url(page_url)
        if next_url:
            print("Next Page: {}".format(next_url))
            return self.recursive_scrapper(next_url)
        else:
            return
        
    def get_contents(self):
        try:
            self.recursive_scrapper(self.base_url)
            df = pd.DataFrame(self.board_game_list)
            df['date'] = datetime.datetime.today()
            return df
        except Exception as error:
            print(
                "Error en get_context: {}".format(
                    repr(error)
                )
            )
            return None
        
        

In [74]:
skyship_scrapper = SkyshipScrapper()

In [75]:
df_skyship = skyship_scrapper.get_contents()

hola


In [76]:
df_skyship.head()

Unnamed: 0,date
