In [714]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import ast

In [715]:
# Display the column labels of Sites.json (presumably one column per scraped
# site - TODO confirm against the file).
pd.read_json('../Classifier/Links/Sites.json').columns

Index(['Gregory', 'Damyller', 'Dassi', 'Excluiva', 'CutiCutiBaby', 'BluK',
       'Villevie', 'Achados96', 'DWZ', 'BellaSeda'],
      dtype='object')

In [727]:
def wrapper_gregory(c):
    """Extract product fields from a saved Gregory product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. Every entry is a
        string, or ``None`` when it could not be found on the page.
        ``sizes`` is a comma-separated list; ``colors`` is the text between
        the first and second ``-`` of the title.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".fn")
    price = soup.select(".skuBestPrice")
    description = soup.select(".productDescription")

    # The size list is read from the first text node mentioning "Tamanho",
    # where it appears inside a JSON-like "dimensionsMap" object.
    sku_text = soup.find(text=re.compile(r"Tamanho"))
    if sku_text is None:
        size_color = []
    else:
        size_color = re.findall(r'"dimensionsMap":{.*"Tamanho":(\[".*"+\])}', sku_text)

    title = title[0].get_text() if title else None
    price = price[0].get_text() if price else None
    description = description[0].get_text() if description else None

    sizes = None
    if size_color:
        try:
            sizes = ",".join(str(size) for size in ast.literal_eval(size_color[0]))
        except (ValueError, SyntaxError, TypeError):
            # The greedy pattern can capture more than a single list literal;
            # treat an unparsable capture as "sizes not found".
            sizes = None

    colors = None
    if title is not None:
        title_parts = title.split("-")
        # A title without a hyphen carries no colour segment.
        if len(title_parts) > 1:
            colors = title_parts[1]

    return title, price, colors, sizes, description

In [717]:
def wrapper_damyller(c):
    """Extract product fields from a saved Damyller product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. Every entry is a
        string, or ``None`` when it could not be found on the page.
        ``colors`` and ``sizes`` are comma-separated lists.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".fn")
    price = soup.select(".skuBestPrice")
    description = soup.select(".productDescription")

    # Colours and sizes are read from the first text node mentioning
    # "Tamanho", where they appear inside a JSON-like "dimensionsMap" object.
    sku_text = soup.find(text=re.compile(r"Tamanho"))
    if sku_text is None:
        size_color = []
    else:
        size_color = re.findall(
            r'"dimensionsMap":{.*"Cor":(\[.*\]),.*"Tamanho":(\[".*"+\])}', sku_text)

    def _joined(list_literal):
        """Turn a captured list literal into a comma-separated string, or None."""
        try:
            return ",".join(str(item) for item in ast.literal_eval(list_literal))
        except (ValueError, SyntaxError, TypeError):
            # The capture is not a valid Python/JSON list literal.
            return None

    title = title[0].get_text() if title else None
    price = price[0].get_text() if price else None
    description = description[0].get_text() if description else None

    # Each match is a (colour list, size list) pair; only the first is used.
    colors = _joined(size_color[0][0]) if size_color else None
    sizes = _joined(size_color[0][1]) if size_color else None

    return title, price, colors, sizes, description

In [718]:
def wrapper_dassi(c):
    """Extract product fields from a saved Dassi product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. Every entry is a
        string, or ``None`` when nothing matching was found on the page.
        ``colors`` and ``sizes`` are comma-separated lists; a size list that
        is exactly ``"tamanho único"`` is reported as ``"unico"``.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".prod-title")
    price = soup.select(".product_price")
    # The slice drops the first <option> and the last two (presumably
    # placeholder / non-size entries - TODO confirm against a live page).
    size_options = soup.select("select.product_option.form-control > option")[1:-2]
    color_labels = soup.select("button.sub.prod-variant-btn > span")
    description = soup.select(".MsoNormal > span")

    if size_options:
        sizes = ",".join(str(option.get_text()) for option in size_options)
        if sizes == "tamanho único":
            sizes = "unico"
    else:
        # Report "not found" as None, like the other fields, rather than "".
        sizes = None

    if color_labels:
        colors = ",".join(str(label.get_text().strip()) for label in color_labels)
    else:
        colors = None

    title = title[0].get_text().strip() if title else None
    price = price[0].get_text() if price else None
    # The description text is taken from the second matching span.
    description = description[1].get_text().strip() if len(description) > 1 else None

    return title, price, colors, sizes, description

In [719]:
def wrapper_lojasexclusiva(c):
    """Extract product fields from a saved Lojas Exclusiva product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. ``colors`` is always
        ``"unico"``; the other entries are strings, or ``None`` when nothing
        matching was found on the page. ``sizes`` is a comma-separated list.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".prod-title")
    price = soup.select(".product_price")
    size_buttons = soup.select(".prod-variants .btn")
    line_breaks = soup.select(".prod-excerpt > p > br")

    title = title[0].get_text().strip() if title else None
    price = price[0].get_text() if price else None
    # Colours are not scraped for this site (a ".s-color" selector was left
    # disabled in an earlier draft).
    colors = "unico"

    if size_buttons:
        sizes = ",".join(str(button.get_text()) for button in size_buttons)
    else:
        # Report "not found" as None, like the other fields, rather than "".
        sizes = None

    if line_breaks:
        # <br> is a void element and never holds text itself, so the text is
        # read from the paragraph that contains the first matched <br>.
        description = line_breaks[0].parent.get_text(" ", strip=True)
    else:
        description = None

    return title, price, colors, sizes, description

In [720]:
def wrapper_cuticutibaby(c):
    """Extract product fields from a saved CutiCutiBaby product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. ``colors`` is always
        ``"unico"``; the other entries are stripped strings, or ``None`` when
        nothing matching was found. ``sizes`` is a comma-separated list.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".nome-produto.titulo")
    price = soup.select(".preco-promocional")
    size_items = soup.select(".atributo-item")
    description = soup.select("#descricao > p")

    title = title[0].get_text().strip() if title else None
    price = price[0].get_text().strip() if price else None
    description = description[0].get_text().strip() if description else None
    colors = "unico"

    if size_items:
        sizes = ",".join(str(item.get_text().strip()) for item in size_items)
    else:
        # Report "not found" as None, like the other fields, rather than "".
        sizes = None

    return title, price, colors, sizes, description

In [721]:
def wrapper_bluk(c):
    """Extract product fields from a saved BluK product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. Every entry is a
        string, or ``None`` when nothing matching was found on the page.
        ``colors`` and ``sizes`` are comma-separated lists of stripped texts.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".fn")
    price = soup.select(".skuBestPrice")
    description = soup.select(".productDescription")
    size_fields = soup.select(".value-field.Tamanho")
    color_fields = soup.select(".value-field.Cores")

    title = title[0].get_text() if title else None
    price = price[0].get_text() if price else None
    description = description[0].get_text() if description else None

    # Report "not found" as None, like the other fields, rather than "".
    if size_fields:
        sizes = ",".join(str(field.get_text().strip()) for field in size_fields)
    else:
        sizes = None

    if color_fields:
        colors = ",".join(str(field.get_text().strip()) for field in color_fields)
    else:
        colors = None

    return title, price, colors, sizes, description

In [722]:
def wrapper_villevie(c):
    """Extract product fields from a saved Villevie product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. ``colors`` is always
        ``"unico"``; the other entries are strings, or ``None`` when nothing
        matching was found. ``sizes`` is a comma-separated list.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".product-name > h1")
    price = soup.select(".price")
    size_labels = soup.select(".swatch-label")
    description = soup.select(".short-description > div")

    title = title[0].get_text() if title else None
    # Several ".price" elements may match; the last one is the one reported.
    price = price[-1].get_text() if price else None
    description = description[0].get_text() if description else None
    colors = "unico"

    if size_labels:
        sizes = ",".join(str(label.get_text().strip()) for label in size_labels)
    else:
        sizes = None

    return title, price, colors, sizes, description

In [723]:
def wrapper_achados96(c):
    """Extract product fields from a saved Achados96 product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. ``colors`` is always
        ``"unico"``; the other entries are strings, or ``None`` when nothing
        matching was found. ``sizes`` is a comma-separated list.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".product-name")
    price = soup.select("#price_display")
    size_buttons = soup.select(".btn-variant-content")
    description = soup.select(".description > p")

    title = title[0].get_text().strip() if title else None
    price = price[0].get_text().strip() if price else None
    description = description[0].get_text() if description else None
    colors = "unico"

    if size_buttons:
        sizes = ",".join(str(button.get_text().strip()) for button in size_buttons)
    else:
        sizes = None

    return title, price, colors, sizes, description

In [724]:
def wrapper_dwz(c):
    """Placeholder wrapper for DWZ product pages: no field is extracted yet.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``, currently always
        five ``None`` values.
    """
    # The page is still opened and parsed so that an unreadable file fails
    # here exactly as before; the handle is now closed right afterwards.
    with open(c) as page:
        BeautifulSoup(page, 'html.parser')

    # TODO: implement extraction. Selectors left disabled in an earlier draft
    # (unverified): title ".col.col-12.name", price "#price_display",
    # sizes ".swatch-label", description ".wd-descriptions-text",
    # colors fixed to "unico".
    return None, None, None, None, None

In [725]:
def wrapper_bellaseda(c):
    """Extract product fields from a saved BellaSeda product page.

    Parameters
    ----------
    c : str or path-like
        Path of the HTML file to parse.

    Returns
    -------
    tuple
        ``(title, price, colors, sizes, description)``. ``colors`` is always
        ``"unico"`` and ``sizes`` is always ``None``; the other entries are
        stripped strings, or ``None`` when nothing matching was found.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(c) as page:
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.select(".product-name")
    price = soup.select("#detail_finalPrice")
    description = soup.select("#tab_info")

    title = title[0].get_text().strip() if title else None
    price = price[0].get_text().strip() if price else None
    description = description[0].get_text().strip() if description else None
    colors = "unico"
    # Sizes are not scraped for this site (a ".swatch-label" selector was left
    # disabled in an earlier draft).
    sizes = None

    return title, price, colors, sizes, description

In [728]:
# Maps the site label found in a page URL's host name to the wrapper that
# parses pages of that site.
funcs = {
    "gregory": wrapper_gregory,
    "damyller": wrapper_damyller,
    "boutiquedassi": wrapper_dassi,
    "lojasexclusiva": wrapper_lojasexclusiva,
    "cuticutibaby": wrapper_cuticutibaby,
    "bluk": wrapper_bluk,
    "lojavillevie": wrapper_villevie,
    "achados96": wrapper_achados96,
    "dwz": wrapper_dwz,
    "bellaseda": wrapper_bellaseda
}

df = pd.read_csv("../Results/positive_docs.csv").drop(["Unnamed: 0"], axis=1)
# Raw string: same pattern as before, without the invalid "\." escapes.
url_re = r'((www\.)?[a-zA-Z0-9]+\.[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)*)'


def _wrapper_for(url):
    """Return the wrapper registered in ``funcs`` for the host of ``url``.

    The first host-like match of ``url_re`` is split on dots and the first
    label that is a key of ``funcs`` selects the wrapper, so the lookup works
    with or without a leading ``www.``.

    Raises
    ------
    KeyError
        If ``url`` contains no host-like text or none of its labels is
        registered in ``funcs``.
    """
    hosts = re.findall(url_re, url)
    if hosts:
        for label in hosts[0][0].split("."):
            if label in funcs:
                return funcs[label]
    raise KeyError(f"no wrapper registered for URL: {url!r}")


# Each row of df is unpacked as (page URL, saved HTML file name).
info_list = []
for url, html in df.values:
    info = _wrapper_for(url)("../Results/pages/" + html)
    info_list.append(info + (url, html))

pd.DataFrame(
    info_list,
    columns=["title", "price", "colors", "sizes", "description", "url", "html"],
).to_csv(path_or_buf="../Results/wrapper_docs.csv")