In [94]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [16]:
# Request the men's jeans listing page, sending a browser-like User-Agent to
# avoid problems with the request.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

In [103]:
# The HTML text obtained through `requests` (with the browser-like headers) is
# handed to BeautifulSoup so the data can be extracted from the markup.
# 'html.parser' selects how BeautifulSoup reads the HTML.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [105]:
# Read the total number of products from the 'data-total' attribute of the
# "load more" heading.
load_more_heading = soup.find('h2', class_='load-more-heading')
if load_more_heading is None:
    raise ValueError("'load-more-heading' element not found in the listing page")
total_item = load_more_heading.get('data-total')

# Number of 36-item pages needed to cover every product. The count must be
# rounded UP: rounding to nearest would drop the last, partially filled page
# (e.g. 73 items -> 2 pages -> only 72 items requested).
page_number = np.ceil(int(total_item) / 36)

In [106]:
# Request the listing again with a page size of page_number * 36 items, then
# re-parse the response.
url01 = url + '?sort=stock&image-size=small&image=model&offset=0&page-size=' + str(int(page_number*36))
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# Bound the wait and fail loudly on an HTTP error rather than parsing an error page.
page = requests.get(url01, headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [107]:
# Narrow the search to the <ul> element that holds the product listing.
products = soup.find(name='ul', class_='products-listing small')

In [108]:
# Collect every <article class="hm-product-item"> inside the listing.
product_list = products.find_all(name='article', class_='hm-product-item')

In [109]:
# Product id: the 'data-articlecode' attribute of each listed article
# (None when an article lacks the attribute).
product_id = [article.get('data-articlecode') for article in product_list]

In [110]:
# Product category: the 'data-category' attribute of each listed article
# (None when an article lacks the attribute).
product_category = [article.get('data-category') for article in product_list]

In [111]:
# Product name: the text of every <a class="link"> found inside the listing.
# Note that `product_list` is rebound here to the anchors, not the articles.
product_list = products.find_all(name='a', class_='link')
product_name = [anchor.get_text() for anchor in product_list]

In [112]:
# Product color

In [113]:
# Product Composition

In [114]:
# Product price: the text of every <span class="price regular"> in the listing.
# Note that `product_list` is rebound here to the price spans.
product_list = products.find_all(name='span', class_='price regular')
product_price = [price_tag.get_text() for price_tag in product_list]

In [115]:
# Assemble the scraped lists into one table: each list enters as a row labelled
# with its column name, and the transpose turns those rows into columns.
data = pd.DataFrame(
    [product_id, product_name, product_category, product_price],
    index=['product_id', 'product_name', 'product_category', 'product_price'],
).T

# Timestamp of the scrape, identical for every row
data['scrapy-datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

## One product

In [196]:
# Get color and composition for one product - product page request

url = 'https://www2.hm.com/en_us/productpage.0985197001.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# Bound the wait and fail loudly on an HTTP error rather than parsing an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# BeautifulSoup Object
soup = BeautifulSoup(page.text, 'html.parser')

#======================= color ====================#
# When a color is selected its link's class changes to include 'active', so the
# color links are gathered with two searches: the active one first, then the rest.
# Each search runs once and feeds both the color names and the color ids.
product_list = (
    soup.find_all('a', class_='filter-option miniature active')
    + soup.find_all('a', class_='filter-option miniature')
)
color_name = [p.get('data-color') for p in product_list]
color_id = [p.get('data-articlecode') for p in product_list]

df_color = pd.DataFrame([color_id, color_name]).T
df_color.columns = ['product_id','color_name']

# generate style id + color id: the last 3 characters of the article code are
# the color id, everything before them is the style id
df_color['style_id'] = df_color['product_id'].str[:-3]
df_color['color_id'] = df_color['product_id'].str[-3:]

#======================= composition ====================#
# Each description item becomes a list of its non-empty text lines; the first
# line is used as the column header below.
product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

# one column per description item, named after its first line
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]

# drop the header row and forward-fill the gaps left by items with fewer lines
# (.ffill() replaces the deprecated fillna(method='ffill'))
df_composition = df_composition.iloc[1:].ffill()

# generate style id
df_composition['style_id'] = df_composition['Art. No.'].str[:-3]

#======================= merge ====================#
data_sku = pd.merge(df_color, df_composition[['style_id','Fit','Composition']], how='left', on='style_id')

## Multi products

In [202]:

# Preview the first five rows of the listing table.
data.head(n=5)

Unnamed: 0,product_id,product_name,product_category,product_price,scrapy-datetime
0,985197001,Slim Jeans,men_jeans_slim,$ 19.99,2021-07-15 12:44:01
1,985159001,Skinny Jeans,men_jeans_skinny,$ 19.99,2021-07-15 12:44:01
2,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2021-07-15 12:44:01
3,427159006,Trashed Skinny Jeans,men_jeans_ripped,$ 39.99,2021-07-15 12:44:01
4,985197002,Slim Jeans,men_jeans_slim,$ 19.99,2021-07-15 12:44:01


In [211]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Composition fields every product is expected to expose. Concatenating with
# the empty pattern frame adds any field a product page lacks as an empty column.
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Product safety', 'Size']
df_pattern = pd.DataFrame(columns=cols)

# Column names found on each product page (with repeats), kept so the distinct
# composition fields across all products can be inspected afterwards.
aux = []

# One details frame per product; they are concatenated once after the loop
# instead of re-copying a growing DataFrame on every iteration.
sku_frames = []

# Iterating over the column values (rather than positions via .loc) does not
# depend on `data` having a default 0..n-1 index.
for article_code in data['product_id']:
    # Get color and composition of one product - product page request
    url = 'https://www2.hm.com/en_us/productpage.' + article_code + '.html'
    # Bound the wait and fail loudly on an HTTP error rather than parsing an error page.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    # BeautifulSoup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    #======================= color ====================#
    # When a color is selected its link's class changes to include 'active', so
    # the color links are gathered with two searches: active first, then the rest.
    product_list = (
        soup.find_all('a', class_='filter-option miniature active')
        + soup.find_all('a', class_='filter-option miniature')
    )
    color_name = [p.get('data-color') for p in product_list]
    color_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame([color_id, color_name]).T
    df_color.columns = ['product_id','color_name']

    # generate style id + color id: the last 3 characters of the article code
    # are the color id, everything before them is the style id
    df_color['style_id'] = df_color['product_id'].str[:-3]
    df_color['color_id'] = df_color['product_id'].str[-3:]

    #======================= composition ====================#
    # Each description item becomes a list of its non-empty text lines; the
    # first line is used as the column header below.
    product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

    # one column per description item, named after its first line
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # drop the header row and forward-fill the gaps left by shorter items
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    df_composition = df_composition.iloc[1:].ffill()

    # generate style id
    df_composition['style_id'] = df_composition['Art. No.'].str[:-3]

    # record the columns this product actually exposed (before padding)
    aux.extend(df_composition.columns.tolist())

    # guarantee the same set of columns for every product. The result must be
    # assigned back to df_composition: the original stored it under a misspelled
    # name, so the padding was silently discarded and the column selection after
    # the loop could raise KeyError.
    df_composition = pd.concat([df_pattern, df_composition], axis=0)

    #======================= merge ====================#
    data_sku = pd.merge(df_color, df_composition, how='left', on='style_id')

    # all products details
    sku_frames.append(data_sku)

# Single concatenation of every product's details (empty frame when no product was scraped)
data_details = pd.concat(sku_frames, axis=0) if sku_frames else pd.DataFrame()

# join data showroom + data details
# NOTE(review): the join key is style_id only, so each listing row is paired
# with every detail row of its style (all colors) - confirm this is intended.
data['style_id'] = data['product_id'].str[:-3]
data_raw = pd.merge(data, data_details[['style_id','color_id','color_name','Fit','Composition','Product safety','Size','More sustainable materials']], how='left', on='style_id')