In [1]:
import pandas as pd
import numpy as np
import requests
import re

from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Realizando a request do site, utilizando um agent simulando um browser para evitar problemas com a requisição
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get( url, headers=headers )

In [3]:
# Basicamente, você utiliza o request para requisitar os dados do url simulando um browser
## E o instância o texto para o BeautifulSoup, para realizar a extração dos dados HTML
### O parser é a forma que o BeautifulSoup vai ler os dados do HTML
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
total_item = soup.find('h2', class_='load-more-heading')
total_item = total_item.get('data-total')

page_number = np.round(int(total_item)/36)

In [5]:
url01 = url + '?sort=stock&image-size=small&image=model&offset=0&page-size=' + str(int(page_number*36))
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get( url01, headers=headers )
soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
# Indexar a busca na lista de produtos
products = soup.find('ul', class_='products-listing small')

In [7]:
product_list = products.find_all('article', class_='hm-product-item')

In [9]:
# Id
product_id = [p.get('data-articlecode') for p in product_list]

In [11]:
len(product_id)

72

In [9]:
# Product Category
product_category = [p.get('data-category') for p in product_list]

In [10]:
# Product Name
product_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in product_list]

In [11]:
# Product Price
product_list = products.find_all('span',class_='price regular')
product_price = [p.get_text() for p in product_list]

In [12]:
# Create a DataFrame
data = pd.DataFrame([product_id, product_name, product_category,product_price]).T
data.columns = ['product_id', 'product_name', 'product_category','product_price']

# Scrapy date-time
data['scrapy-datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

## One product scrapy

In [None]:
# Get color and composition one product - API request

url = 'https://www2.hm.com/en_us/productpage.0985197001.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get( url, headers=headers )

# BeautifulSoup Object
soup = BeautifulSoup(page.text, 'html.parser')

#=======================color name====================#
# A primeira cor, quando esta selecionada a classe altera para active, neste sentido, criei 2 modos e coleta
product_list = soup.find_all('a', class_='filter-option miniature active')
color_name = [p.get('data-color') for p in product_list]

product_list = soup.find_all('a', class_='filter-option miniature')
color_name2 = [p.get('data-color') for p in product_list]

color_name = color_name + color_name2

# color id
product_list = soup.find_all('a', class_='filter-option miniature active')
color_id1 = [p.get('data-articlecode') for p in product_list]

product_list = soup.find_all('a', class_='filter-option miniature')
color_id2 = [p.get('data-articlecode') for p in product_list]

color_id = color_id1 + color_id2

df_color = pd.DataFrame([color_id, color_name]).T
df_color.columns = ['product_id','color_name']

# generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

#======================= composition ====================#
product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

# rename dataframe
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]

# drop first row
df_composition = df_composition.iloc[1:].fillna(method='ffill')
df_composition

# generate style id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition

#======================= merge ====================#
data_sku = pd.merge(df_color, df_composition[['style_id','Fit','Composition']], how='left', on='style_id')

## Multi products scrapy

In [24]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
data_details = pd.DataFrame()

#unique columns for all products composition
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']

df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):
    # Get color and composition one product - API request
    url = 'https://www2.hm.com/en_us/productpage.'+ data.loc[i, 'product_id'] +'.html'
    page = requests.get( url, headers=headers )

    # BeautifulSoup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    #=======================color name====================#
    # A primeira cor, quando esta selecionada a classe altera para active, neste sentido, criei 2 modos e coleta
    product_list = soup.find_all('a', class_=['filter-option miniature active','filter-option miniature'])
    color_name = [p.get('data-color') for p in product_list]

    # color id
    product_list = soup.find_all('a', class_=['filter-option miniature active','filter-option miniature'])
    color_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame([color_id, color_name]).T
    df_color.columns = ['product_id','color_name']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    #======================= composition ====================#
    product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

    # rename dataframe
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # drop first row
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # garantee the same number of columns
    df_compostion = pd.concat([df_pattern, df_composition], axis=0)
    
    # generate style id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    
    aux = aux + df_composition.columns.tolist()
    
    #======================= merge ====================#
    data_sku = pd.merge(df_color, df_composition, how='left', on='style_id')
    
    # all products details
    data_details = pd.concat([data_details, data_sku], axis=0)

# join data showroom + data details
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data_raw = pd.merge(data, data_details[['style_id','color_id','color_name','Fit','Composition','Product safety','Size']], how='left', on='style_id')

In [313]:
data_raw.to_csv('data-raw.csv', index=False)

NameError: name 'data_raw' is not defined

In [71]:
#===================== Clean Data =====================# 

data = pd.read_csv('data-raw.csv')

# data clean
df = data.copy()
df.columns = [item.lower() for item in df.columns]

#product_id
#product_name
df['product_name'] = df['product_name'].apply(lambda x: x.replace(' ', '_').lower())

#product_category
df['product_category'] = df['product_category'].fillna('no_category') 

#product_price
df['product_price'] = df['product_price'].apply(lambda x: x.replace('$ ','')).astype(float)

#scrapy-datetime
#style_id
#color_id
#color_name
df['color_name'] = df['color_name'].apply(lambda x: x.replace(' ', '_').lower())

#Fit
df['fit'] = df['fit'].apply(lambda x: x.replace(' ', '_').lower())

#Size
df['size_number'] = df['size'].apply(lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull(x) else x)
df['size_number'] = df['size_number'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x)

df['size_model'] = df['size'].str.extract('(\d+/\\d+)')

#Composition
df = df[~df['composition'].str.contains('Pocket lining:', na=False)]
df = df[~df['composition'].str.contains('Lining:', na=False)]
df = df[~df['composition'].str.contains('Shell:', na=False)]

#Product safety
df = df.drop(columns=['product safety','size'], axis=1)

#============= Break Composition Comma ==================#

df1 = df['composition'].str.split(',', expand=True)
df_ref = pd.DataFrame(index=np.arange(len(df)), columns=['cotton', 'polyester', 'elastano','elasterell'] )

#cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na=True), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat([df_ref, df_polyester], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]


#elastano
df_elastano = df1.loc[df1[1].str.contains('Elastane', na=True), 1]
df_elastano.name = 'elastano'

df_ref = pd.concat([df_ref, df_elastano], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#elasterell
df_elasterell = df1.loc[df1[1].str.contains('Elasterell', na=True), 1]
df_elasterell.name = 'elasterell'

df_ref = pd.concat([df_ref, df_elasterell], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#final join
df = pd.concat([df, df_ref], axis=1, join='inner')


#format composition data
df['cotton'] = df['cotton'].apply(lambda x: int( re.search('\d+',x).group(0)) / 100 if pd.notnull(x) else x)
df['polyester'] = df['polyester'].apply(lambda x: int( re.search('\d+',x).group(0)) / 100 if pd.notnull(x) else x)
df['elastano'] = df['elastano'].apply(lambda x: int( re.search('\d+',x).group(0)) / 100 if pd.notnull(x) else x)
df['elasterell'] = df['elasterell'].apply(lambda x: int( re.search('\d+',x).group(0)) / 100 if pd.notnull(x) else x)


In [88]:
cols = ['product_id', 'product_name', 'product_category', 'product_price',
       'style_id', 'color_id', 'color_name', 'fit','composition',
       'size_number', 'size_model', 'cotton', 'polyester',
       'elastano', 'elasterell', 'scrapy-datetime']

In [87]:
df.reindex(columns=cols)

Unnamed: 0,product_id,product_name,product_category,product_price,style_id,color_id,color_name,fit,composition,size_number,size_model,cotton,polyester,elastano,elasterell,scrapy-datetime
82,427159006,trashed_skinny_jeans,men_jeans_ripped,39.99,427159,1,black_denim,skinny_fit,"Cotton 93%, Polyester 6%, Elastane 1%",184,31/32,0.93,0.06,,,2021-07-18 18:24:31
83,427159006,trashed_skinny_jeans,men_jeans_ripped,39.99,427159,2,blue_washed_out,skinny_fit,"Cotton 93%, Polyester 6%, Elastane 1%",184,31/32,0.93,0.06,,,2021-07-18 18:24:31
84,427159006,trashed_skinny_jeans,men_jeans_ripped,39.99,427159,3,denim_blue,skinny_fit,"Cotton 93%, Polyester 6%, Elastane 1%",184,31/32,0.93,0.06,,,2021-07-18 18:24:31
85,427159006,trashed_skinny_jeans,men_jeans_ripped,39.99,427159,4,light_denim_blue,skinny_fit,"Cotton 93%, Polyester 6%, Elastane 1%",184,31/32,0.93,0.06,,,2021-07-18 18:24:31
86,427159006,trashed_skinny_jeans,men_jeans_ripped,39.99,427159,5,dark_denim_blue,skinny_fit,"Cotton 93%, Polyester 6%, Elastane 1%",184,31/32,0.93,0.06,,,2021-07-18 18:24:31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023,730863040,skinny_jeans,no_category,29.99,730863,33,black/no_fade_black,skinny_fit,"Cotton 98%, Elastane 2%",,,0.98,,0.02,,2021-07-18 18:24:31
4024,730863040,skinny_jeans,no_category,29.99,730863,38,denim_blue,skinny_fit,"Cotton 98%, Elastane 2%",,,0.98,,0.02,,2021-07-18 18:24:31
4025,730863040,skinny_jeans,no_category,29.99,730863,39,blue,skinny_fit,"Cotton 98%, Elastane 2%",,,0.98,,0.02,,2021-07-18 18:24:31
4026,730863040,skinny_jeans,no_category,29.99,730863,40,denim_blue,skinny_fit,"Cotton 98%, Elastane 2%",,,0.98,,0.02,,2021-07-18 18:24:31
