In [142]:
import pandas as pd
import numpy as np
import requests
import re

from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Request the listing page, sending a browser-like User-Agent to avoid problems with the request.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Bound the wait so an unresponsive server cannot hang the kernel, and fail
# loudly on an HTTP error instead of parsing an error page in later cells.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

In [3]:
# `requests` fetched the page while posing as a browser; the response text is
# handed to BeautifulSoup so data can be extracted from the HTML.
# The parser argument tells BeautifulSoup how to read the HTML.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [4]:
# The listing heading carries the total number of products in `data-total`.
total_heading = soup.find('h2', class_='load-more-heading')
if total_heading is None:
    raise ValueError("'load-more-heading' element not found; the listing page layout may have changed")
total_item = total_heading.get('data-total')

# Pages are counted in units of 36 items. Round UP so a partially filled last page
# is still counted; rounding to nearest would drop up to 17 products.
page_number = np.ceil(int(total_item) / 36)

In [5]:
# Request the listing again with a page size large enough to return every product in one response.
# `url` and `headers` are the ones defined for the first listing request.
url01 = url + '?sort=stock&image-size=small&image=model&offset=0&page-size=' + str(int(page_number*36))

# Bound the wait and surface HTTP errors before the response is parsed.
page = requests.get(url01, headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
# Narrow the search to the list element that holds the products
products = soup.find(name='ul', class_='products-listing small')

In [7]:
# One <article> element per listed product
product_list = products.find_all(name='article', class_='hm-product-item')

In [8]:
# Product id: the article code attribute of each listed article (None when absent)
product_id = [article.get('data-articlecode') for article in product_list]

In [9]:
# Product category: the category attribute of each listed article (None when absent)
product_category = [article.get('data-category') for article in product_list]

In [10]:
# Product Name
# A dedicated variable is used instead of overwriting `product_list`, so the
# id/category cells can be re-run after this one and still see the articles.
name_links = products.find_all('a', class_='link')
product_name = [link.get_text() for link in name_links]

In [11]:
# Product Price
# A dedicated variable is used instead of overwriting `product_list`, so the
# id/category cells can be re-run after this one and still see the articles.
price_spans = products.find_all('span', class_='price regular')
product_price = [span.get_text() for span in price_spans]

In [12]:
# Build the listing table: each scraped list becomes one column
data = (
    pd.DataFrame([product_id, product_name, product_category, product_price])
    .transpose()
    .set_axis(['product_id', 'product_name', 'product_category', 'product_price'], axis=1)
)

# Timestamp of the scrape, stored as text on every row
data['scrapy-datetime'] = f'{datetime.now():%Y-%m-%d %H:%M:%S}'

## Scraping a single product

In [None]:
# Get the colors and the composition of a single product from its product page

url = 'https://www2.hm.com/en_us/productpage.0985197001.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Bound the wait and surface HTTP errors before the response is parsed.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# BeautifulSoup Object
soup = BeautifulSoup(page.text, 'html.parser')

#======================= color name / color id ====================#
# The selected color carries an extra "active" class, so the color links are
# collected with two queries (selected one first, then the others). Each query
# runs once and feeds both the color name and the article code.
active_colors = soup.find_all('a', class_='filter-option miniature active')
other_colors = soup.find_all('a', class_='filter-option miniature')
color_links = active_colors + other_colors

color_name = [link.get('data-color') for link in color_links]
color_id = [link.get('data-articlecode') for link in color_links]

df_color = pd.DataFrame([color_id, color_name]).T
df_color.columns = ['product_id','color_name']

# generate style id + color id: the article code minus its last 3 characters, and those last 3 characters
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

#======================= composition ====================#
# Each description item is split on newlines; empty fragments are discarded.
product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

# After transposing, every description item is one column and the first row holds its label
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]

# Drop the label row, then forward-fill so shorter items repeat their last value.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df_composition = df_composition.iloc[1:].ffill()

# generate style id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])

#======================= merge ====================#
data_sku = pd.merge(df_color, df_composition[['style_id','Fit','Composition']], how='left', on='style_id')

## Scraping multiple products

In [24]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Column names seen in the composition tables across all products (kept for inspection)
aux = []

# Description fields every product's composition table must expose, so the
# column selection after the loop works even when a page omits some of them.
cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']

df_pattern = pd.DataFrame(columns=cols)

# Per-product detail frames, concatenated once after the loop instead of
# growing a DataFrame on every iteration.
details_frames = []

for article_code in data['product_id']:
    # Request the product page of this article
    url = 'https://www2.hm.com/en_us/productpage.' + article_code + '.html'
    # Bound the wait and surface HTTP errors before the response is parsed.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    # BeautifulSoup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    #======================= color name / color id ====================#
    # The selected color carries an extra "active" class, so both class strings
    # are searched. One query feeds both the color name and the article code.
    color_links = soup.find_all('a', class_=['filter-option miniature active','filter-option miniature'])
    color_name = [link.get('data-color') for link in color_links]
    color_id = [link.get('data-articlecode') for link in color_links]

    df_color = pd.DataFrame([color_id, color_name]).T
    df_color.columns = ['product_id','color_name']

    # generate style id + color id: the article code minus its last 3 characters, and those last 3 characters
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    #======================= composition ====================#
    # Each description item is split on newlines; empty fragments are discarded.
    product_composistion_list = soup.find_all('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composistion_list]

    # After transposing, every description item is one column and the first row holds its label
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # Drop the label row, then forward-fill so shorter items repeat their last value.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    df_composition = df_composition.iloc[1:].ffill()

    # Guarantee the same set of columns for every product: expected columns the page
    # lacks are added as NaN. The original assigned this result to a misspelled
    # variable, so the alignment never took effect.
    aligned_columns = list(dict.fromkeys(df_pattern.columns.tolist() + df_composition.columns.tolist()))
    df_composition = df_composition.reindex(columns=aligned_columns)

    # generate style id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])

    aux.extend(df_composition.columns.tolist())

    #======================= merge ====================#
    data_sku = pd.merge(df_color, df_composition, how='left', on='style_id')

    # all products details
    details_frames.append(data_sku)

# A single concat keeps the run linear in the number of products
data_details = pd.concat(details_frames, axis=0) if details_frames else pd.DataFrame()

# join data showroom + data details
# NOTE(review): when several listed articles share a style, the same style details
# appear to be appended once per article, which would duplicate rows in this
# merge. Confirm and consider `data_details.drop_duplicates()`.
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data_raw = pd.merge(data, data_details[['style_id','color_id','color_name','Fit','Composition','Product safety','Size']], how='left', on='style_id')

In [26]:
# Persist the raw scrape, without the index column
data_raw.to_csv(path_or_buf='data-raw.csv', index=False)

In [159]:
# data clean: work on a copy so the raw frame stays available for re-runs
df = data_raw.copy()
df.columns = [item.lower() for item in df.columns]

# product_id: unchanged

# product_name: spaces -> underscores, lower case (the .str methods leave missing values as NaN)
df['product_name'] = df['product_name'].str.replace(' ', '_', regex=False).str.lower()

# product_category: unchanged

# product_price: drop the '$ ' prefix (literal match, '$' is not treated as a regex anchor) and convert to float
df['product_price'] = df['product_price'].str.replace('$ ', '', regex=False).astype(float)

# scrapy-datetime, style_id, color_id: unchanged

# color_name: spaces -> underscores, lower case
df['color_name'] = df['color_name'].str.replace(' ', '_', regex=False).str.lower()

# fit: spaces -> underscores, lower case
df['fit'] = df['fit'].str.replace(' ', '_', regex=False).str.lower()

# composition: unchanged

# size: free text that may contain a height such as '189cm' and a size such as '32/32'.
# size_number keeps the three digits that precede 'cm'; rows that are missing or
# do not match become NaN instead of raising.
df['size_number'] = df['size'].str.extract(r'(\d{3})cm', expand=False)

# size_model keeps the '<digits>/<digits>' pair
df['size_model'] = df['size'].str.extract(r'(\d+/\d+)', expand=False)

# The source columns are no longer needed once the size fields are extracted
df = df.drop(columns=['product safety', 'size'])

In [160]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,product_id,product_name,product_category,product_price,scrapy-datetime,style_id,color_id,color_name,fit,composition,size_number,size_model
0,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,1,black,slim_fit,Pocket lining: Cotton 100%,189,32/32
1,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,1,black,slim_fit,"Shell: Cotton 98%, Elastane 2%",189,32/32
2,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,2,midnight_blue,slim_fit,Pocket lining: Cotton 100%,189,32/32
3,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,2,midnight_blue,slim_fit,"Shell: Cotton 98%, Elastane 2%",189,32/32
4,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,3,denim_blue,slim_fit,Pocket lining: Cotton 100%,189,32/32


In [131]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,product_id,product_name,product_category,product_price,scrapy-datetime,style_id,color_id,color_name,fit,composition,product safety,size
0,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,1,black,slim_fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
1,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,1,black,slim_fit,"Shell: Cotton 98%, Elastane 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
2,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,2,midnight_blue,slim_fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
3,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,2,midnight_blue,slim_fit,"Shell: Cotton 98%, Elastane 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
4,985197001,slim_jeans,men_jeans_slim,19.99,2021-07-18 18:24:31,985197,3,denim_blue,slim_fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"


In [101]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4028 entries, 0 to 4027
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_id        4028 non-null   object 
 1   product_name      4028 non-null   object 
 2   product_category  4028 non-null   object 
 3   product_price     4028 non-null   float64
 4   scrapy-datetime   4028 non-null   object 
 5   style_id          4028 non-null   object 
 6   color_id          4028 non-null   object 
 7   color_name        4028 non-null   object 
 8   Fit               4028 non-null   object 
 9   Composition       4028 non-null   object 
 10  Product safety    904 non-null    object 
 11  Size              1473 non-null   object 
dtypes: float64(1), object(11)
memory usage: 409.1+ KB


In [85]:
# For every column, print how many rows hold an empty string
for column_name in df.columns:
    is_empty = df[column_name] == ''
    print('No text, column:' + column_name, df[is_empty].shape[0])

No text, column:product_id 0
No text, column:product_name 0
No text, column:product_category 1294
No text, column:product_price 0
No text, column:scrapy-datetime 0
No text, column:style_id 0
No text, column:color_id 0
No text, column:color_name 0
No text, column:Fit 0
No text, column:Composition 0
No text, column:Product safety 0
No text, column:Size 0


In [87]:
# Count the missing values in each column
df.isnull().sum()

product_id             0
product_name           0
product_category       0
product_price          0
scrapy-datetime        0
style_id               0
color_id               0
color_name             0
Fit                    0
Composition            0
Product safety      3124
Size                2555
dtype: int64