In [1]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Select all products
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
page = requests.get( url, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

total_item = soup.find_all( 'h2', class_='load-more-heading' )[0].get('data-total')

page_number = np.round( int( total_item ) / 36 )

url02 = url + '?page-size=' + str( int( page_number*36 ) )

In [None]:
page_number

In [3]:
# Extracting all products data 

page = requests.get( url02, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find( 'ul', class_='products-listing small')

product_list = products.find_all('article', class_='hm-product-item')

# Product ID
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# Product Category
product_category = [p.get( 'data-category' ) for p in product_list]

# Product Name
product_list = products.find_all('a', class_='link')
product_name= [p.get_text('data-category') for p in product_list]

# Product Price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text('price regular') for p in product_list]

# DataFrame Definition
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id','product_category','product_name','product_price']

# Scrapy Date
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [None]:
data.shape

# Multiple Products

In [10]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5).AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# empty dataframe
df_details = pd.DataFrame()
# unique columns for all products
aux = []
cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame( columns=cols )

for i in range( len( data ) ):
    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id']+ '.html'
           
    page = requests.get( url, headers=headers )
    
    # Beautiful Soup object
    soup = BeautifulSoup( page.text, 'html.parser' )

    # ==================== color name =================================
    product_list = soup.find_all( 'a', class_='filter-option miniature' )
    color_name = [p.get( 'data-color' ) for p in product_list]

    # product id
    product_id = [p.get( 'data-articlecode' ) for p in product_list]
    df_color = pd.DataFrame( [product_id, color_name] ).T
    df_color.columns = ['product_id', 'color_name']
    
    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply( lambda x: x[:-3] )
    df_color['color_id'] = df_color['product_id'].apply( lambda x: x[-3:] )

    # ==================== composition =================================
    product_composition_list = soup.find_all( 'div', class_='pdp-description-list-item' )
    product_composition = [list( filter( None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]

    # reaname dataframe
    df_composition = pd.DataFrame( product_composition ).T
    df_composition.columns = df_composition.iloc[0]

    # delete first row
    df_composition = df_composition.iloc[1:].fillna( method='ffill' )

    # garantee the same number of columns
    df_composition = pd.concat( [df_pattern, df_composition], axis=0 )

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
    df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )
    aux = aux + df_composition.columns.tolist()

    # merge data color + decomposition
    data_sku = pd.merge( df_color, df_composition[['style_id', 'Fit','Composition', 'Size', 'Product safety']], how='left', on='style_id' )          
    
    # all details products
    df_details = pd.concat( [df_details, data_sku], axis=0 )

# # Join Showroom data + details
# data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
# data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )
# data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit','Composition', 'Size', 'Product safety']],how='left', on='style_id' )

In [11]:
data_raw.shape

(3223, 12)

In [12]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Size,Product safety
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-18 21:04:16,985197,1,Midnight blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",
1,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-18 21:04:16,985197,1,Midnight blue,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",
2,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-18 21:04:16,985197,1,Denim blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",
3,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-18 21:04:16,985197,1,Denim blue,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",
4,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-18 21:04:16,985197,1,Dark denim blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",


In [15]:
# Product Name data set
data_raw['product_name'] = data_raw['product_name'].apply( lambda x: x.replace( ' ','_' ).lower() )

# Product Fit data set
data_raw['Fit'] = data_raw['Fit'].apply( lambda x: x.replace( ' ','_' ).lower() if pd.notnull( x ) else x )

# Product Price data set
# data_raw['product_price'] = data_raw['product_price'].apply( lambda x: x.replace( '$ ','')).astype( float )

# Product ID
data_raw['product_id'] = data_raw['product_id'].astype( int )

# Product Color
data_raw['color_id'] = data_raw['color_id'].astype( int )

# Style
data_raw['style_id'] = data_raw['style_id'].astype( int )

# Date
data_raw['scrapy_datetime'] = pd.to_datetime( data_raw['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S' )

# Select the size
data_raw['size_number'] = data_raw['Size'].apply( lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull( x ) else x )
data_raw['size_number'] = data_raw['size_number'].apply( lambda x: re.search('\d+', x).group(0) if pd.notnull( x ) else x)
data_raw['size_model'] = data_raw['Size'].str.extract('(\d+/\\d+)') 
data_raw = data_raw.drop( columns = ['Size','Product safety'])

# Cleanning unkown composition
data_raw = data_raw[~data_raw['Composition'].str.contains('Pocket lining', na = False )]
data_raw = data_raw[~data_raw['Composition'].str.contains('Shell', na = False )]
data_raw = data_raw[~data_raw['Composition'].str.contains('Lining', na = False )]
data_raw = data_raw[~data_raw['Composition'].str.contains('Pocket', na = False )]

In [16]:
data_raw.shape

(1519, 12)

In [17]:
# Break composition by comma
df1 = data_raw['Composition'].str.split( ',', expand=True )

# cotton | Spandex | elasterell
df_ref = pd.DataFrame(columns=['cotton','spandex', 'elasterell'])

# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat( [df_ref, df_cotton ], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]
df_ref['cotton'] = df_ref['cotton'].apply( lambda x: int( re.search('\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )

# Spandex
df_spandex = df1.loc[df1[1].str.contains( 'Spandex', na=True ), 1]
df_spandex.name = 'spandex'
df_ref = pd.concat( [df_ref, df_spandex], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]
df_ref['spandex'] = df_ref['spandex'].apply( lambda x: int( re.search('\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )

# elasterell
df_elasterell = df1.loc[df1[1].str.contains( 'Elasterell', na=True ), 1]
df_elasterell.name = 'elasterell'
df_ref = pd.concat( [df_ref, df_elasterell], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]
df_ref['elasterell'] = df_ref['elasterell'].apply( lambda x: int( re.search('\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )

# final join
data_raw = pd.concat( [data_raw, df_ref], axis=1 )

In [18]:
data_raw

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,size_number,size_model,cotton,spandex,elasterell
145,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-11-18 21:04:16,690449,22,Light denim blue/trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31/32,0.98,0.02,
147,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-11-18 21:04:16,690449,22,Denim blue,skinny_fit,"Cotton 98%, Spandex 2%",184,31/32,0.98,0.02,
149,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-11-18 21:04:16,690449,22,Black/washed,skinny_fit,"Cotton 98%, Spandex 2%",184,31/32,0.98,0.02,
151,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-11-18 21:04:16,690449,22,Light denim blue,skinny_fit,"Cotton 98%, Spandex 2%",184,31/32,0.98,0.02,
153,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-11-18 21:04:16,690449,22,Black washed out,skinny_fit,"Cotton 98%, Spandex 2%",184,31/32,0.98,0.02,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,636207001,men_jeans_slim,slim_jeans,19.99,2021-11-18 21:04:16,636207,1,Midnight blue,slim_fit,"Cotton 89%, Polyester 10%, Spandex 1%",,,0.89,,
3219,636207001,men_jeans_slim,slim_jeans,19.99,2021-11-18 21:04:16,636207,1,Dark gray,slim_fit,"Cotton 89%, Polyester 10%, Spandex 1%",,,0.89,,
3220,636207001,men_jeans_slim,slim_jeans,19.99,2021-11-18 21:04:16,636207,1,Denim blue,slim_fit,"Cotton 89%, Polyester 10%, Spandex 1%",,,0.89,,
3221,636207001,men_jeans_slim,slim_jeans,19.99,2021-11-18 21:04:16,636207,1,White,slim_fit,"Cotton 89%, Polyester 10%, Spandex 1%",,,0.89,,
