# In [12]: (notebook cell marker, kept as a comment so the file parses as Python)
# imports
import os
import re
import sqlite3
import logging
import requests
import numpy as np
import pandas as pd

from datetime   import datetime
from bs4        import BeautifulSoup
from sqlalchemy import create_engine

#### DATA COLLECTION#####
def data_collection(url, headers, timeout=30):
    """Scrape the product listing page and return one row per listed product.

    The listing is requested twice. The first response is only used to read
    the total number of products from the ``data-total`` attribute of the
    first ``h2.load-more-heading`` element. The second request adds a
    ``page-size`` query parameter (the total rounded up to a multiple of 36)
    so that the whole catalogue is returned in one response.

    Args:
        url: Address of the listing page.
        headers: HTTP headers sent with BOTH requests.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame with the columns ``product_id``, ``product_cat``,
        ``product_name``, ``product_price`` and ``scrapy_datetime``.

    Raises:
        IndexError: If the first page has no ``load-more-heading`` element.
        requests.exceptions.Timeout: If a request exceeds ``timeout``.
    """
    # Page size used to round the request up to whole pages
    # (presumably the site's default listing size; confirm against the site).
    page_size = 36

    # Home page: read the total number of products to build the pagination.
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    heading = soup.find_all('h2', class_='load-more-heading')[0]
    total_items = int(heading.get('data-total'))

    # Ceiling division keeps the arithmetic in integers.
    full_pages = -(-total_items // page_size)
    url_all = url + '?page-size=' + str(full_pages * page_size)

    # Second request reuses the caller's headers instead of a hard-coded copy.
    page_all = requests.get(url_all, headers=headers, timeout=timeout)
    soup_all = BeautifulSoup(page_all.text, 'html.parser')

    # Product list container.
    listing = soup_all.find('ul', class_='products-listing')

    # product_id and product_category
    articles = listing.find_all('article', class_='hm-product-item')
    product_id = [item.get('data-articlecode') for item in articles]
    product_cat = [item.get('data-category') for item in articles]

    # product_name (get_text's first argument is a separator, so none is passed)
    links = listing.find_all('a', class_='link')
    product_name = [item.get_text() for item in links]

    # product_price
    prices = listing.find_all('span', class_='price regular')
    product_price = [item.get_text().replace('$ ', '').strip() for item in prices]

    data = pd.DataFrame([product_id, product_cat, product_name, product_price]).T
    data.columns = ['product_id', 'product_cat', 'product_name', 'product_price']

    # Timestamp of the scrape, identical for every row.
    data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return data


 # empty dataframe
 ##### DATA COLLECTION BY PRODUCT#####
def data_collection_by_product(data, headers, timeout=30):
    """Scrape the detail page of every colour variant of the listed products.

    For each ``product_id`` in ``data`` the product page is opened to collect
    the colour variants linked from it (``data-articlecode`` / ``data-color``
    of the ``filter-option miniature`` anchors). Every variant page is then
    opened and parsed for name, price, size text and the attribute list.

    Args:
        data: DataFrame with a ``product_id`` column.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame with the columns ``product_id``, ``composition``, ``fit``,
        ``product_name``, ``product_price``, ``product_size``, ``color_name``,
        ``style_id``, ``color_id`` and ``scrapy_datetime``. It is empty (but
        has these columns) when no variant page could be parsed.

    Raises:
        requests.exceptions.Timeout: If a request exceeds ``timeout``.
    """
    # Looked up locally so the function also works when imported as a module
    # (the name matches the logger configured by the script entry point).
    logger = logging.getLogger('webscraping_hm')

    base_url = 'https://www2.hm.com/en_us/productpage.'

    # Template that guarantees the three attribute columns always exist.
    df_pattern = pd.DataFrame(columns=['Fit', 'Art. No.', 'Composition'])

    # Collected per-variant frames; concatenated once at the end.
    frames = []

    for listed_id in data['product_id']:
        url = base_url + str(listed_id) + '.html'
        logger.debug('Product: %s', url)

        page = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')

        # ================ color name ==========================
        swatches = (
            soup.find_all('a', class_='filter-option miniature active')
            + soup.find_all('a', class_='filter-option miniature')
        )
        df_color = pd.DataFrame({
            'product_id': [s.get('data-articlecode') for s in swatches],
            'color_name': [s.get('data-color') for s in swatches],
        })

        for variant_id in df_color['product_id']:
            url = base_url + str(variant_id) + '.html'
            logger.debug('Product: %s', url)

            page = requests.get(url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')

            try:
                df_composition = _parse_product_details(soup, df_pattern)
            except IndexError:
                # An expected element is missing on this page: skip the
                # variant, but leave a trace instead of failing silently.
                logger.warning('Skipping %s: unexpected page layout', url, exc_info=True)
                continue

            # merge color and composition
            frames.append(pd.merge(df_composition, df_color, how='left', on='product_id'))

    if frames:
        df_compositions = pd.concat(frames, axis=0)
    else:
        logger.warning('No product details were collected')
        df_compositions = pd.DataFrame(columns=[
            'product_id', 'composition', 'fit', 'product_name',
            'product_price', 'product_size', 'color_name',
        ])

    # Split product_id: the last three characters become color_id and
    # everything before them becomes style_id.
    df_compositions['style_id'] = df_compositions['product_id'].apply(lambda x: x[:-3])
    df_compositions['color_id'] = df_compositions['product_id'].apply(lambda x: x[-3:])

    # Scrape date
    df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return df_compositions


def _parse_product_details(soup, df_pattern):
    """Extract name, price, size and attribute rows from one product page.

    Raises IndexError when the page lacks an ``h1``, a price row, a ``dl``
    with at least four text lines, or any attribute item.
    """
    # product_name (get_text's first argument is a separator, so none is passed)
    product_name = soup.find_all('h1')[0].get_text()

    # product_price
    price_rows = soup.find_all('div', class_='primary-row product-item-price')
    product_price = price_rows[0].get_text().strip().replace('$', '')

    # size: fourth text line of each <dl>; only the first <dl> is kept
    size_lines = [item.get_text().split('\n')[3] for item in soup.find_all('dl')]
    product_size = size_lines[0].strip()

    # ================ composition ==========================
    attributes = soup.find_all('div', class_='details-attributes-list-item')
    attribute_rows = [list(filter(None, item.get_text().split('\n'))) for item in attributes]

    # One column per attribute; the first row holds the attribute names.
    df_composition = pd.DataFrame(attribute_rows).T
    df_composition.columns = df_composition.iloc[0]

    # Drop the header row and forward-fill the shorter attributes.
    df_composition = df_composition.iloc[1:].ffill().drop_duplicates()

    # Guarantee the three expected columns exist before touching them.
    df_composition = pd.concat([df_pattern, df_composition], axis=0)
    df_composition = df_composition[['Art. No.', 'Composition', 'Fit']].copy()

    # Strip the garment-part prefixes, in the same order as before.
    for label in ('Pocket lining: ', 'Shell: ', 'Lining: ', 'Pocket: '):
        df_composition['Composition'] = df_composition['Composition'].replace(label, '', regex=True)

    # Rename columns
    df_composition.columns = ['product_id', 'composition', 'fit']

    # Page-level values are repeated on every row.
    df_composition['product_name'] = product_name
    df_composition['product_price'] = product_price
    df_composition['product_size'] = product_size

    return df_composition

 ##### DATA CLEANING #####
def data_cleaning(data_product):
    """Normalise the scraped product details and split composition into shares.

    Steps:
      * rows without ``product_id`` are dropped;
      * ``product_name``, ``color_name`` and ``fit`` are lower-cased with
        spaces (and ``/`` for colours) replaced by ``_``;
      * ``product_price`` is cast to float;
      * ``size_model`` (three digits followed by ``cm``) and ``size_number``
        (``digits/digits``) are extracted from ``product_size``;
      * ``composition`` is split on commas and turned into the fractional
        columns ``cotton``, ``polyester``, ``spandex`` and
        ``elastomultiester`` (0 when a material is not found); rows sharing a
        ``product_id`` all receive the maximum value found for that id;
      * ``composition`` and ``product_size`` are dropped and duplicate rows
        removed.

    Args:
        data_product: DataFrame with at least the columns ``product_id``,
            ``product_name``, ``product_price``, ``color_name``, ``fit``,
            ``product_size`` and ``composition``.

    Returns:
        A new, cleaned DataFrame; the input is not modified.
    """
    def _slug(value):
        # Missing values are passed through untouched.
        return value.replace(' ', '_').lower() if pd.notnull(value) else value

    # (output column, text searched for, comma-split positions tried in order)
    material_rules = (
        ('cotton', 'Cotton', (0, 1)),
        ('polyester', 'Polyester ', (0, 1)),
        ('spandex', 'Spandex', (1, 2)),
        ('elastomultiester', 'Elastomultiester', (1,)),
    )

    # Explicit copy so the column assignments below never hit a view.
    df_data = data_product.dropna(subset=['product_id']).copy()

    df_data['product_name'] = df_data['product_name'].apply(_slug)
    df_data['product_price'] = df_data['product_price'].astype(float)
    df_data['color_name'] = df_data['color_name'].apply(
        lambda x: _slug(x).replace('/', '_') if pd.notnull(x) else x
    )
    df_data['fit'] = df_data['fit'].apply(_slug)

    # size_model: three digits immediately followed by "cm"
    df_data['size_model'] = df_data['product_size'].str.extract(r'(\d{3})cm', expand=False)

    # size_number: "<digits>/<digits>"
    df_data['size_number'] = df_data['product_size'].str.extract(r'(\d+/\d+)', expand=False)

    # Break composition by comma; positions become integer columns 0..k.
    parts = df_data['composition'].str.split(',', expand=True).reset_index(drop=True)

    # One fractional column per material, aligned with product_id by position.
    df_aux = pd.DataFrame({'product_id': df_data['product_id'].reset_index(drop=True)})
    for column_name, pattern, positions in material_rules:
        df_aux[column_name] = _material_share(parts, pattern, positions)

    # Final join: one row per product_id carrying the largest share seen.
    df_aux = df_aux.groupby('product_id').max().reset_index().fillna(0)
    df_data = pd.merge(df_data, df_aux, on='product_id', how='left')

    # Drop columns that are no longer needed.
    df_data = df_data.drop(columns=['composition', 'product_size'])

    return df_data.drop_duplicates().reset_index(drop=True)


def _material_share(parts, pattern, positions):
    """Return, per row of *parts*, the fraction stated for one material.

    The columns listed in *positions* are inspected in order and the first
    one whose text contains *pattern* supplies the value: its first run of
    digits divided by 100. Positions that do not exist in *parts* are
    skipped; rows without a match are NaN.
    """
    share = pd.Series(np.nan, index=parts.index, dtype=float)
    found = pd.Series(False, index=parts.index)

    for position in positions:
        if position not in parts.columns:
            continue
        # Cast to object so .str also works on an all-missing column.
        column = parts[position].astype(object)
        hit = column.str.contains(pattern, na=False) & ~found
        fraction = column.str.extract(r'(\d+)', expand=False).astype(float) / 100
        share = share.mask(hit, fraction)
        found = found | hit

    return share

 ##### DATA INSERT #####
def data_insert(df_data, db_url='sqlite:///database_hm.sqlite', table='vitrine'):
    """Append the cleaned product rows to a database table.

    Args:
        df_data: DataFrame containing at least the columns listed in
            ``columns`` below; any other columns are ignored.
        db_url: SQLAlchemy connection string of the target database.
        table: Name of the table the rows are appended to.

    Returns:
        None.

    Raises:
        KeyError: If ``df_data`` lacks one of the expected columns.
    """
    # Columns written to the table, in this order.
    columns = [
        'product_id',
        'style_id',
        'color_id',
        'product_name',
        'color_name',
        'fit',
        'product_price',
        'size_number',
        'size_model',
        'cotton',
        'polyester',
        'spandex',
        'elastomultiester',
        'scrapy_datetime',
    ]
    rows = df_data[columns]

    # Create the database connection.
    engine = create_engine(db_url, echo=False)
    try:
        rows.to_sql(table, con=engine, if_exists='append', index=False)
    finally:
        # Release the pooled connections even when the insert fails.
        engine.dispose()

    return None

if __name__ == '__main__':
    # ---- logging ----
    path = 'C:\\Users\\vande\\CienciaDeDados\\DS_ao_DEV\\Module_06'

    # Join with the platform separator so the log file lands in
    # "<path>/Logs" rather than in a sibling directory named "<path>Logs".
    log_dir = os.path.join(path, 'Logs')
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        filename=os.path.join(log_dir, 'webscraping_hm.log'),
        level=logging.DEBUG,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    # Kept as a module-level name: the scraping functions log through it.
    logger = logging.getLogger('webscraping_hm')

    # ---- parameters and constants ----
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}

    url = 'https://www2.hm.com/en_us/men/products/jeans.html'

    # ---- data collection (listing page) ----
    data = data_collection(url, headers)
    logger.info('data collection done')

    # ---- data collection by product (detail pages) ----
    data_product = data_collection_by_product(data, headers)
    logger.info('data collection by product done')

    # ---- data cleaning ----
    data_product_cleaned = data_cleaning(data_product)
    logger.info('data product cleaned done')

    # ---- insertion ----
    data_insert(data_product_cleaned)
    logger.info('data insertion done')




