In [1]:
"""
Process Name            : STEP 7 : WEB SCRAPPING
"""
#=======================================================================================================
### Required Imports ###
#============================================================================================================
try:
    import sys  # System-specific parameters and functions
    import logging # for logs
    import socket # network communication
    import warnings  # Warning control
    import os # Interacting with the operating system
    import pandas as pd  # Data handling
    from datetime import datetime
    from selenium import webdriver  # Web browser automation
    from selenium.webdriver.common.by import By  # Locate elements on a page
    from selenium.webdriver.support import expected_conditions  # Wait for conditions to be met
    from selenium.webdriver.support.wait import WebDriverWait  # Explicit wait
    from selenium.webdriver.common.keys import Keys  # Keyboard actions
    from selenium.webdriver.chrome.options import Options  # Chrome browser options
    from selenium.webdriver.support import expected_conditions as EC  # Alias for expected conditions
    from selenium.webdriver.support.ui import Select  # Handle dropdown menus
    from selenium.webdriver.chrome.service import Service  # Manage ChromeDriver service
    from time import sleep  # Pause execution
    import requests  # Send HTTP requests
except Exception as err:
    print("Exception raised while importing the packages")
    print(f'Exception: {err}')
    #input("press Enter to Close")
    sys.exit()

In [2]:
#=========================================================================
### Initialization ###
#=========================================================================
try:
    path = os.getcwd()
    curr_time = datetime.now()

    ### Log Files declaration ###
    log_folder = os.path.join(path, 'Logs')
    log_date_fmt = str(curr_time.strftime('%Y')) + '-' + str(curr_time.strftime('%m')) + '-' + str(curr_time.strftime('%d')) + "_" + str(curr_time.strftime("%H")) + "-" + str(curr_time.strftime("%M"))

    audit_log_file = "Audit_webscrapping.log"
    audit_log_file = os.path.join(log_folder, audit_log_file)
    
    error_log_file = "Error_webscrapping.log"
    error_log_file = os.path.join(log_folder, error_log_file)

    ### Creating log folder ###
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)
        
    ### Function: Logger setup ###
    def setup_logger(logger_name, log_file, level=logging.INFO):
        logger = logging.getLogger(logger_name)
        formatter = logging.Formatter(socket.gethostname()+' : '+'%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s')

        fileHandler = logging.FileHandler(log_file, mode='w')

        fileHandler.setFormatter(formatter)

        streamHandler = logging.StreamHandler(sys.stdout)
        streamHandler.setFormatter(formatter)

        logger.setLevel(level)
        logger.addHandler(fileHandler)
        logger.addHandler(streamHandler)
        return logger
        
    ### Setting up the logger ###
    setup_logger('audit', audit_log_file, level=logging.INFO)
    setup_logger('error', error_log_file, level=logging.ERROR)

    audit_logger = logging.getLogger('audit')
    error_logger = logging.getLogger('error')
    audit_logger.info('Process start')

except Exception as err:
    print('Setting up the logger failed')
    print(f'Exception: {err}')
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 15:56:20,031 : INFO : [2840147301.py:45] : Process start


In [3]:
#==================================================================================================
### Ignore Warnings ### 
#==================================================================================================
try:
    audit_logger.info('Ignore Warnings')
    warnings.filterwarnings('ignore') ## Suppress all warnings
except Exception as err:
    audit_logger.info('Ignore Warnings - Failed')
    error_logger.error('Ignore Warnings - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()



In [4]:
#==================================================================================================
### Create folder structure ### 
#==================================================================================================
try:
    audit_logger.info('Create folder structure')
    path
    input_folder = os.path.join(path, 'Input')
    if not os.path.isdir(input_folder):
        audit_logger.info('No input folder')
        error_logger.error('No input folder')
        raise Exception
    excel_folder = os.path.join(path, 'Excel_files')
    if not os.path.isdir(excel_folder):
        os.mkdir(excel_folder)
    saved_folder = os.path.join(path, 'Saved_files')
    if not os.path.isdir(saved_folder):
        os.mkdir(saved_folder)
    output_folder = os.path.join(path, 'Output')
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
except Exception as err:
    audit_logger.info('Create folder structure - Failed')
    error_logger.error('Create folder structure - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 15:56:20,039 : INFO : [300162134.py:5] : Create folder structure


In [5]:
#==================================================================================================
### Load the inputs ### 
#==================================================================================================
try:
    audit_logger.info('Loading inputs') 
    result_df = pd.read_excel(os.path.join(output_folder, "output.xlsx"))
except Exception as err:
    audit_logger.info('Loading inputs - Failed')
    error_logger.error('Loading inputs - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 15:56:20,046 : INFO : [1197277583.py:5] : Loading inputs


In [6]:
#==================================================================================================
### Web scrapping ### 
#==================================================================================================
try:
    audit_logger.info('Web scrapping') 
    for j in range(0, len(result_df)):
        new_folder = os.path.join(output_folder, str(j))
        if not os.path.isdir(new_folder):
                os.mkdir(new_folder)
        if result_df.loc[j, "Preferred_Clothing"] == "One Piece Wear":
            # Set up options for Chrome
            options = webdriver.ChromeOptions()
            preferences = {
                "profile.default_content_settings.popups": 0,
                "download.default_directory": path,
                "directory_upgrade": True,
                "browser.download.manager.showWhenStarting": False
            }
            options.add_experimental_option('prefs', preferences)
            driver = webdriver.Chrome(options=options)
            # Optional: Set Chrome to headless mode (no GUI)
            # options.add_argument("--headless")
            audit_logger.info(f'Search for {result_df.loc[j, "Singlepiece"]}')
            
            # Open a webpage
            print("Browsing 'https://www.shein.co.uk'")
            driver.get("https://www.shein.co.uk")
            print("Browsing 'https://www.shein.co.uk/' completed")
            sleep(2)
            
            z=0
            while z < 20:
                try:
                    print(z)
                    form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
                    z=20
                except:
                    z=z+1
                    if z < 19:
                        sleep(10)
                    else:
                        raise Exception
            print("test_3")
            form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
            form.send_keys(str(result_df.loc[j, "Gender"])+"+"+result_df.loc[j, "Singlepiece"] + Keys.ENTER)
            sleep(2)
            i=1
            images = driver.find_elements(By.XPATH,'//img[@class="fsp-element crop-image-container__img"]')
            for img in images:
                if str(img.get_attribute("outerHTML").lower()).find(result_df.loc[j, "Singlepiece"].split()[-1]) != -1:
                    src = img.get_attribute('src')
                    # Send an HTTP GET request to the URL
                    response = requests.get(src)
                    # Check if the request was successful
                    if response.status_code == 200:
                    # Open a file in binary write mode and save the image
                        with open(f"{os.path.join(new_folder, str(i))}.webp", "wb") as file:
                            file.write(response.content)
                            i=i+1
                    else:
                             audit_logger.info("Failed to download the image.")
            sleep(2)
            driver.quit()
            sleep(2)
        else:
            ####BOTTOMS####
            new_folder_1 = os.path.join(new_folder, "Bottoms")
            if not os.path.isdir(new_folder_1):
                os.mkdir(new_folder_1)
            audit_logger.info(f'Search for {result_df.loc[j, "Bottoms"]}')
            
            # Set up options for Chrome
            options = webdriver.ChromeOptions()
            preferences = {
                "profile.default_content_settings.popups": 0,
                "download.default_directory": path,
                "directory_upgrade": True,
                "browser.download.manager.showWhenStarting": False
            }
            options.add_experimental_option('prefs', preferences)
            driver = webdriver.Chrome(options=options)
            # Optional: Set Chrome to headless mode (no GUI)
            # options.add_argument("--headless")
            
            # Open a webpage
            print("Browsing 'https://www.shein.co.uk'")
            driver.get("https://www.shein.co.uk")
            print("Browsing 'https://www.shein.co.uk/' completed")
            sleep(2)
            
            z=0
            while z < 20:
                try:
                    print(z)
                    form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
                    z=20
                except:
                    z=z+1
                    if z < 19:
                        sleep(10)
                    else:
                        raise Exception
            print("test_3")
            form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
            sleep(1)
            form.send_keys(str(result_df.loc[j, "Gender"])+"+"+result_df.loc[j, "Bottoms"] + Keys.ENTER)
            sleep(2)
            i=1
            if "pants" in result_df.loc[j, "Bottoms"]:
                x = "pants"
            else:
                x = result_df.loc[j, "Bottoms"].split()[-1]
            images = driver.find_elements(By.XPATH,'//img[@class="fsp-element crop-image-container__img"]')
            for img in images:
                if str(img.get_attribute("outerHTML").lower()).find(x) != -1:
                    src = img.get_attribute('src')
                    # Send an HTTP GET request to the URL
                    response = requests.get(src)
            
                    # Check if the request was successful
                    if response.status_code == 200:
                    # Open a file in binary write mode and save the image
                        with open(f"{os.path.join(new_folder_1, str(i))}.webp", "wb") as file:
                            file.write(response.content)
                            i=i+1
                    else:
                             audit_logger.info("Failed to download the image.")
            sleep(2)
            driver.quit()
            sleep(2)
            
            ####UPPERWEAR#####
            new_folder_2 = os.path.join(new_folder, "Upperwear")
            if not os.path.isdir(new_folder_2):
                os.mkdir(new_folder_2)
            audit_logger.info(f'Search for {result_df.loc[j, "Upperwear"]}')
            # Set up options for Chrome
            options = webdriver.ChromeOptions()
            preferences = {
                "profile.default_content_settings.popups": 0,
                "download.default_directory": path,
                "directory_upgrade": True,
                "browser.download.manager.showWhenStarting": False
            }
            options.add_experimental_option('prefs', preferences)
            driver = webdriver.Chrome(options=options)
            # Optional: Set Chrome to headless mode (no GUI)
            # options.add_argument("--headless")
            
            # Open a webpage
            print("Browsing 'https://www.shein.co.uk'")
            driver.get("https://www.shein.co.uk")
            print("Browsing 'https://www.shein.co.uk/' completed")
            sleep(2)
            
            z=0
            while z < 20:
                try:
                    print(z)
                    form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
                    z=20
                except:
                    z=z+1
                    if z < 19:
                        sleep(10)
                    else:
                        raise Exception
            print("test_3")
            form = driver.find_element(By.XPATH, '//div[@class="div-input header-search-input j-header-search-input fsp-element"]')
            sleep(1)
            form.send_keys(str(result_df.loc[j, "Gender"])+"+"+result_df.loc[j, "Upperwear"] + Keys.ENTER)
            sleep(2)
            i=1
            images = driver.find_elements(By.XPATH,'//img[@class="fsp-element crop-image-container__img"]')
            for img in images:
                if str(img.get_attribute("outerHTML").lower()).find(result_df.loc[j, "Upperwear"].split()[-1]) != -1:
                    src = img.get_attribute('src')
                    # Send an HTTP GET request to the URL
                    response = requests.get(src)
            
                    # Check if the request was successful
                    if response.status_code == 200:
                    # Open a file in binary write mode and save the image
                        with open(f"{os.path.join(new_folder_2, str(i))}.webp", "wb") as file:
                            file.write(response.content)
                            i=i+1
                    else:
                             audit_logger.info("Failed to download the image.")
            sleep(2)
            driver.quit()
            sleep(2)
except Exception as err:
    audit_logger.info('Web scrapping - Failed')
    error_logger.error('Web scrapping - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 15:56:20,189 : INFO : [2896285803.py:5] : Web scrapping
192.168.1.112 : 2024-09-05 15:57:14,562 : INFO : [2896285803.py:23] : Search for black full sleeves collar neck above knee length medium fit withbelt cotton dress
Browsing 'https://www.shein.co.uk'
Browsing 'https://www.shein.co.uk/' completed
0
test_3


In [7]:
audit_logger.info('Process run success')
total_time = datetime.now() - curr_time
audit_logger.info(f'Total time taken = {total_time}')

192.168.1.112 : 2024-09-05 16:01:54,292 : INFO : [2214019538.py:1] : Process run success
192.168.1.112 : 2024-09-05 16:01:54,297 : INFO : [2214019538.py:3] : Total time taken = 0:05:34.266348
