### Scraper

In [1]:
import requests
from PIL import Image
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from io import BytesIO
from openpyxl import load_workbook

from selenium.webdriver import Chrome
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

import threading
import psycopg2
#from sql_key import dbname, user, password, host, port


In [2]:
import sys, os
# Make the DB_and_Azure directory importable so that sql_db_functions can be found.
# NOTE: '__file__' is a string literal here, not the __file__ variable, so
# os.path.dirname() returns '' and the appended path is '..' + 'DB_and_Azure'
# relative to the current working directory (presumably a workaround for
# __file__ being undefined in a notebook kernel -- confirm).
sys.path.append(os.path.join(os.path.dirname('__file__'), '..', 'DB_and_Azure'))
import sql_db_functions as SQLf

In [3]:
def create_soup(url):
    """Render ``url`` in Chrome, scroll to the bottom and return the parsed HTML.

    The page is opened in a Selenium-driven Chrome window, the cookie button
    is clicked, and the page is scrolled in steps until its height stops
    growing (presumably so that content loaded on scroll is present in the
    page source).

    Args:
        url: Address of the page to load.

    Returns:
        BeautifulSoup: The page source after scrolling, parsed with
        ``html.parser``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the document does not
            reach ``readyState == 'complete'`` or the cookie button does not
            become clickable within 120 seconds.
    """
    # Chrome has to be configured with Chrome's own options class. It is
    # imported locally so this function does not depend on which ``Options``
    # class the module-level import binds.
    from selenium.webdriver.chrome.options import Options as ChromeOptions

    chrome_options = ChromeOptions()
    #chrome_options.add_argument("--headless")  # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    browser = Chrome(options=chrome_options)
    try:
        # Open the URL
        browser.maximize_window()
        browser.get(url)

        # Wait for the page to fully load
        wait = WebDriverWait(browser, 120)
        wait.until(lambda driver: driver.execute_script('return document.readyState') == 'complete')

        # Wait for the cookie button to be clickable, then click it
        cookie_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.Button_button__qoiWI.CookiesContent_button__2Vj_R')))
        cookie_button.click()

        # Scroll down until the document height no longer changes
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            for _ in range(5):  # 5 steps of 20% of the page height each
                browser.execute_script("window.scrollBy(0, document.body.scrollHeight / 5);")
                time.sleep(1)  # Wait for 1 second between each scroll

            # Compare the new height with the one seen before this round
            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Get the page source after scrolling
        html = browser.page_source
    finally:
        # Always shut the browser down, even when a wait or script call fails,
        # so no Chrome process is left running.
        browser.quit()

    return BeautifulSoup(html, 'html.parser')


In [4]:
class TimeoutException(Exception):
    """Raised when fetching a URL does not finish within the allowed time."""

def create_simple_soup(url, timeout=5):
    """Download ``url`` with a plain HTTP GET and return the parsed HTML.

    The request runs in a worker thread and the caller waits at most
    ``timeout`` seconds for it to finish.

    Args:
        url: Address of the page to download.
        timeout: Maximum number of seconds to wait for the download. It is
            also passed to ``requests.get`` so the request itself is bounded.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        TimeoutException: If the download does not finish in time, either
            because the worker is still running after ``timeout`` seconds or
            because ``requests`` reported a timeout.
        Exception: Any other error raised while requesting or parsing the
            page is re-raised in the calling thread.
    """
    outcome = {}

    def fetch_soup():
        try:
            header = {'User-Agent': str(UserAgent().chrome)}

            # Send an HTTP request to the URL. The request gets its own
            # timeout so an abandoned worker cannot hang forever.
            response = requests.get(url, headers=header, timeout=timeout)
            outcome['soup'] = BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            outcome['error'] = e

    # daemon=True: a worker that outlives the wait must not keep the
    # interpreter alive.
    thread = threading.Thread(target=fetch_soup, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        raise TimeoutException("Timeout while fetching the URL")

    error = outcome.get('error')
    if isinstance(error, requests.exceptions.Timeout):
        # Report request-level timeouts the same way as a join timeout so
        # callers that retry on TimeoutException keep retrying.
        raise TimeoutException("Timeout while fetching the URL") from error
    if error is not None:
        raise error

    return outcome.get('soup')

In [5]:
def contains_required_prods(line):
    """Tell whether ``line`` contains one of the image descriptions to keep.

    Args:
        line: Text to inspect.

    Returns:
        bool: True if at least one of the wanted phrases occurs in ``line``,
        otherwise False.
    """
    wanted_phrases = (
        "Articolo senza modello",
        "Dettaglio dell'articolo 8",
        "Retro dell'articolo",
    )

    for phrase in wanted_phrases:
        if phrase in line:
            return True
    return False

In [6]:
def get_image_links(prod_soup):
    """Collect the ``src`` URLs of the product images that should be kept.

    Every ``<img>`` matched by ``'ul li button img'`` is examined; an image is
    kept when its ``alt`` text passes ``contains_required_prods``. Images
    without an ``alt`` or without a ``src`` attribute are skipped instead of
    raising ``KeyError``.

    Args:
        prod_soup: Parsed product page; must provide ``select(css_selector)``.

    Returns:
        list: The ``src`` values of the kept images, in document order. The
        list is also printed.
    """
    images = []
    for img in prod_soup.select('ul li button img'):
        alt_text = img.get('alt') or ''
        src = img.get('src')
        # A missing src leaves nothing to store, so skip the image.
        if src and contains_required_prods(alt_text):
            images.append(src)

    print(images)
    return images

In [7]:
def get_description(prod_soup):
    """Build the product description from two paragraphs of the product page.

    The text of the first paragraph inside the description container is joined,
    with a single space, to the text of the first paragraph inside the
    properties container.

    Args:
        prod_soup: Parsed product page; must provide ``select(css_selector)``.

    Returns:
        str: ``"<description text> <properties text>"``.

    Raises:
        IndexError: If either selector matches nothing.
    """
    summary_nodes = prod_soup.select('div.Description_description__IDgi6 p')
    summary_text = summary_nodes[0].text

    property_nodes = prod_soup.select('div.ProductDetail_properties__UStvB div.PropertiesContent_content__T7Hg5 p')
    property_text = property_nodes[0].text

    return f"{summary_text} {property_text}"

    

In [8]:
import re

def get_price(prod_soup):
    """Extract the product price from the page as a float.

    The text of the first ``span.SinglePrice_end__Hz2J7`` element is stripped
    of everything except digits, ``','`` and ``'.'``. Every ``'.'`` is then
    dropped (treated as a thousands separator) and every ``','`` becomes the
    decimal point, e.g. ``"1.299,00"`` -> ``1299.0``.

    Args:
        prod_soup: Parsed product page; must provide ``select(css_selector)``.

    Returns:
        float: The parsed price.

    Raises:
        IndexError: If the price element is not found.
        ValueError: If the remaining text is not a valid number.
    """
    price_text = prod_soup.select('span.SinglePrice_end__Hz2J7')[0].text

    # Keep only digits and the two separator characters.
    digits_and_separators = re.sub(r'[^\d,\.]', '', price_text)

    # '.' is removed first, then ',' is promoted to the decimal point.
    normalized = digits_and_separators.replace('.', '').replace(',', '.')

    return float(normalized)

In [9]:
def mango(category_url,n_products, Clothing_type,Testing):
    """Scrape products from a Mango category page and store them in the database.

    The category page is rendered with ``create_soup``. Each listed product
    page is then downloaded with ``create_simple_soup``, and its image links,
    description and price are extracted and inserted through
    ``SQLf.sql_db_functions.insert_description_image_to_db``.

    A product whose download times out is retried up to three times. Any other
    error is printed and that product is skipped.

    Args:
        category_url: URL of the category listing page.
        n_products: Number of listing entries to process; skipped entries
            count towards this number.
        Clothing_type: Forwarded unchanged to the insert call.
        Testing: Forwarded unchanged to the insert call.

    Returns:
        ``False`` if a product still timed out after three attempts (scraping
        stops at that point); otherwise ``None``.
    """
    soup = create_soup(category_url)
    item_list = soup.select('div.ProductImage_productImage__cS5d9')

    br = 0  # index of the listing entry currently being processed
    for item in item_list:
        if br == n_products:
            break

        retry_attempts = 3
        while retry_attempts > 0:
            try:
                # Scrape product details
                try:
                    prod_url = item.a['href']
                    prod_soup = create_simple_soup(prod_url)
                except TimeoutException:
                    print(f"Timeout fetching product {br}. Retrying in 5 seconds...")
                    time.sleep(5)
                    retry_attempts -= 1
                    continue
                except Exception as e:
                    print(f"Error fetching product {br}: {e}")
                    break

                prod_images_links = get_image_links(prod_soup=prod_soup)
                prod_description = get_description(prod_soup=prod_soup)
                prod_price = get_price(prod_soup=prod_soup)

                print(f'Starting product {br}')

                # Pauses kept from the original flow (presumably request pacing).
                time.sleep(3)
                time.sleep(1)

                conn, cursor = SQLf.sql_db_functions.connect_sql()
                try:
                    SQLf.sql_db_functions.insert_description_image_to_db(
                        conn=conn,
                        cursor=cursor,
                        brand='Mango',
                        descript=prod_description,
                        price=prod_price,
                        prod_link = prod_url,
                        Clothing_type = Clothing_type,
                        images_links=prod_images_links,
                        Testing = Testing
                    )
                finally:
                    # Release the DB resources even if the insert fails, and
                    # close the cursor before the connection that owns it.
                    try:
                        cursor.close()
                    finally:
                        conn.close()

                time.sleep(10)
                break  # Exit the retry loop if successful
            except Exception as e:
                print(f"Error processing product {br}: {e}")
                break  # Exit the retry loop if an exception occurs

        if retry_attempts == 0:
            print(f"Failed to process product {br} after 3 attempts. Exiting function.")
            return False  # Exit the function if failed after 3 attempts

        br += 1



# Execute scraper

In [10]:
# Scrape up to 80 products from the category page below and store them
# under the 'CAMICIE DONNA' clothing type with Testing switched off.
link = 'https://shop.mango.com/it/it/c/donna/bluse-e-camicie_b8003173'

mango(
    category_url=link,
    n_products=80,
    Clothing_type='CAMICIE DONNA',
    Testing=False,
)

['https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67030437_01_R.jpg?imwidth=2048&imdensity=1&ts=1706261699916', 'https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67030437_01_D8.jpg?imwidth=2048&imdensity=1&ts=1697623377637', 'https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67030437_01_B.jpg?imwidth=2048&imdensity=1&ts=1697623377637']
Starting product 0
inserting
Brand_Prod_id generated: 83
Data inserted successfully to DB
['https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67046320_37_R.jpg?imwidth=2048&imdensity=1&ts=1707472987889', 'https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67046320_37_D8.jpg?imwidth=2048&imdensity=1&ts=1706632395332', 'https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67046320_37_B.jpg?imwidth=2048&imdensity=1&ts=1706632395332']
Starting product 1
inserting
Brand_Prod_id generated: 84
Data inserted successfully to DB
['https://shop.mango.com/assets/rcs/pics/static/T6/fotos/S/67084036_10_R.jpg?imwidth=2048&imdens