In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [22]:
import time

# Site root; extract_product_links prepends it to the hrefs found on listing pages.
BASE_URL = "https://betocosmetics.com"
# Collection listing passed to scrape_all_product_links, which paginates it via ?page=N.
COLLECTION_URL = "https://betocosmetics.com/collections/skin-care"
# HTTP headers sent with every request issued by get_soup.
# NOTE(review): this User-Agent string presents the scraper as Googlebot — confirm
# that this is intended and acceptable for the target site.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}

def get_soup(url, timeout=30):
    """Download ``url`` and return the response body parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        crawl indefinitely.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (``raise_for_status``).
    requests.RequestException
        For connection problems or when ``timeout`` is exceeded.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_product_links(soup):
    """Return the absolute product URLs found on one collection listing page.

    Every ``<a class="productitem--image-link">`` that carries an ``href`` is
    resolved against ``BASE_URL``. Resolving with ``urljoin`` (instead of plain
    string concatenation) keeps already-absolute hrefs intact and inserts the
    missing slash for hrefs that do not start with ``/``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page.

    Returns
    -------
    list of str
        One URL per matching anchor, in document order (duplicates kept).
    """
    from urllib.parse import urljoin

    product_tags = soup.find_all("a", class_="productitem--image-link")
    links = []
    for tag in product_tags:
        href = tag.get('href')
        if href:  # skip anchors without a usable target
            links.append(urljoin(BASE_URL, href))
    return links

def scrape_all_product_links(start_url):
    """Walk the paginated collection at ``start_url`` and collect product URLs.

    Pages are requested as ``page=1, 2, ...`` until a page yields no product
    links, or yields only links that were already collected (which guards
    against shops that keep serving the last page for out-of-range page
    numbers and would otherwise make the loop endless).

    Parameters
    ----------
    start_url : str
        URL of the collection listing.

    Returns
    -------
    list of str
        Unique product URLs in the order they were first seen.

    Raises
    ------
    Whatever ``get_soup`` raises for a failed request.
    """
    page = 1
    # A dict keeps insertion order, so the result is de-duplicated *and*
    # deterministic (a set would return the links in arbitrary order).
    seen = {}
    # Append the page parameter correctly even if start_url already has a query.
    separator = "&" if "?" in start_url else "?"

    while True:
        paged_url = f"{start_url}{separator}page={page}"
        print(f"Scraping page {page}...")

        soup = get_soup(paged_url)
        links = extract_product_links(soup)

        if not links:
            print("No more products found. Stopping.")
            break

        new_links = [link for link in links if link not in seen]
        if not new_links:
            # The page repeated earlier results only: pagination is exhausted.
            print("No new products found. Stopping.")
            break

        seen.update(dict.fromkeys(new_links))

        page += 1
        time.sleep(5)  # be polite to the server

    return list(seen)

In [23]:
links_list = []

In [24]:
links_list = scrape_all_product_links(COLLECTION_URL)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
No more products found. Stopping.


### Writing extraction functions

In [5]:
def get_product_title(n):
    """Return the stripped text of the first <h1> found with the 'product-title' filter.

    ``n`` is the parsed product page. An empty string is returned when the
    lookup raises AttributeError (for example because nothing was found).
    """
    try:
        heading = n.find('h1', attrs='product-title')
        return heading.text.strip()
    except AttributeError:
        return ""

In [101]:
get_product_title(new_page)

'7 days 7 nights Brightening Lotion 500ml'

In [6]:
def get_product_vendor(n):
    """Return the stripped text of the first <div> found with the 'product-vendor' filter.

    The text is returned as shown on the page. Falls back to an empty string
    when the lookup raises AttributeError.
    """
    try:
        vendor_node = n.find('div', attrs='product-vendor')
        vendor = vendor_node.text.strip()
    except AttributeError:
        return ""
    return vendor
    

In [104]:
get_product_vendor(new_page)

'by Dodo Cosmetics'

In [7]:
def get_product_availability(n):
    """Return the stripped stock-level text of the product page ``n``.

    Looks up the first <div> with the 'product-stock-level__badge-text'
    filter; yields an empty string when the lookup raises AttributeError.
    """
    stock_text = ""
    try:
        stock_text = n.find('div', attrs='product-stock-level__badge-text').text.strip()
    except AttributeError:
        pass  # element missing: keep the empty default
    return stock_text
    

In [106]:
get_product_availability(new_page)

'Out of stock'

In [8]:
def get_product_badge(n):
    """Return the stripped text of the first <span> found with the 'product__badge' filter.

    An empty string is returned when the lookup raises AttributeError
    (for instance when the page shows no badge).
    """
    try:
        badge_node = n.find('span', attrs='product__badge')
        return badge_node.text.strip()
    except AttributeError:
        return ""

In [108]:
get_product_badge(new_page)

'Sold out'

In [9]:
def get_price(soup):
    """Return the displayed price text of a product page, or "" if unavailable.

    The price is read from the <span class="money"> nested inside
    <div class="price__current">. Any exception raised during the lookup is
    swallowed and reported as an empty string.
    """
    try:
        current = soup.find('div', class_='price__current')
        if not current:
            return ""
        money = current.find('span', class_='money')
        if not money:
            return ""
        return money.text.strip()
    except Exception:
        return ""

In [110]:
get_price(new_page)

'5.000 BD'

In [10]:
def get_product_image_url(n, base_url='https://betocosmetics.com'):
    """Return the absolute URL of the main gallery image of product page ``n``.

    The ``src`` of the first ``<img class="product-gallery--loaded-image">`` is
    resolved against ``base_url``. This handles protocol-relative sources
    (``//host/path`` becomes ``https://host/path``), leaves already-absolute
    URLs untouched and completes root-relative paths, whereas blindly
    prefixing ``https:`` produced malformed URLs for the latter two.

    Parameters
    ----------
    n : BeautifulSoup or Tag
        Parsed product page.
    base_url : str, optional
        URL used to resolve relative and protocol-relative sources.

    Returns
    -------
    str
        The resolved image URL, or ``''`` when the page, the image element or
        its ``src`` is missing.
    """
    from urllib.parse import urljoin

    try:
        img = n.find('img', class_='product-gallery--loaded-image')
    except AttributeError:
        # Same contract as the other extractors: an unusable page yields ''.
        return ''

    src = img.get('src') if img else None
    if not src:
        return ''
    return urljoin(base_url, src.strip())

In [112]:
get_product_image_url(new_page)

'https://betocosmetics.com/cdn/shop/files/C7A4FAA8-92B8-4DD5-AD78-5A28071DBB55_305x440.jpg?v=1732245682'

In [11]:
import re

def get_description(n):
    """Return the product description of page ``n`` as one whitespace-normalised line.

    The text of the <div> looked up with class_='product-description rte' is
    extracted with single-space separators, then every run of whitespace is
    collapsed to one space. An empty string is returned when the lookup
    raises AttributeError.
    """
    try:
        block = n.find('div', class_='product-description rte')
        raw_text = block.get_text(separator=' ', strip=True)
        # Collapse runs of spaces, tabs and newlines into a single space
        return re.sub(r'\s+', ' ', raw_text)
    except AttributeError:
        return ""

In [114]:
    get_description(new_page)

'"7 DAYS NIGHTS" Super Lightening Body Lotion The lightening body lotion 7 days 7 nights fights all skin imperfections uneven complexion, black spots actively fight the black surfaces of your skin to give you skin softness and a clearer complexion. This body lotion also contains snail slime which is a product natural whose objective is to firm and rejuvenate your skin'

In [12]:
def get_product_line(soup):
    """Return the text of the last '/collections/...' link in the breadcrumb, or ''.

    The breadcrumb is taken from <nav class="breadcrumbs-container">, falling
    back to <div class="breadcrumbs-delimiter">. Any exception raised during
    the lookup is swallowed and reported as an empty string.
    """
    def _is_collection_href(href):
        return href and href.startswith('/collections/')

    try:
        # Restrict the search to the breadcrumb area of the page
        crumbs = soup.find('nav', class_='breadcrumbs-container')
        if not crumbs:
            crumbs = soup.find('div', class_='breadcrumbs-delimiter')
        if not crumbs:
            return ''

        collection_links = crumbs.find_all('a', href=_is_collection_href)
        if not collection_links:
            return ''

        # The last collection link is taken as the most specific category
        return collection_links[-1].get_text(strip=True)
    except Exception:
        return ''

In [13]:
def get_collection_links(soup):
    """Return every <a> in ``soup`` whose href starts with '/collections/'."""
    def _points_to_collection(href):
        return href and href.startswith('/collections/')

    return soup.find_all('a', href=_points_to_collection)

In [82]:
get_product_line(new_page)

'Skin Care'

In [73]:
def get_collection_links(soup):
    """Return all anchors of ``soup`` that link into '/collections/'.

    NOTE(review): an identical definition of this function appears in an
    earlier cell of the notebook; one of the two can be dropped.
    """
    def _is_collection(href):
        return href and href.startswith('/collections/')

    anchors = soup.find_all('a', href=_is_collection)
    return anchors


In [14]:
def get_website_name(soup):
    """Return the site name taken from the trailing part of the page <title>.

    The title is expected to look like ``"<page> — <site>"``. The part after
    the last em dash is returned; if there is no em dash, a spaced en dash or
    spaced hyphen (``" – "`` / ``" - "``) is used as the separator instead.
    Requiring the surrounding spaces keeps hyphenated words intact: a title
    such as ``"Anti-Aging Cream"`` is returned unchanged rather than being cut
    down to ``"Aging Cream"``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page.

    Returns
    -------
    str
        The site name, the whole title when no separator is present, or ``""``
        when the page has no usable title.
    """
    try:
        title_text = soup.title.string.strip()
    except AttributeError:
        # No <title>, or a title without a plain string
        return ""

    # The em dash is split with or without surrounding spaces, as before.
    if '—' in title_text:
        return title_text.rsplit('—', 1)[-1].strip()

    for separator in (' – ', ' - '):
        if separator in title_text:
            return title_text.rsplit(separator, 1)[-1].strip()

    return title_text  # fallback: no recognisable separator

In [164]:
get_website_name(new_page)

'Beto Cosmetics'

In [139]:
soup.find('title')

<title>Skin Care — Beto Cosmetics</title>

In [123]:
get_website_name(new_page)

'betocosmetics.com'

In [120]:
get_domain(URL)

'betocosmetics.com'

In [25]:
# Column-oriented accumulator for the scrape: one list per output column.
# The scraping loop appends one value to every list per product, so this cell
# must be re-run before re-running the loop, otherwise rows are duplicated.
d = {'product_name':[],'product_line':[],'vendor':[],'availability':[],'badge':[],'price':[],'image_url':[],'description':[],'website_name':[]}

In [26]:
# Visit every product page and append one row per product to the column lists in `d`.
for link in links_list:
    # Links collected above are normally absolute; keep the fallback for relative ones.
    if link.startswith("http"):
        full_url = link
    else:
        full_url = 'https://betocosmetics.com' + link

    # get_soup raises on HTTP/network errors (it never returns a falsy value),
    # so a failed download has to be caught here to skip just this product
    # instead of aborting the whole crawl.
    try:
        page = get_soup(full_url)
    except requests.RequestException as e:
        print(f"Failed to load page: {full_url} ({e})")
        continue

    try:
        product_data = {
            'product_name': get_product_title(page),
            'product_line': get_product_line(page),
            'vendor': get_product_vendor(page),
            'availability': get_product_availability(page),
            'badge': get_product_badge(page),
            'price': get_price(page),
            'image_url': get_product_image_url(page),
            'description': get_description(page),
            'website_name': get_website_name(page),
        }
    except Exception as e:
        print(f"Error processing {full_url}: {e}")
        continue

    # Append only after the whole record was built so all columns stay the same length.
    for key in d:
        d[key].append(product_data.get(key, ""))

    time.sleep(1)  # be polite to the server, as in the listing crawl


In [27]:
df = pd.DataFrame.from_dict(d)

In [28]:
df.shape

(445, 9)

In [29]:
df.tail(10)

Unnamed: 0,product_name,product_line,vendor,availability,badge,price,image_url,description,website_name
435,Orbi 20 Anti Marks Papaya Face Cream,Skin Care,by Orbis 20,"in stock, ready to be shipped",,3.000 BD,https://betocosmetics.com/cdn/shop/files/s-l12...,Orbi 20 Anti-pimple glow face cream smooths an...,Beto Cosmetics
436,B.B. Clear Unifying Skin lightening soap,Skin Care,by Rodis,"in stock, ready to be shipped",,3.000 BD,https://betocosmetics.com/cdn/shop/products/bb...,B.b. Clear Unifying Skin lightening soap B.b. ...,Beto Cosmetics
437,Jaune d' Oeuf Clarifying & Treatment Egg Yolk ...,Skin Care,by Jaune D'Oeuf,"in stock, ready to be shipped",,2.500 BD,https://betocosmetics.com/cdn/shop/files/05_62...,Jaune D'oeuf Huile (Egg Yolk ) Treating & Clar...,Beto Cosmetics
438,New Light Zaban Cream,Skin Care,by Others,"in stock, ready to be shipped",,3.000 BD,https://betocosmetics.com/cdn/shop/products/Ot...,New Light Zaban Cream is a cosmetic product th...,Beto Cosmetics
439,Fair & White Gold 2 Maxi Tone Body Lotion - Br...,Skin Care,by F&W,Out of stock,Sold out,15.000 BD,https://betocosmetics.com/cdn/shop/files/51KFJ...,,Beto Cosmetics
440,Paw Paw Clarifying Soap with Vitamin E and Pap...,Skin Care,by Dream Cosmetics,"in stock, ready to be shipped",,2.500 BD,https://betocosmetics.com/cdn/shop/files/51VnX...,Paw Paw Skin Clarifying Papaya Cream The benef...,Beto Cosmetics
441,Mamado Papaya Lightening serum Vitamin E Enric...,Skin Care,by Mamado,"in stock, ready to be shipped",,6.500 BD,https://betocosmetics.com/cdn/shop/products/87...,"About this item 100% Natural , NO SIDE EFFECTS...",Beto Cosmetics
442,"Clere Lanolin and Glycerin Body Cream, 300 ml",Skin Care,by Clere,"in stock, ready to be shipped",,3.000 BD,https://betocosmetics.com/cdn/shop/files/61Fp9...,Buy Clere Body Creme Nourishing Lanolin & Glyc...,Beto Cosmetics
443,Medix 5.5 Vitamin C Clinical Solutions Cream 5...,Skin Care,by Medix 5.5,"in stock, ready to be shipped",,5.000 BD,https://betocosmetics.com/cdn/shop/files/51692...,Brightening: Vitamin C is known for its abilit...,Beto Cosmetics
444,Moco De Gorila Gel Extreme Hold 340g (Yellow),Skin Care,by Moco De Gorilla,"in stock, ready to be shipped",,4.000 BD,https://betocosmetics.com/cdn/shop/files/71SRH...,Holds approx. 11.9 oz. Maximum hold formula. H...,Beto Cosmetics


### Preparing data for analysis

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  445 non-null    object
 1   product_line  445 non-null    object
 2   vendor        445 non-null    object
 3   availability  445 non-null    object
 4   badge         445 non-null    object
 5   price         445 non-null    object
 6   image_url     445 non-null    object
 7   description   445 non-null    object
 8   website_name  445 non-null    object
dtypes: object(9)
memory usage: 31.4+ KB


In [67]:
df1 = df.copy()

In [66]:
# clean product data
def clean_product_data(df):
    """
    Clean the scraped product DataFrame **in place** and return it.

    Steps:
    - Strips the leading 'by ' from vendor names and renames the column
      'vendor' to 'brand'
    - Extracts the numeric price into a float column 'price_num'
    - Replaces empty badges with 'In stock' and collapses repeated words
    - Fills missing values with '' in every column except 'price_num'

    The function works on the frame it is given (it previously ignored the
    argument and modified the global ``df1``). The frame is still modified in
    place because existing cells call the function without using the return
    value. Calling it again on an already-cleaned frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped data with at least the columns 'price' and 'badge', plus
        'vendor' (or 'brand' if it was cleaned before).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # 1. Remove 'by ' prefix from vendor names, then rename 'vendor' to 'brand'.
    #    Skipped when the column was already renamed by an earlier run.
    if 'vendor' in df.columns:
        df['vendor'] = df['vendor'].str.replace(r'^by\s+', '', regex=True).str.strip()
        df.rename(columns={'vendor': 'brand'}, inplace=True)

    # 2. Extract numeric value from price column and convert to float
    df['price_num'] = df['price'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)

    # 3. Clean badge field (normalize empty vs 'Sold out')
    df['badge'] = df['badge'].str.strip().replace('', 'In stock')

    # 4. Fill missing values with empty strings in the text columns only;
    #    unparseable prices must stay NaN so 'price_num' remains numeric.
    for col in df.columns:
        if col != 'price_num':
            df[col] = df[col].fillna('')

    # 5. Collapse immediately repeated words in the badge (e.g. 'Sale Sale' -> 'Sale')
    df['badge'] = df['badge'].str.replace(r'\b(\w+)(\s+\1)+\b', r'\1', regex=True)

    return df


In [68]:
clean_product_data(df1)

Unnamed: 0,product_name,product_line,brand,availability,badge,price,image_url,description,website_name,price_num
0,Clear Essence Platinum Medicated Fade Creme Wi...,Skin Care,Clear Essence,Out of stock,Sold out,6.000 BD,https://betocosmetics.com/cdn/shop/files/clear...,Clear Essence® Platinum Medicated Fade Creme W...,Beto Cosmetics,6.0
1,Lalala Huile Eclaircissante Lightening Oil 100ml,Skin Care,La La La,Out of stock,Sold out,5.000 BD,https://betocosmetics.com/cdn/shop/files/lala_...,Lalala Lightening Oil is in a multi-lightening...,Beto Cosmetics,5.0
2,Fair & White So White Skin perfector Cream Jar,Skin Care,Fair and White,Low stock,In stock,15.000 BD,https://betocosmetics.com/cdn/shop/products/Fa...,Formulated with antibacterial properties and p...,Beto Cosmetics,15.0
3,NATURE SECRETE Whitening and Exfoliating Gomma...,Skin Care,Nature Secrete,"in stock, ready to be shipped",In stock,5.000 BD,https://betocosmetics.com/cdn/shop/products/Na...,NATURE SECRETE Natural Exfoliating Gommant Soa...,Beto Cosmetics,5.0
4,Melano Melaakne Acne Soap 100g,Skin Care,Melano,"in stock, ready to be shipped",In stock,2.500 BD,https://betocosmetics.com/cdn/shop/files/image...,Melano Melaakne Acne Soap 100g Key features Hi...,Beto Cosmetics Melano Melaakne Acne Soap 100g,2.5
...,...,...,...,...,...,...,...,...,...,...
440,Paw Paw Clarifying Soap with Vitamin E and Pap...,Skin Care,Dream Cosmetics,"in stock, ready to be shipped",In stock,2.500 BD,https://betocosmetics.com/cdn/shop/files/51VnX...,Paw Paw Skin Clarifying Papaya Cream The benef...,Beto Cosmetics,2.5
441,Mamado Papaya Lightening serum Vitamin E Enric...,Skin Care,Mamado,"in stock, ready to be shipped",In stock,6.500 BD,https://betocosmetics.com/cdn/shop/products/87...,"About this item 100% Natural , NO SIDE EFFECTS...",Beto Cosmetics,6.5
442,"Clere Lanolin and Glycerin Body Cream, 300 ml",Skin Care,Clere,"in stock, ready to be shipped",In stock,3.000 BD,https://betocosmetics.com/cdn/shop/files/61Fp9...,Buy Clere Body Creme Nourishing Lanolin & Glyc...,Beto Cosmetics,3.0
443,Medix 5.5 Vitamin C Clinical Solutions Cream 5...,Skin Care,Medix 5.5,"in stock, ready to be shipped",In stock,5.000 BD,https://betocosmetics.com/cdn/shop/files/51692...,Brightening: Vitamin C is known for its abilit...,Beto Cosmetics,5.0


### Feature Engineering

In [69]:
import re

def classify_product(name):
    """Map a product name to a coarse product type.

    The name is searched case-insensitively for the keywords lotion, cream,
    serum, soap, oil and butter (in that priority order; an optional plural
    's' is accepted). Keywords must appear as whole words, so names such as
    "Eau de Toilette" or "... for Oily Skin" are no longer classified as
    'Oil' just because they contain the letters "oil".

    Parameters
    ----------
    name : str
        Product name. Non-string values (e.g. None or NaN) yield 'Other'.

    Returns
    -------
    str
        One of 'Lotion', 'Cream', 'Serum', 'Soap', 'Oil', 'Butter', 'Other'.
    """
    if not isinstance(name, str):
        return 'Other'

    lowered = name.lower()
    # Order matters: the first matching keyword wins.
    keywords = (
        ('lotion', 'Lotion'),
        ('cream', 'Cream'),
        ('serum', 'Serum'),
        ('soap', 'Soap'),
        ('oil', 'Oil'),
        ('butter', 'Butter'),
    )
    for keyword, label in keywords:
        if re.search(r'\b' + keyword + r's?\b', lowered):
            return label
    return 'Other'

# Derive a coarse product_type column by applying classify_product to every product_name.
df1['product_type'] = df1['product_name'].apply(classify_product)

In [70]:
df1.tail(9)

Unnamed: 0,product_name,product_line,brand,availability,badge,price,image_url,description,website_name,price_num,product_type
436,B.B. Clear Unifying Skin lightening soap,Skin Care,Rodis,"in stock, ready to be shipped",In stock,3.000 BD,https://betocosmetics.com/cdn/shop/products/bb...,B.b. Clear Unifying Skin lightening soap B.b. ...,Beto Cosmetics,3.0,Soap
437,Jaune d' Oeuf Clarifying & Treatment Egg Yolk ...,Skin Care,Jaune D'Oeuf,"in stock, ready to be shipped",In stock,2.500 BD,https://betocosmetics.com/cdn/shop/files/05_62...,Jaune D'oeuf Huile (Egg Yolk ) Treating & Clar...,Beto Cosmetics,2.5,Oil
438,New Light Zaban Cream,Skin Care,Others,"in stock, ready to be shipped",In stock,3.000 BD,https://betocosmetics.com/cdn/shop/products/Ot...,New Light Zaban Cream is a cosmetic product th...,Beto Cosmetics,3.0,Cream
439,Fair & White Gold 2 Maxi Tone Body Lotion - Br...,Skin Care,F&W,Out of stock,Sold out,15.000 BD,https://betocosmetics.com/cdn/shop/files/51KFJ...,,Beto Cosmetics,15.0,Lotion
440,Paw Paw Clarifying Soap with Vitamin E and Pap...,Skin Care,Dream Cosmetics,"in stock, ready to be shipped",In stock,2.500 BD,https://betocosmetics.com/cdn/shop/files/51VnX...,Paw Paw Skin Clarifying Papaya Cream The benef...,Beto Cosmetics,2.5,Soap
441,Mamado Papaya Lightening serum Vitamin E Enric...,Skin Care,Mamado,"in stock, ready to be shipped",In stock,6.500 BD,https://betocosmetics.com/cdn/shop/products/87...,"About this item 100% Natural , NO SIDE EFFECTS...",Beto Cosmetics,6.5,Serum
442,"Clere Lanolin and Glycerin Body Cream, 300 ml",Skin Care,Clere,"in stock, ready to be shipped",In stock,3.000 BD,https://betocosmetics.com/cdn/shop/files/61Fp9...,Buy Clere Body Creme Nourishing Lanolin & Glyc...,Beto Cosmetics,3.0,Cream
443,Medix 5.5 Vitamin C Clinical Solutions Cream 5...,Skin Care,Medix 5.5,"in stock, ready to be shipped",In stock,5.000 BD,https://betocosmetics.com/cdn/shop/files/51692...,Brightening: Vitamin C is known for its abilit...,Beto Cosmetics,5.0,Cream
444,Moco De Gorila Gel Extreme Hold 340g (Yellow),Skin Care,Moco De Gorilla,"in stock, ready to be shipped",In stock,4.000 BD,https://betocosmetics.com/cdn/shop/files/71SRH...,Holds approx. 11.9 oz. Maximum hold formula. H...,Beto Cosmetics,4.0,Other


In [64]:
len(df1)

445

In [72]:
df1.to_csv('skincare2.csv', index = False)