In [1]:
import pandas as pd
import time
import os
from selenium import webdriver  
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys  
from bs4 import BeautifulSoup as soup

# Initialize the browser once
service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
browser.get("https://www.jumia.com.ng/")
time.sleep(2)

products = [
    "infinix phones",
    "samsung phones",
    "tecno phones",
    "redmi phones",
    "iphone phones"
] 

product_urls = []

for product in products:
    try:
        print(f"\n🔍 Searching: {product}")
        browser.get("https://www.jumia.com.ng/")
        time.sleep(3)

        search_box = browser.find_element(By.XPATH, '//*[@id="fi-q"]')
        search_box.click()
        search_box.send_keys(Keys.CONTROL, 'a')
        search_box.send_keys(Keys.BACKSPACE)
        time.sleep(1)
        search_box.send_keys(product)
        search_box.send_keys(Keys.ENTER)
        time.sleep(5)

        total_collected = 0
        page_number = 1
        max_per_product = 20

        while total_collected < max_per_product:
            html_source = browser.page_source
            page_soup = soup(html_source, 'html.parser')
            containers = page_soup.find_all("a", {"class": "core"})

            if not containers:
                print(f"⚠️ No more products on page {page_number}")
                break

            new_links = 0
            for container in containers:
                url = container.get("href")
                if url:
                    full_url = f"https://www.jumia.com.ng{url}"
                    if full_url not in product_urls:
                        product_urls.append(full_url)
                        total_collected += 1
                        new_links += 1
                        if total_collected >= max_per_product:
                            break

            print(f"✅ Page {page_number}: {new_links} new links added")

            try:
                next_btn = browser.find_element(By.CSS_SELECTOR, 'a.pg[aria-label="Next"]')
                browser.execute_script("arguments[0].scrollIntoView();", next_btn)
                next_btn.click()
                page_number += 1
                time.sleep(5)
            except:
                print("🚫 No more pages available.")
                break

    except Exception as e:
        print(f"❌ Error scraping {product}: {e}")
        continue

# Save URLs
pd.DataFrame(product_urls, columns=["product_URL"]).to_csv("jumia_multibrand_urls.csv", index=False)
print(f"\n✅ Total unique product URLs collected: {len(product_urls)}")
browser.quit()

# -------------------------------
# SCRAPE EACH PRODUCT PAGE
# -------------------------------
browser = webdriver.Chrome(service=service)

# Load previous data if it exists
previous_data = {}
if os.path.exists("scraped_jumia_multibrand_products.csv"):
    previous_df = pd.read_csv("scraped_jumia_multibrand_products.csv")
    previous_data = {row['URL']: row for _, row in previous_df.iterrows()}

# Lists to store data
product_name = []
price = []
product_brand = []
product_rating = []
product_reviews = []
product_image = []
product_unit = []
discount = []
original_price = []
final_urls = []

for index, url in enumerate(product_urls):
    print(f"Scraping {index + 1}/{len(product_urls)}: {url}")
    browser.get(url)
    time.sleep(5)
    html_source = browser.page_source
    page_soup = soup(html_source, 'html.parser')

    # Brand
    try:
        all_links = page_soup.find_all("a", {"class": "_more"})
        brand = None
        for link in all_links:
            href = link.get("href", "")
            if href.startswith("/") and "/sp-" not in href and "privacy" not in href:
                brand = link.text.strip()
                break
        if not brand:
            brand = "No brand found"
    except:
        brand = "No brand found"

    # Name
    try:
        name = page_soup.find("h1", {"class": "-fs20 -pts -pbxs"}).text.strip()
    except:
        name = "No name found"

    # Current Price
    try:
        new_price = page_soup.find("span", {"class": "-b -ubpt -tal -fs24 -prxs"}).text.strip()
    except:
        new_price = "No price found"

    # Original Price
    try:
        orig_price = page_soup.find("span", {"class": "-tal -gy5 -lthr -fs16 -pvxs -ubpt"}).text.strip()
    except:
        orig_price = "No original price"

    # Discount
    try:
        disc = page_soup.find("span", {"class": "bdg _dsct _dyn -mls"}).text.strip()
    except:
        disc = "No discount"

    # Image
    try:
        image = page_soup.find("img", {"class": "-fw -fh"})["src"]
    except:
        image = "No image"

    # Rating
    try:
        rating = page_soup.find("div", {"class": "stars _m _al"}).text.strip()
    except:
        rating = "No rating"

    # Reviews
    try:
        reviews = page_soup.find("a", {"class": "-plxs _more"}).text.strip()
    except:
        reviews = "No reviews"

    # Unit Left
    try:
        units = page_soup.find("p", {"class": "-df -i-ctr -fs12 -pbs -yl7"}).text.strip()
    except:
        units = "No unit left"

    # Store
    product_brand.append(brand)
    product_name.append(name)
    price.append(new_price)
    product_rating.append(rating)
    product_reviews.append(reviews)
    product_unit.append(units)
    discount.append(disc)
    original_price.append(orig_price)
    product_image.append(image)
    final_urls.append(url)

    print(f"✔️ Done: {name}")

# Final Data
data = {
    "Brand": product_brand,
    "Name": product_name,
    "Price": price,
    "Rating": product_rating,
    "NumberOfReviews": product_reviews,
    "Original Price": original_price,
    "Unit Left": product_unit,
    "Discount": discount,
    "Image": product_image,
    "URL": final_urls
}

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("scraped_jumia_multibrand_products.csv", index=False)
browser.quit()

print("✅ Scraping complete. Data saved to 'scraped_jumia_multibrand_products.csv'")



🔍 Searching: infinix phones
✅ Page 1: 20 new links added
🚫 No more pages available.

🔍 Searching: samsung phones
✅ Page 1: 20 new links added
🚫 No more pages available.

🔍 Searching: tecno phones
✅ Page 1: 20 new links added
🚫 No more pages available.

🔍 Searching: redmi phones
✅ Page 1: 20 new links added
🚫 No more pages available.

🔍 Searching: iphone phones
✅ Page 1: 20 new links added
🚫 No more pages available.

✅ Total unique product URLs collected: 100
Scraping 1/100: https://www.jumia.com.ng/infinix-smart-9hd-6.7-4gb-ram64gb-rom-jumia-only-black-411755963.html
✔️ Done: Infinix Smart 9HD 6.7" 4GB RAM/64GB ROM Jumia Only Black
Scraping 2/100: https://www.jumia.com.ng/infinix-smart-9hd-6.7-4gb-ram64gb-rom-titanium-384822967.html
✔️ Done: Infinix Smart 9HD 6.7" 4GB RAM/64GB ROM Titanium
Scraping 3/100: https://www.jumia.com.ng/infinix-hot-50i-smart-phone-with-128gb-rom-4gb-ram-380388798.html
✔️ Done: Infinix Hot 50i Smart Phone With 128GB ROM & 4GB RAM
Scraping 4/100: https://www.j