<a href="https://colab.research.google.com/github/RichardKameri/24s/blob/main/Jumia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL (you can change the search query)
BASE_URL = "https://www.jumia.co.ke/catalog/?q=smartphones"

# Set up headers to mimic a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Number of pages to scrape
NUM_PAGES = 3

# Seconds to wait for a response before giving up on a page; without a
# timeout, requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 15


def _parse_rating(stars_tag):
    """Convert the star widget's inner bar width into a rating out of 5.

    Args:
        stars_tag: the star-widget tag for one product, or None if the
            product has no such element.

    Returns:
        The rating rounded to one decimal place, or "N/A" when the widget,
        its first nested <div>, or a parsable ``width:NN%`` style is missing.
    """
    inner_bar = stars_tag.find('div') if stars_tag is not None else None
    if inner_bar is None:
        return "N/A"

    style = inner_bar.get('style', '')
    try:
        width_percent = float(style.replace('width:', '').replace('%', '').strip())
    except ValueError:
        # float() raises ValueError on anything that is not a plain number;
        # catching only that keeps KeyboardInterrupt and real bugs visible.
        return "N/A"
    return round((width_percent / 100) * 5, 1)


def _parse_product(product):
    """Extract the scraped fields from a single product <article> tag.

    Args:
        product: a BeautifulSoup tag for one ``article.prd`` element.

    Returns:
        A tuple ``(name, brand, price, discount, total_reviews, rating)``.
        Missing fields fall back to "N/A" (name, brand, price, rating),
        "0%" (discount) or "0" (total_reviews).
    """
    # Product name; an empty heading is treated the same as a missing one so
    # the brand lookup below never indexes into an empty list.
    name_tag = product.find('h3', class_='name')
    product_name = (name_tag.text.strip() if name_tag is not None else "") or "N/A"

    # Brand name (usually the first word of the product name)
    brand = product_name.split()[0] if product_name != "N/A" else "N/A"

    # Price, with the currency marker and thousands separators removed
    price_tag = product.find('div', class_='prc')
    if price_tag is not None:
        price = price_tag.text.replace('KSh', '').replace(',', '').strip() or "N/A"
    else:
        price = "N/A"

    # Discount badge. A CSS selector is used because find(class_='bdg _dsct')
    # only matches when the class attribute is exactly that string, so any
    # additional class on the badge would make the lookup silently fail.
    discount_tag = product.select_one('div.bdg._dsct')
    discount = discount_tag.text.strip().replace('-', '') if discount_tag is not None else "0%"

    # The star widget feeds both the review count and the rating, so it is
    # looked up once and reused.
    stars_tag = product.select_one('div.stars._s')

    # Reviews: first token of the widget's aria-label. The attribute may be
    # absent (get() returns None) or empty, both of which mean "0".
    aria_label = stars_tag.get('aria-label') if stars_tag is not None else None
    label_tokens = aria_label.split() if aria_label else []
    total_reviews = label_tokens[0] if label_tokens else "0"

    # Rating (from star width)
    rating = _parse_rating(stars_tag)

    return product_name, brand, price, discount, total_reviews, rating


# Create lists to hold data
product_names = []
brands = []
prices = []
discounts = []
reviews = []
ratings = []

for page in range(1, NUM_PAGES + 1):
    print(f"Scraping page {page}...")
    url = f"{BASE_URL}&page={page}"

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors (blocked, rate-limited, not found) instead of
        # parsing an error page and quietly collecting nothing.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scrape: report the failed page and keep going.
        print(f"Skipping page {page}: {exc}")
    else:
        soup = BeautifulSoup(response.text, 'html.parser')

        for product in soup.find_all('article', class_='prd'):
            product_name, brand, price, discount, total_reviews, rating = _parse_product(product)
            product_names.append(product_name)
            brands.append(brand)
            prices.append(price)
            discounts.append(discount)
            reviews.append(total_reviews)
            ratings.append(rating)

    # Be polite and avoid hitting the server too fast; no need to wait once
    # the last page has been requested.
    if page < NUM_PAGES:
        time.sleep(1)

# Create DataFrame
df = pd.DataFrame({
    "Product Name": product_names,
    "Brand": brands,
    "Price (Ksh)": prices,
    "Discount (%)": discounts,
    "Total Reviews": reviews,
    "Rating (out of 5)": ratings
})

if df.empty:
    # Every page failed or no 'article.prd' elements were found; the CSV
    # written below will contain only the header row.
    print("Warning: no products were scraped; the CSV will contain only headers")

# Save to CSV
df.to_csv("jumia_smartphones.csv", index=False)
print("Data saved to 'jumia_smartphones.csv'")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Data saved to 'jumia_smartphones.csv'
