In [1]:
# Step 0 — Setup & fetch page
!pip install beautifulsoup4 requests lxml --quiet

import requests
from bs4 import BeautifulSoup
import csv, json, re, os

url = "https://baraasalout.github.io/test.html"
# Always pass a timeout: without one, requests waits indefinitely on a stalled
# server and the notebook kernel appears to hang.
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # stop here on an HTTP error status (4xx/5xx)
html = resp.text
soup = BeautifulSoup(html, "lxml")

# Quick sanity check: show the page title, falling back to the first heading.
# `soup.title.string` is None when <title> is empty or has nested markup, and
# the page may have no h1/h2 at all, so every step is guarded.
first_heading = soup.find(['h1', 'h2'])
if soup.title and soup.title.string:
    title = soup.title.string
elif first_heading:
    title = first_heading.get_text(strip=True)
else:
    title = "(no title or heading found)"
print("Fetched page. First heading / title:", title)


Fetched page. First heading / title: Web Scraping Task with Form


In [2]:
# Step 1 — Collect heading and paragraph/list-item text, then write it to CSV
csv_file = "Extract_Text_Data.CSV"

# Headings come first and are recorded unconditionally (even with empty text).
rows = [
    {'type': heading.name, 'text': heading.get_text(strip=True)}
    for heading in soup.find_all(['h1', 'h2'])
]

# Paragraphs and list items follow; elements with no visible text are skipped.
for element in soup.find_all(['p', 'li']):
    content = element.get_text(" ", strip=True)
    if not content:
        continue
    rows.append({'type': element.name, 'text': content})

with open(csv_file, "w", newline="", encoding="utf-8") as out:
    sheet = csv.DictWriter(out, fieldnames=['type', 'text'])
    sheet.writeheader()
    for record in rows:
        sheet.writerow(record)

print(f"Saved {csv_file} ({len(rows)} rows).")


Saved Extract_Text_Data.CSV (34 rows).


In [3]:
# Step 2 — Extract table data (product, price, in stock) -> CSV
table_rows = []
table = soup.find('table')
if table:
    # A real <table> exists: read it row by row.
    for tr in table.find_all('tr'):
        # A row with no <td> at all consists only of <th> cells, i.e. it is a
        # header row. Skipping it structurally keeps the header out of the data
        # even when its first label is not literally "Product".
        if tr.find('td') is None:
            continue
        cells = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
        # The literal check is kept for tables whose header row is built from <td>.
        if len(cells) >= 3 and cells[0].lower() != 'product':
            table_rows.append({'Product': cells[0], 'Price': cells[1], 'In Stock': cells[2]})
else:
    # Fallback: no <table> tag, so parse the plain text that sits between the
    # "Product Table" label and the next section label.
    text = soup.get_text("\n")
    m = re.search(r'Product Table(.*?)(?:Watch This Video|Product Information)', text, re.S)
    if m:
        lines = [ln.strip() for ln in m.group(1).splitlines() if ln.strip()]
        for line in lines:
            parts = line.split()
            if len(parts) >= 3 and parts[0].lower() != 'product':
                # The last two tokens are price and stock; everything before
                # them is the (possibly multi-word) product name.
                product = " ".join(parts[:-2])
                price = parts[-2]
                stock = parts[-1]
                table_rows.append({'Product': product, 'Price': price, 'In Stock': stock})

csv_table = "Extract_Table_Data.CSV"
with open(csv_table, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['Product', 'Price', 'In Stock'])
    writer.writeheader()
    writer.writerows(table_rows)

print(f"Saved {csv_table} ({len(table_rows)} rows).")


Saved Extract_Table_Data.CSV (3 rows).


In [4]:
# Step 3 — Extract product/book cards -> Product_Information.JSON
books = []
button_label = 'Add to basket'

# Find the section heading, then walk the sibling elements after it until the
# next section ("Featured Products") is reached.
section = soup.find(lambda t: t.name in ['h1','h2','h3'] and 'Product Information' in t.get_text())
if section:
    node = section.find_next_sibling()
    while node and 'Featured Products' not in node.get_text():
        text = node.get_text("\n", strip=True)
        # The button label terminates each card, so every chunk *before* a
        # label is one card. The final chunk is whatever follows the last
        # button; it is not a card and is dropped ([:-1]). When the label does
        # not occur at all, split() yields one chunk and nothing is processed.
        # NOTE(review): assumes the button is the last element of each card.
        for card_text in text.split(button_label)[:-1]:
            lines = [ln.strip() for ln in card_text.splitlines() if ln.strip()]
            if not lines:
                continue
            # First line is the title; later lines are scanned for a price
            # (£ or $) and an availability marker. The last matching line wins.
            # A card-local name is used so the page `title` variable set when
            # the page was fetched is not overwritten.
            card_title = lines[0]
            price = None
            stock = None
            for ln in lines[1:]:
                if re.search(r'£\d+|\$\d+', ln):
                    price = ln
                if 'In stock' in ln or 'Out stock' in ln or 'Out of stock' in ln or '✔' in ln:
                    stock = ln
            books.append({'title': card_title, 'price': price, 'stock': stock, 'button': button_label})
        node = node.find_next_sibling()

# Save as JSON (ensure_ascii=False keeps £ and ✔ readable in the file)
with open('Product_Information.JSON','w',encoding='utf-8') as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

print(f"Saved Product_Information.JSON ({len(books)} items).")


Saved Product_Information.JSON (4 items).


In [5]:
# Step 4 — Extract form inputs -> Form_Details.JSON

def _describe_field(field):
    """Return a JSON-ready description of one form control.

    `field` is a BeautifulSoup tag for an <input>, <select> or <textarea>.
    The result has the keys 'field_name', 'tag', 'type' and 'default'.
    """
    tag_name = field.name
    if tag_name == 'input':
        # An <input> with no type attribute is a text input per the HTML spec,
        # so report 'text' rather than None.
        field_type = field.get('type') or 'text'
    else:
        field_type = tag_name
    return {
        'field_name': field.get('name') or field.get('id') or "",
        'tag': tag_name,
        'type': field_type,
        'default': field.get('value') or field.get('placeholder') or "",
    }


forms = soup.find_all('form')
if forms:
    fields = [
        field
        for form in forms
        for field in form.find_all(['input', 'select', 'textarea'])
    ]
else:
    # Fallback: no <form> element, so pick up any <input> on the page.
    fields = soup.find_all('input')

# Both paths go through the same helper so they produce identical records.
inputs = [_describe_field(field) for field in fields]

with open('Form_Details.JSON','w',encoding='utf-8') as f:
    json.dump(inputs, f, ensure_ascii=False, indent=2)

print(f"Saved Form_Details.JSON ({len(inputs)} fields).")


Saved Form_Details.JSON (5 fields).


In [6]:
# Step 5 — Extract iframe/video link -> Media_Links.JSON
media = {}
iframe = soup.find('iframe')
if iframe and iframe.get('src'):
    media['iframe_src'] = iframe.get('src')
else:
    # A <video> frequently has no src of its own and instead lists its files in
    # nested <source> children. Scan <video> and <source> tags in document
    # order and take the first one that actually carries a src, instead of
    # stopping at the first <video> tag regardless of whether it has one.
    video = next(
        (el for el in soup.find_all(['video', 'source']) if el.get('src')),
        None,
    )
    if video is not None:
        media['video_src'] = video.get('src')

with open('Media_Links.JSON','w',encoding='utf-8') as f:
    json.dump(media, f, ensure_ascii=False, indent=2)

print("Saved Media_Links.JSON:", media)


Saved Media_Links.JSON: {'iframe_src': 'https://www.youtube.com/watch?v=ujf9RNuBdCU'}


In [7]:
# Step 6 — Featured Products challenge -> Featured_Products.json
featured_items = []

# One card, flattened to text, looks like:
#   "Wireless Headphones $49.99 Available colors: Black, White, Blue Add to Basket"
# The pattern is unanchored and used with finditer() so that a container node
# holding several cards yields every card, not just the first. The negative
# lookahead stops a name from swallowing a preceding card's button text.
card_pattern = re.compile(
    r'((?:(?!Add to Basket).)*?)\s*(\$\d+(?:[.,]\d+)?)\s*Available colors:\s*(.*?)\s*Add to Basket',
    re.I,
)

section = soup.find(lambda t: t.name in ['h1','h2','h3'] and 'Featured Products' in t.get_text())
if section:
    node = section.find_next_sibling()
    while node:
        text = node.get_text(" ", strip=True)
        # No separate case-sensitive "Add to Basket" pre-check: the pattern is
        # case-insensitive and already requires the button text.
        for m in card_pattern.finditer(text):
            featured_items.append({
                'name': m.group(1).strip(),
                'price': m.group(2).strip(),
                'colors': m.group(3).strip(),
            })
        node = node.find_next_sibling()

# Fallback: only when the text pass found nothing, read the structured markup
# (elements carrying a data-id with span.name / span.price / span.colors).
# Running it unconditionally would list the same products twice.
if not featured_items:
    for parent in soup.find_all(attrs={"data-id": True}):
        pid = parent.get("data-id")
        name_span = parent.find('span', class_='name')
        price_span = parent.find('span', class_='price')
        colors_span = parent.find('span', class_='colors')
        if name_span:
            featured_items.append({
                'id': pid,
                'name': name_span.get_text(strip=True),
                'price': price_span.get_text(strip=True) if price_span else '',
                'colors': colors_span.get_text(strip=True) if colors_span else ''
            })

with open('Featured_Products.json','w',encoding='utf-8') as f:
    json.dump(featured_items, f, ensure_ascii=False, indent=2)

print(f"Saved Featured_Products.json ({len(featured_items)} items).")


Saved Featured_Products.json (1 items).
