In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm

# ==============================
# Step 1: Target URL and save folder
# ==============================
# Local folder that receives the downloaded page and its assets.
# It is created up front; an already existing folder is reused as-is.
save_dir = "downloaded_website"
os.makedirs(save_dir, exist_ok=True)

# 🔁 Put the address of the website whose data you want to download here.
url = "https://www.corelanguages.com/"

# ==============================
# Step 2: HTML Page Download
# ==============================
print(f"📥 Downloading main page from {url} ...")

# Without a timeout requests waits indefinitely on an unresponsive server;
# 10 seconds matches the limit used for the asset downloads further down.
response = requests.get(url, timeout=10)

# Abort on 4xx/5xx responses instead of saving an error page as index.html
# and reporting it as a successful download.
response.raise_for_status()

# When the Content-Type header names no charset, requests may fall back to
# ISO-8859-1 for text responses, which garbles UTF-8 pages. In that case let
# it detect the encoding from the body before decoding.
if "charset" not in response.headers.get("Content-Type", "").lower():
    response.encoding = response.apparent_encoding
html_content = response.text

html_file_path = os.path.join(save_dir, "index.html")

# Save HTML locally (always written as UTF-8, independent of the source
# encoding, so the file and the in-memory string stay consistent).
with open(html_file_path, "w", encoding="utf-8") as f:
    f.write(html_content)

print(f"✅ HTML page saved to {html_file_path}")

# ==============================
# Step 3: Asset Download (images, CSS, JS)
# ==============================
print("🔍 Parsing and downloading assets (images, css, js)...")

soup = BeautifulSoup(html_content, "html.parser")
assets = []

# Collect the absolute URL of every referenced file.
# <img> and <script> point at their file via "src"; <link> uses "href".
for tag in soup.find_all(["img", "script", "link"]):
    attr = "src" if tag.name in ["img", "script"] else "href"
    file_url = tag.get(attr)
    if not file_url:
        # Inline scripts and tags without a target have nothing to fetch.
        continue

    # Resolve relative references against the page URL; stray whitespace
    # around the attribute value is not part of the address.
    full_url = urljoin(url, file_url.strip())

    # Only http(s) resources can be fetched. Inline "data:" images and
    # "javascript:" / "mailto:" references would otherwise be queued and
    # fail (or be misinterpreted) in the download step.
    if urlparse(full_url).scheme not in ("http", "https"):
        continue

    assets.append(full_url)

# Download assets
# Every asset is stored under save_dir at the path taken from its URL.
# Failures are reported per asset and counted so that the final summary
# does not claim success when some files could not be saved.
save_root = os.path.abspath(save_dir)
failed_count = 0

for asset_url in tqdm(set(assets), desc="📦 Downloading assets"):
    try:
        parsed_url = urlparse(asset_url)
        asset_path = parsed_url.path.lstrip("/")

        # A URL with an empty or directory-style path has no file name;
        # opening the resulting path for writing would target a directory.
        if not asset_path or asset_path.endswith("/"):
            raise ValueError("URL does not point to a file")

        # The path comes from remote HTML: ".." segments or drive-style
        # paths must not be allowed to write outside the download folder.
        # (commonpath itself raises ValueError for paths on different drives.)
        full_path = os.path.abspath(os.path.join(save_root, asset_path))
        if os.path.commonpath([save_root, full_path]) != save_root:
            raise ValueError("target path escapes the download folder")

        asset_resp = requests.get(asset_url, timeout=10)
        # Do not store 4xx/5xx error bodies as if they were the asset.
        asset_resp.raise_for_status()

        os.makedirs(os.path.dirname(full_path), exist_ok=True)

        with open(full_path, "wb") as f:
            f.write(asset_resp.content)
    except (requests.RequestException, OSError, ValueError) as e:
        failed_count += 1
        # tqdm.write prints the message without corrupting the progress bar.
        tqdm.write(f"⚠️ Failed to download {asset_url} -> {e}")

if failed_count:
    print(
        f"⚠️ Finished with {failed_count} failed download(s); "
        f"remaining content saved in folder: {save_dir}"
    )
else:
    print("✅ All content downloaded successfully in folder:", save_dir)


📥 Downloading main page from https://www.corelanguages.com/ ...
✅ HTML page saved to downloaded_website\index.html
🔍 Parsing and downloading assets (images, css, js)...


📦 Downloading assets: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.37it/s]

✅ All content downloaded successfully in folder: downloaded_website



