In [1]:
# Step 1: Import Required Libraries
import os
import json
import time
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher

In [2]:
# Step 2: Define a List of Websites to Scrape
websites = [
    {"name": "GeeksforGeeks", "url": "https://www.geeksforgeeks.org/", "parser": "p"},
    {"name": "W3Schools", "url": "https://www.w3schools.com/", "parser": "p"},
    {"name": "TutorialsPoint", "url": "https://www.tutorialspoint.com/", "parser": "p"},
    {"name": "Programiz", "url": "https://www.programiz.com/", "parser": "p"},
    {"name": "Real Python", "url": "https://realpython.com/", "parser": "p"},
    {"name": "JavaTpoint", "url": "https://www.javatpoint.com/", "parser": "p"},
    {"name": "FreeCodeCamp", "url": "https://www.freecodecamp.org/", "parser": "p"},
]

In [3]:
# Step 3: Function to Scrape Links from the Main Page
def scrape_links(website):
    """Extracts internal links from a given website's main page."""
    try:
        response = requests.get(website["url"], timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            links = []
            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                if website["url"] in href or href.startswith("/"):  # Internal links only
                    full_url = href if href.startswith("http") else website["url"] + href.lstrip("/")
                    links.append(full_url)
            return list(set(links))  # Remove duplicates
        else:
            print(f"Failed to fetch links from {website['name']}")
            return []
    except Exception as e:
        print(f"Error fetching links from {website['name']}: {e}")
        return []

In [4]:
# Step 4: Function to Scrape Content from a Page
def scrape_content(url, parser="p"):
    """Extracts text content from a given webpage using the specified parser."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            elements = soup.find_all(parser)
            return [element.get_text().strip() for element in elements if element.get_text().strip()]
        else:
            print(f"Failed to scrape content from {url}")
            return []
    except Exception as e:
        print(f"Error scraping content from {url}: {e}")
        return []


In [5]:
# Step 5: Function to Scrape an Entire Website (Main Page + Internal Links)
def scrape_website(website):
    print(f"Scraping {website['name']}...")
    
    # Scrape main page content
    main_content = scrape_content(website["url"], website["parser"])
    
    # Scrape internal links
    internal_links = scrape_links(website)
    detailed_content = []
    
    for link in internal_links:
        print(f"Scraping link: {link}")
        detailed_content.extend(scrape_content(link, website["parser"]))
        time.sleep(2)  # Prevents website blocking by delaying requests

    return {
        "name": website["name"],
        "url": website["url"],
        "main_content": main_content,
        "detailed_content": detailed_content,
    }

In [6]:
# Step 6: Scrape All Websites and Store the Data
scraped_data = []
for website in websites:
    scraped_data.append(scrape_website(website))
    time.sleep(2)



Scraping GeeksforGeeks...
Scraping link: https://www.geeksforgeeks.org/engineering-mathematics-tutorials/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/data-analysis-tutorial/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/mysql-tutorial/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/software-engineering/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/commerce/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/
Scraping link: https://www.geeksforgeeks.org/javascript/?ref=outindfooter
Scraping link: https://www.geeksforgeeks.org/web-development/?ref=home-articlecards
Scraping link: https://www.geeksforgeeks.org/system-design-tutorial/?ref=outindfooter
Scraping link: https://www.geeksforgeeks.org/learn-data-structures-and-algorithms-dsa-tutorial/?ref=outindfooter
Scraping link: https://www.geeksforgeeks.org/learn-reactjs/?ref=outindfooter
Scraping link: https://www.geeksforgeeks.org/queue-d

In [7]:
# Step 7: Save Scraped Data to JSON
scraped_data_path = os.path.join(os.getcwd(), "New_enhanced_scraped_websites.json")
with open(scraped_data_path, "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, ensure_ascii=False, indent=4)

print(f"Enhanced scraped content saved at: {scraped_data_path}")

Enhanced scraped content saved at: C:\Users\Rahul\New_enhanced_scraped_websites.json
