In [None]:
import csv
import re
from collections import deque

import nltk
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenizer data required by nltk.word_tokenize. Newer NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested; an identifier unknown to the installed NLTK version is reported
# by the downloader rather than raised.
nltk.download('punkt')
nltk.download('punkt_tab')

# Seed URLs that the crawl queue is initialised with.
ML_seed_urls = [
    "https://towardsdatascience.com/machine-learning-an-introduction-23b84d51e6d0",
    "https://dltlabs.medium.com/understanding-machine-learning-deep-learning-f5aa95264d61",
    "https://medium.com/@randylaosat/a-beginners-guide-to-machine-learning-dfadc19f6caf",
    "https://www.ibm.com/topics/machine-learning",
    "https://www.techtarget.com/searchenterpriseai/definition/machine-learning-ML",
]

# Path of the CSV file that receives one row per crawled page.
csv_filename = "web_ML_data.csv"

# Function to extract data from a URL
def extract_data(url, timeout=10):
    """Download *url* and summarise the page for one CSV row.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``(title, url, description, num_words, keywords)`` tuple. ``title``
        is the text of the ``<title>`` tag, ``description`` the content of the
        ``<meta name="description">`` tag (each ``""`` when the tag is
        missing), ``num_words`` the number of NLTK tokens in the page text and
        ``keywords`` the string produced by ``extract_keywords``. If the
        request fails, ``("", url, "", 0, "")`` is returned.
    """
    try:
        # Without a timeout requests waits indefinitely, so one unresponsive
        # host would stall the whole crawl. Timeouts are RequestExceptions
        # and therefore end up in the handler below.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract relevant data (each tag is looked up only once)
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag is not None else ""

        meta_tag = soup.find('meta', attrs={'name': 'description'})
        description = meta_tag.get('content', "").strip() if meta_tag is not None else ""

        # Extract text from the web page
        text = soup.get_text(separator=' ')

        # Tokenize the text and count the number of words
        tokens = nltk.word_tokenize(text)
        num_words = len(tokens)

        # Extract keywords using TF-IDF
        keywords = extract_keywords(text)

        return title, url, description, num_words, keywords

    except requests.exceptions.RequestException as e:
        # Handle connection errors, timeouts or invalid URLs
        print(f"Error accessing URL: {url}\n{str(e)}")
        return "", url, "", 0, ""

# Function to extract keywords using TF-IDF
def extract_keywords(text):
    """Return the top TF-IDF terms of *text* as a comma-separated string.

    Args:
        text: Plain text to analyse.

    Returns:
        Up to five terms joined by ``", "``, highest weight first. An empty
        string is returned when *text* has fewer than five whitespace-separated
        words or when the vectorizer finds no usable term in it.
    """
    if len(text.split()) < 5:  # Minimum number of words required
        return ""

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no token
        # survives tokenisation; treat that like a too-short text instead of
        # letting it abort the caller.
        return ""

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # argsort is ascending, so the last five indices carry the largest
    # weights; reverse them to list the strongest term first.
    ranked = tfidf_matrix.toarray().argsort()[0][-5:][::-1]
    return ", ".join(feature_names[idx] for idx in ranked)

# Setting up the queue
# Setting up the queue
visited_urls = set()
# A deque pops from the left in O(1); list.pop(0) shifts every remaining
# element, which becomes costly as the frontier grows.
url_queue = deque(ML_seed_urls)
# Every URL that has ever been queued. Checking against it keeps the same
# link from being appended again each time another page points to it; the
# visiting order is unchanged because only the first occurrence was ever
# processed.
queued_urls = set(ML_seed_urls)

# NOTE(review): the crawl has no page limit or domain filter, so it only
# stops once no unseen absolute links remain.

# Open the CSV file
with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Title', 'URL', 'Description', 'Number of Words', 'Keywords'])

    while url_queue:
        current_url = url_queue.popleft()

        # Guards against duplicates among the seed URLs themselves.
        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)

        title, url, description, num_words, keywords = extract_data(current_url)

        # Store in the CSV file
        csv_writer.writerow([title, url, description, num_words, keywords])

        # Find additional URLs within the page and add them to the queue
        try:
            # The timeout keeps one unresponsive host from hanging the crawl.
            response = requests.get(current_url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Only absolute http(s) links are followed.
                if href.startswith('http') and href not in queued_urls:
                    queued_urls.add(href)
                    url_queue.append(href)
        except requests.exceptions.RequestException as e:
            # Handle connection errors for linked URLs
            print(f"Error accessing URL: {current_url}\n{str(e)}")

print("Web scraping completed and data saved to", csv_filename)
