STEP 1: Data Collection & HTML Parsing

In [1]:
# --- 0. Imports ---
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
from tqdm import tqdm

In [2]:
# --- 1. Configuration & Setup ---
# Paths are relative to the 'notebooks/' folder this notebook runs from:
# '../' climbs to the project root ('seo-content-detector/'), 'data/' descends
# into the data folder.
DATA_DIR = '../data/'

# Input CSV and output CSV both live in DATA_DIR.
RAW_DATA_PATH, EXTRACTED_DATA_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('data.csv', 'extracted_content.csv')
)

# Echo the resolved paths so a wrong working directory is obvious immediately.
print(f"Raw data path: {RAW_DATA_PATH}\nOutput path: {EXTRACTED_DATA_PATH}")

Raw data path: ../data/data.csv
Output path: ../data/extracted_content.csv


In [3]:
# --- 2. Data Loading ---
# Read the raw CSV into `df`. If the file is missing, stop right here.
try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Could not find data.csv at {RAW_DATA_PATH}")
    print("Please make sure you have downloaded the dataset and placed it in the 'data/' folder.")
    # Abort from inside the handler rather than checking whether `df` exists
    # afterwards: a `df` left over from an earlier run in the same kernel
    # would pass that check and the notebook would continue on stale data.
    raise SystemExit("Stopping execution: Data file not found.") from None

print(f"\nSuccessfully loaded {RAW_DATA_PATH} with {len(df)} rows.")
print("Columns found:", df.columns.tolist())
print(df.head())


Successfully loaded ../data/data.csv with 81 rows.
Columns found: ['url', 'html_content']
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                        html_content  
0  <!doctype html><!--[if lt IE 7]> <html class="...  
1  <!doctype html><html lang="en"><head>\n    <me...  
2  <!DOCTYPE html><html data-unhead-vue-server-re...  
3  \n\n<!DOCTYPE html>\n<html lang="en" dir="ltr"...  
4                                                NaN  


In [None]:
# --- 3. HTML Parsing Function ---

def parse_html_content(html_content):
    """
    Parse raw HTML and extract the page title, cleaned body text, and word count.

    The main content block is chosen heuristically: the first <article>,
    otherwise the first <main>, otherwise <body>. Text is taken from the <p>
    tags inside that block; if they yield no text, the block's full text is
    used instead. If no block is found at all, the whole document's text is used.

    Args:
        html_content (str): The raw HTML string. Any non-string value
            (e.g. NaN for a missing cell) is treated as an empty page.

    Returns:
        tuple: (title, body_text, word_count)
            title (str): Whitespace-normalized <title> text; "No Title Found"
                when the tag is missing or empty; "Parsing Error" if parsing
                raised an exception; "" for non-string input.
            body_text (str): Extracted text with runs of whitespace collapsed
                to single spaces.
            word_count (int): Number of whitespace-separated tokens in body_text.
    """
    # Handle non-string (e.g., NaN, float) inputs gracefully
    if not isinstance(html_content, str):
        return "", "", 0

    try:
        # Use 'lxml' for a fast and forgiving HTML parser
        soup = BeautifulSoup(html_content, 'lxml')

        # 3a. Extract Title
        # `.string` is None when <title> is empty or has child nodes, and is a
        # NavigableString otherwise. get_text() always gives a plain str, so
        # the title column never ends up holding None or parser objects.
        title = ""
        if soup.title is not None:
            title = re.sub(r'\s+', ' ', soup.title.get_text()).strip()
        if not title:
            title = "No Title Found"

        # 3b. Extract Main Body Text
        # Heuristic: prefer the tags that usually wrap the main article,
        # falling back to <body>. Each lookup runs once.
        main_content = None
        for tag_name in ('article', 'main', 'body'):
            main_content = soup.find(tag_name)
            if main_content is not None:
                break

        if main_content is not None:
            # Prefer paragraph text inside the chosen block
            body_text = " ".join(p.get_text() for p in main_content.find_all('p'))
            if not body_text.strip():
                # No <p> tags, or only empty ones: use all text in the block
                # instead of discarding the page as empty.
                body_text = main_content.get_text()
        else:
            # Absolute fallback: just get all text from the page
            body_text = soup.get_text()

        # 3c. Clean the extracted text
        # Remove extra whitespace, tabs, and newlines
        body_text = re.sub(r'\s+', ' ', body_text).strip()

        # 3d. Calculate Word Count
        word_count = len(body_text.split())

        return title, body_text, word_count

    except Exception as e:
        # Best-effort: one malformed page must not abort the whole batch.
        # Report it and return a sentinel row (word_count 0).
        print(f"Error parsing content: {e}")
        return "Parsing Error", "", 0

In [6]:
# --- 4. Apply Parsing to DataFrame ---
# `tqdm` comes from the imports cell at the top of the notebook. Registering it
# with pandas adds the .progress_apply() method used below.
tqdm.pandas(desc="Parsing HTML")
print("\nStarting HTML parsing for all rows (this may take a moment)...")

# Run the parser over every value in the 'html_content' column, with a progress bar.
parsed_data = df['html_content'].progress_apply(parse_html_content)

# The result is a Series of (title, body_text, word_count) tuples; expand it
# into columns. Reusing df.index keeps each parsed row aligned with its URL in
# the concat below even when `df` does not carry a default 0..n-1 index
# (pd.concat with axis=1 aligns on index labels, not on position).
df_parsed = pd.DataFrame(
    parsed_data.tolist(),
    index=df.index,
    columns=['title', 'body_text', 'word_count'],
)

# Combine the original 'url' column with our new parsed data
df_extracted = pd.concat([df['url'], df_parsed], axis=1)


Starting HTML parsing for all rows (this may take a moment)...


Parsing HTML:   0%|          | 0/81 [00:00<?, ?it/s]

Parsing HTML: 100%|██████████| 81/81 [00:07<00:00, 11.06it/s]


In [7]:
# --- 5. Filter & Save Extracted Data ---
# Keep only the rows where some text was extracted (word_count > 0) and report
# how many rows were dropped.
original_count = len(df_extracted)
df_extracted = df_extracted[df_extracted['word_count'] > 0].reset_index(drop=True)
print("\nParsing complete.")
print(f"Successfully parsed {len(df_extracted)} rows with content (dropped {original_count - len(df_extracted)} empty/failed rows).")

# Save the cleaned data to the 'data/' folder.
# A failed save is reported but does not stop the notebook; the in-memory
# frame is still available.
try:
    df_extracted.to_csv(EXTRACTED_DATA_PATH, index=False)
    print(f"\nSuccessfully saved extracted content to {EXTRACTED_DATA_PATH}")
except Exception as e:
    print(f"Error saving file: {e}")

# Display the final cleaned data once, whether or not the save succeeded.
print("\n--- Extracted Content (Head) ---")
print(df_extracted.head())


Parsing complete.
Successfully parsed 69 rows with content (dropped 12 empty/failed rows).

Successfully saved extracted content to ../data/extracted_content.csv

--- Extracted Content (Head) ---
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://nordlayer.com/learn/network-security/b...   

                                               title  \
0                                Cyber Security Blog   
1  Top 10 Cybersecurity Awareness Tips: How to St...   
2  11 Cyber Defense Tips to Stay Secure at Work a...   
3  Cybersecurity Best Practices | Cybersecurity a...   
4     Network Security 101: Understanding the Basics   

                                           body_text  word_count  
0  Cyber Crisis Tabletop Exercise Cyber Security ...         3

STEP 2: Text Preprocessing & Feature Engineering