In [11]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import re

print("Libraries imported successfully!")


Libraries imported successfully!


In [13]:
# Define your file paths
DATA_DIR = '../data'
INPUT_FILE = os.path.join(DATA_DIR, 'data.csv')
OUTPUT_FILE = os.path.join(DATA_DIR, 'extracted_content.csv')

print(f"Input file will be read from: {INPUT_FILE}")
print(f"Output file will be saved to: {OUTPUT_FILE}")

Input file will be read from: ../data\data.csv
Output file will be saved to: ../data\extracted_content.csv


--- DEBUGGING CELL 3.5 ---

DataFrame info:
ERROR: 'df' is not defined. Please run Cell 3 first.


In [17]:
import csv
import sys

# --- FIX for 'field larger than field limit' ---
# The HTML content is very large, so we need to increase the max size
# This loop finds the maximum possible field size your system can handle
max_int = sys.maxsize
decrement = True
while decrement:
    decrement = False
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int = int(max_int / 10)
        decrement = True
# --- End of FIX ---

print("Cell 3: CSV field size limit increased.")

# Read the provided dataset
try:
    df = pd.read_csv(
        INPUT_FILE, 
        dtype={'url': 'string', 'html_content': 'string'},
        quoting=csv.QUOTE_MINIMAL,
        engine='python'
    )
    
    na_count = df['html_content'].isna().sum()
    print(f"Cell 3: Data loaded. Total rows: {len(df)}")
    print(f"Rows where html_content is <NA> or missing: {na_count}")
    
    # Fill any <NA> values with a placeholder string
    df['html_content'] = df['html_content'].fillna('No HTML Content')
    
    print("Displaying first 5 rows:")
    print(df.head())
    
except FileNotFoundError:
    print(f"ERROR: Input file not found at {INPUT_FILE}")
    print("Please make sure your 'data.csv' file is in the 'data' folder.")
except Exception as e:
    print(f"An error occurred reading the CSV: {e}")

Cell 3: CSV field size limit increased.
Cell 3: Data loaded. Total rows: 81
Rows where html_content is <NA> or missing: 12
Displaying first 5 rows:
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                        html_content  
0  <!doctype html><!--[if lt IE 7]> <html class="...  
1  <!doctype html><html lang="en"><head>\n    <me...  
2  <!DOCTYPE html><html data-unhead-vue-server-re...  
3  \n\n<!DOCTYPE html>\n<html lang="en" dir="ltr"...  
4                                    No HTML Content  


In [18]:
print("--- DEBUGGING CELL 3.5 ---")
try:
    print("\nDataFrame info:")
    df.info()
    
    print("\n--- Checking 'html_content' column ---")
    
    if 'html_content' not in df.columns:
        print("ERROR: 'html_content' column does not exist!")
    else:
        html_col = df['html_content']
        
        print(f"Total rows: {len(html_col)}")
        print(f"Pandas <NA> count: {html_col.isna().sum()}")
        
        # Check the first row's content
        print("\n--- Content of first row's 'html_content' (first 500 chars) ---")
        first_row_html = html_col.iloc[0]
        
        if pd.isna(first_row_html):
            print("Row 0 is <NA> (Pandas 'Not Available')")
        elif first_row_html is None:
            print("Row 0 is 'None' (Python 'None')")
        elif isinstance(first_row_html, str):
            print(first_row_html[:500])
        else:
            print(f"Row 0 is of an unexpected type: {type(first_row_html)}")
            
except NameError:
    print("ERROR: 'df' is not defined. Please run Cell 3 first.")
except Exception as e:
    print(f"An error occurred: {e}")

--- DEBUGGING CELL 3.5 ---

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           81 non-null     string
 1   html_content  81 non-null     string
dtypes: string(2)
memory usage: 1.4 KB

--- Checking 'html_content' column ---
Total rows: 81
Pandas <NA> count: 0

--- Content of first row's 'html_content' (first 500 chars) ---
<!doctype html><!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en" > <![endif]--><!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en" >        <![endif]--><!--[if IE 8]>    <html class="no-js lt-ie9" lang="en" >               <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en"><!--<![endif]--><head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <meta name="author" content="Cyber Management Alliance Limited"


In [19]:
import pandas as pd # Make sure pandas is imported in case of kernel restart

def parse_html(html_content):
    """
    Parses HTML content to extract title, clean body text, and word count.
    """
    try:
        # Check for invalid inputs (NaN, None, or our placeholder)
        if pd.isna(html_content) or not isinstance(html_content, str) or html_content == 'No HTML Content':
            return "No Title", "No Content", 0
            
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Extract Title
        title = soup.title.string if soup.title else "No Title"
        title = re.sub(r'\s+', ' ', title).strip() # Clean up extra whitespace

        # 2. Extract Body Text
        main_content = soup.find('article') or soup.find('main')
        
        if main_content:
            body_text = main_content.get_text(separator=' ', strip=True)
        else:
            paragraphs = soup.find_all('p')
            if paragraphs:
                body_text = ' '.join([p.get_text(strip=True) for p in paragraphs])
            else:
                # Fallback: get all text from body
                body_text = soup.body.get_text(separator=' ', strip=True) if soup.body else "No Content"

        # If body_text is still empty (e.g., just scripts), set to "No Content"
        if not body_text.strip():
             body_text = "No Content"

        # Clean up the extracted text
        body_text = re.sub(r'\s+', ' ', body_text).strip()

        # 3. Calculate Word Count
        word_count = len(body_text.split())
        
        return title, body_text, word_count

    except Exception as e:
        # Handle any other parsing errors gracefully
        print(f"Error parsing content: {e}")
        return "Error Title", "Error Content", 0

# --- Test the function with a sample ---
test_html = "<html><head><title>  Test Title </title></head><body><main><p>This is paragraph 1.</p><p>This is paragraph 2.</p></main></body></html>"
print("\n--- Testing the function ---")
print(parse_html(test_html))

print("Cell 4: HTML parsing function defined and tested.")


--- Testing the function ---
('Test Title', 'This is paragraph 1. This is paragraph 2.', 8)
Cell 4: HTML parsing function defined and tested.


In [20]:
print("Starting HTML parsing for all rows... (This may take a moment)")

# Apply the function to the 'html_content' column
# This will create three new series (like columns)
parsed_data = df['html_content'].apply(lambda html: pd.Series(parse_html(html)))

# Rename the new columns
parsed_data.columns = ['title', 'body_text', 'word_count']

# Add these new columns to our original dataframe, keeping only the 'url'
df_extracted = pd.concat([df['url'], parsed_data], axis=1)

# Display the new, clean dataframe
print("Parsing complete!")
print(df_extracted.head())

Starting HTML parsing for all rows... (This may take a moment)
Parsing complete!
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                               title  \
0                                Cyber Security Blog   
1  Top 10 Cybersecurity Awareness Tips: How to St...   
2  11 Cyber Defense Tips to Stay Secure at Work a...   
3  Cybersecurity Best Practices | Cybersecurity a...   
4                                           No Title   

                                           body_text  word_count  
0  Cyber Crisis Tabletop Exercise Cyber Security ...         325  
1  Cybersecurity is gaining more importance globa...        1700  
2  Home Insights Blog Posts 11 Cyber Defense

In [21]:
# Save the extracted data to the output CSV
df_extracted.to_csv(OUTPUT_FILE, index=False)

print(f"Cell 6: Successfully parsed and saved data to {OUTPUT_FILE}")
print(f"Total rows processed: {len(df_extracted)}")
df_extracted.info()

Cell 6: Successfully parsed and saved data to ../data\extracted_content.csv
Total rows processed: 81
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   url         81 non-null     string
 1   title       81 non-null     object
 2   body_text   81 non-null     object
 3   word_count  81 non-null     int64 
dtypes: int64(1), object(2), string(1)
memory usage: 2.7+ KB


In [22]:
import nltk
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import numpy as np

print("Cell 7: Libraries for Part 2 imported.")

Cell 7: Libraries for Part 2 imported.


In [27]:
import ssl
try:
    nltk.data.find('tokenizers/punkt')
    print("'punkt' is already downloaded.")
except LookupError:
    print("Downloading 'punkt' for NLTK (for sentence tokenizing)...")
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    nltk.download('punkt')
    print("Download complete.")
print("Cell 8: NLTK 'punkt' check complete.")

'punkt' is already downloaded.
Cell 8: NLTK 'punkt' check complete.


In [None]:
# Load the clean data we just saved
df_features = pd.read_csv(OUTPUT_FILE) # This is 'extracted_content.csv'

def extract_text_features(text):
    if not isinstance(text, str) or not text.strip():
        return "", 0, 100.0 
    try:
        clean_text = text.lower()
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
        sentences = nltk.sent_tokenize(clean_text)
        sentence_count = len(sentences)
        if len(clean_text.split()) < 10:
            flesch_score = 100.0 
        else:
            flesch_score = textstat.flesch_reading_ease(clean_text)
    except Exception as e:
        print(f"Error processing text: {e}")
        clean_text = ""
        sentence_count = 0
        flesch_score = 0.0 
    return clean_text, sentence_count, flesch_score
print("Cell 9: Feature extraction function defined.")

Cell 9: Feature extraction function defined.


In [None]:
print("Applying feature extraction to all rows...")
features = df_features.apply(lambda row: pd.Series(extract_text_features(row['body_text'])), axis=1)
features.columns = ['clean_text', 'sentence_count', 'flesch_reading_ease']
df_features = pd.concat([df_features, features], axis=1)
print("Cell 10: Features extracted successfully.")
print(df_features[['url', 'word_count', 'sentence_count', 'flesch_reading_ease']].head())

Applying feature extraction to all rows...
Error processing text: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Vinal/nltk_data'
    - 'c:\\Users\\Vinal\\OneDrive\\Desktop\\Project_leadwalnut\\seo-content-detector\\venv\\nltk_data'
    - 'c:\\Users\\Vinal\\OneDrive\\Desktop\\Project_leadwalnut\\seo-content-detector\\venv\\share\\nltk_data'
    - 'c:\\Users\\Vinal\\OneDrive\\Desktop\\Project_leadwalnut\\seo-content-detector\\venv\\lib\\nltk_data'
    - 'C:\\Users\\Vinal\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************

Error processing 

In [26]:
print("Starting TF-IDF vectorization...")

# Initialize the vectorizer
# We'll limit to the top 2000 features and remove common English stop words
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')

# Create the TF-IDF matrix from our clean text
# We fill empty text with '' to avoid errors
tfidf_matrix = vectorizer.fit_transform(df_features['clean_text'].fillna(''))

# Get the list of feature names (the words)
feature_names = vectorizer.get_feature_names_out()

print("Cell 11: TF-IDF matrix created.")
print(f"Matrix shape (documents, features): {tfidf_matrix.shape}")

Starting TF-IDF vectorization...


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
# Define all the output file paths
MODELS_DIR = '../models'
FEATURES_FILE = os.path.join(DATA_DIR, 'features.csv')
VECTORIZER_FILE = os.path.join(MODELS_DIR, 'tfidf_vectorizer.pkl')
MATRIX_FILE = os.path.join(MODELS_DIR, 'tfidf_matrix.pkl')

# Create 'models' directory if it doesn't exist
os.makedirs(MODELS_DIR, exist_ok=True)

# --- 1. Save the models for later use ---
# We save the matrix and vectorizer for Part 3 (Duplicates)
with open(VECTORIZER_FILE, 'wb') as f:
    pickle.dump(vectorizer, f)
    
with open(MATRIX_FILE, 'wb') as f:
    pickle.dump(tfidf_matrix, f)
    
print(f"TF-IDF vectorizer saved to: {VECTORIZER_FILE}")
print(f"TF-IDF matrix saved to: {MATRIX_FILE}")


# --- 2. Prepare and save the features.csv file ---
def get_top_keywords(doc_vector, n=5):
    # Sort the tf-idf scores in descending order
    sorted_indices = np.argsort(doc_vector.toarray()).flatten()[::-1]
    top_indices = sorted_indices[:n]
    # Get the words, separated by "|" as per assignment example
    top_keywords = [feature_names[i] for i in top_indices if doc_vector[0, i] > 0]
    return "|".join(top_keywords)

# Get keywords for each document
df_features['top_keywords'] = [get_top_keywords(tfidf_matrix[i]) for i in range(tfidf_matrix.shape[0])]

# Get embeddings as string lists, as per assignment example "[0.023,-0.145,...]"
# We just show a small slice of the vector (first 20 numbers) for the CSV
df_features['embedding_str'] = [np.array2string(tfidf_matrix[i].toarray().flatten()[:20], separator=',') for i in range(tfidf_matrix.shape[0])]

# Select columns for the final CSV, as required
csv_columns = ['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding_str']
df_to_save = df_features.rename(columns={'embedding_str': 'embedding'})[csv_columns]

# Save to CSV
df_to_save.to_csv(FEATURES_FILE, index=False)

print(f"\nCell 12: Features successfully saved to: {FEATURES_FILE}")
print(df_to_save.head())

NameError: name 'tfidf_matrix' is not defined