In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import requests
import time
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import textstat 
from nltk.tokenize import sent_tokenize
import nltk

try:
    print("Checking/downloading NLTK 'punkt' and 'punkt_tab' resources...")
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True) 
    print("NLTK downloads complete.")
except Exception as e:
    print(f"Warning: Error during NLTK download. Feature engineering may fail. ({e})")

# --- Configuration and Paths ---
DATA_DIR = 'data'
MODELS_DIR = 'models'

INPUT_DATA_PATH = os.path.join(os.pardir, DATA_DIR, 'data.csv')
EXTRACTED_CONTENT_PATH = os.path.join(os.pardir, DATA_DIR, 'extracted_content.csv')
FEATURES_PATH = os.path.join(os.pardir, DATA_DIR, 'features.csv')
DUPLICATES_PATH = os.path.join(os.pardir, DATA_DIR, 'duplicates.csv')
MODEL_PATH = os.path.join(os.pardir, MODELS_DIR, 'quality_model.pkl')
VECTORIZER_PATH = os.path.join(os.pardir, MODELS_DIR, 'tfidf_vectorizer.pkl')

SIMILARITY_THRESHOLD = 0.80
THIN_CONTENT_WORD_COUNT = 500

# Load the dataset
try:
    df = pd.read_csv(INPUT_DATA_PATH)
    print(f"\nLoaded dataset with {len(df)} rows.")
except FileNotFoundError:
    print(f"\nError: data.csv not found at {os.path.abspath(INPUT_DATA_PATH)}")
    print("Please ensure your 'data.csv' is inside the 'data' folder, and the 'data' folder is next to the 'notebooks' folder.")
    raise 

# --- HTML Parsing Function (Utility) ---
def parse_html_content(html_content):
    """Parses HTML to extract title, body text, and word count."""
    if pd.isna(html_content) or not html_content:
        return 'No Title', '', 0
    
    try:
        soup = BeautifulSoup(str(html_content), 'html.parser')
        
        title = soup.title.string.strip() if soup.title and soup.title.string else 'No Title'
        
        main_content_div = soup.find('main') or soup.find('article') or soup.find('body')
        
        if main_content_div:
            body_text = main_content_div.get_text(separator=' ', strip=True)
        else:
            body_text = soup.body.get_text(separator=' ', strip=True) if soup.body else ''

        clean_text = re.sub(r'\s+', ' ', body_text).strip()
        word_count = len(clean_text.split())

        return title, clean_text, word_count
        
    except Exception as e:
        return 'Parsing Error', '', 0 

# --- Text Feature Function (Utility) ---
def calculate_text_features(text):
    """Calculates sentence count and Flesch Reading Ease score."""
    if not text:
        return (0, 0.0)
        
    clean_text = str(text).lower()
    
    sentence_count = len(sent_tokenize(clean_text))
    
    try:
        flesch_score = textstat.flesch_reading_ease(clean_text)
    except:
        flesch_score = 0.0 
        
    return (sentence_count, flesch_score)

Checking/downloading NLTK 'punkt' and 'punkt_tab' resources...
NLTK downloads complete.

Loaded dataset with 81 rows.


In [9]:
print("---HTML Parsing and Content Extraction---")

# Apply the parsing function
df[['title', 'body_text', 'word_count']] = df['html_content'].apply(
    lambda x: pd.Series(parse_html_content(x))
)

extracted_df = df[['url', 'title', 'body_text', 'word_count']].copy()
extracted_df.to_csv(EXTRACTED_CONTENT_PATH, index=False)
print(f"Parsed {len(extracted_df)} pages. Saved to {EXTRACTED_CONTENT_PATH}")
print(extracted_df.head())

--- Step 2: HTML Parsing and Content Extraction (15%) ---
Parsed 81 pages. Saved to ..\data\extracted_content.csv
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                               title  \
0                                Cyber Security Blog   
1  Top 10 Cybersecurity Awareness Tips: How to St...   
2  11 Cyber Defense Tips to Stay Secure at Work a...   
3  Cybersecurity Best Practices | Cybersecurity a...   
4                                           No Title   

                                           body_text  word_count  
0  Back Training NCSC Assured Cyber Incident Plan...        2605  
1  Blog Privacy & Compliance Top 10 Cybersecurity...        1747  
2  Home Ins

In [5]:
import nltk

try:
    print("Attempting to download 'punkt_tab' resource for sentence tokenization...")
    nltk.download('punkt_tab', quiet=True)
    print("Download complete. Proceeding with feature calculation.")
except Exception as e:
    print(f"Error during NLTK download: {e}")
    pass

Attempting to download 'punkt_tab' resource for sentence tokenization...
Download complete. Proceeding with feature calculation.


In [11]:
print("\n---Feature Engineering---")

all_features_calculated = [
    calculate_text_features(text) for text in extracted_df['body_text']
]

# Create a temporary DataFrame directly from this list of tuples.
feature_df = pd.DataFrame(
    all_features_calculated, 
    columns=['sentence_count', 'flesch_reading_ease'],
    index=extracted_df.index 
)

# Assign the columns from the new DataFrame
extracted_df[['sentence_count', 'flesch_reading_ease']] = feature_df

# TF-IDF and Embedding Generation
content_for_tfidf = extracted_df[extracted_df['word_count'] > 0]['body_text']

# Fit the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
tfidf_matrix = vectorizer.fit_transform(content_for_tfidf)
feature_names = vectorizer.get_feature_names_out()

# Keyword Extraction
def get_top_keywords(text):
    if not text:
        return ""
    tfidf_vector = vectorizer.transform([text])
    
    if tfidf_vector.shape[1] == 0:
        return "" 

    top_n = 5
    sorted_indices = tfidf_vector.sum(axis=0).A1.argsort()[-top_n:][::-1]
    
    keywords = [feature_names[i] for i in sorted_indices if tfidf_vector[0, i] > 0]
    return '|'.join(keywords) 

extracted_df['top_keywords'] = extracted_df['body_text'].apply(get_top_keywords)

# Create the 'embedding' column 
full_tfidf_matrix = vectorizer.transform(extracted_df['body_text'])

extracted_df['embedding'] = [
    str(list(vector)).replace(' ', '') for vector in full_tfidf_matrix.toarray()
]

# Save features and vectorizer
features_df = extracted_df[['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']].copy()
features_df.to_csv(FEATURES_PATH, index=False)
dump(vectorizer, VECTORIZER_PATH) 
print(f"Features computed. Saved to {FEATURES_PATH} and vectorizer saved to {VECTORIZER_PATH}")
print(features_df.head())


--- Step 3: Feature Engineering (25%) ---
Features computed. Saved to ..\data\features.csv and vectorizer saved to ..\models\tfidf_vectorizer.pkl
                                                 url  word_count  \
0     https://www.cm-alliance.com/cybersecurity-blog        2605   
1    https://www.varonis.com/blog/cybersecurity-tips        1747   
2  https://www.cisecurity.org/insights/blog/11-cy...        1058   
3  https://www.cisa.gov/topics/cybersecurity-best...         826   
4  https://www.qnbtrust.bank/Resources/Learning-C...           0   

   sentence_count  flesch_reading_ease  \
0              66            28.741548   
1              94            40.871699   
2              62            53.262918   
3              27            -2.538002   
4               0             0.000000   

                                    top_keywords  \
0     cyber|2025|cybersecurity|tabletop|incident   
1         access|data|security|app|cybersecurity   
2       password|authentication|don

In [13]:
print("\n---Duplicate Detection---")

# Compute pairwise cosine similarity
cosine_sim_matrix = cosine_similarity(tfidf_matrix)
valid_indices = extracted_df[extracted_df['word_count'] > 0].index

duplicate_pairs = []
n_valid_pages = tfidf_matrix.shape[0]

for i in range(n_valid_pages):
    for j in range(i + 1, n_valid_pages):
        similarity = cosine_sim_matrix[i, j]
        if similarity >= SIMILARITY_THRESHOLD: 
            url1 = extracted_df.loc[valid_indices[i], 'url']
            url2 = extracted_df.loc[valid_indices[j], 'url']
            duplicate_pairs.append({
                'url1': url1, 
                'url2': url2, 
                'similarity': round(similarity, 4)
            })

# Save duplicate pairs
duplicates_df = pd.DataFrame(duplicate_pairs)
duplicates_df.to_csv(DUPLICATES_PATH, index=False)

# Thin Content Detection
extracted_df['is_thin'] = (extracted_df['word_count'] < THIN_CONTENT_WORD_COUNT)

# Report basic statistics
total_pages = len(extracted_df)
num_duplicates = len(duplicates_df)
num_thin = extracted_df['is_thin'].sum()
percent_thin = (num_thin / total_pages) * 100 if total_pages > 0 else 0

print(f"Total pages analyzed: {total_pages}")
print(f"Duplicate pairs found: {num_duplicates}")
print(f"Thin content pages: {num_thin} ({percent_thin:.1f}%)")
print(f"Saved duplicate pairs to {DUPLICATES_PATH}")


--- Step 4: Duplicate Detection (20%) ---
Total pages analyzed: 81
Duplicate pairs found: 7
Thin content pages: 24 (29.6%)
Saved duplicate pairs to ..\data\duplicates.csv


In [17]:
print("\n---Content Quality Scoring---")

def create_synthetic_label(row):
    """Creates synthetic labels (High, Medium, Low) based on word count and readability."""
    word_count = row['word_count']
    readability = row['flesch_reading_ease']
    
    if word_count > 1500 and 50 <= readability <= 70:
        return 'High'
    elif word_count < 500 or readability < 30:
        return 'Low'
    else:
        return 'Medium'

extracted_df['quality_label'] = extracted_df.apply(create_synthetic_label, axis=1)

# Prepare Features (X) and Target (y) for pages with content
model_data = extracted_df[extracted_df['word_count'] > 0].copy()
X_core = model_data[['word_count', 'sentence_count', 'flesch_reading_ease']]
y = model_data['quality_label']

# Re-align TF-IDF matrix for training data only
tfidf_train_data = vectorizer.transform(model_data['body_text'])

# Combine core features and TF-IDF features
X = pd.concat([X_core.reset_index(drop=True), 
               pd.DataFrame(tfidf_train_data.toarray()).reset_index(drop=True)], axis=1)
X.columns = X.columns.astype(str)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Classification Model (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train) 

# Predict and Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Overall Accuracy: {accuracy:.4f}")
print("\nClassification Report (F1-score):")
print(classification_report(y_test, y_pred))

# Save the trained model
dump(model, MODEL_PATH)
print(f"Saved quality model to {MODEL_PATH}")


--- Step 5: Content Quality Scoring (25%) ---
Overall Accuracy: 0.7143

Classification Report (F1-score):
              precision    recall  f1-score   support

        High       0.00      0.00      0.00         2
         Low       0.83      0.91      0.87        11
      Medium       0.62      0.62      0.62         8

    accuracy                           0.71        21
   macro avg       0.49      0.51      0.50        21
weighted avg       0.67      0.71      0.69        21

Saved quality model to ..\models\quality_model.pkl


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
import json
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

warnings.simplefilter('ignore', InsecureRequestWarning)


# --- Live Scraping Utility ---
def realtime_scrape_url(url):
    """Scrapes a single URL with error handling and rate limiting."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; SEOContentAnalyzer/1.0)'} 
        time.sleep(1.5) 
        response = requests.get(url, headers=headers, timeout=10, verify=False)
        response.raise_for_status() 
        return response.text
    except requests.exceptions.RequestException as e: 
        print(f"Scraping error for {url}: {e}")
        return None

# --- Main Analysis Function ---
def analyze_url(url, existing_df=extracted_df):
    try:
        realtime_model = load(MODEL_PATH)
        realtime_vectorizer = load(VECTORIZER_PATH)
    except FileNotFoundError:
        return {"error": "Model or vectorizer not found. Run the full pipeline first."}

    # Scraping 
    html_content = realtime_scrape_url(url)
    if html_content is None:
        return {"url": url, "error": "Scraping failed or received bad status code."}

    # Parsing and Feature Extraction
    title, body_text, word_count = parse_html_content(html_content)
    if not body_text:
        return {"url": url, "error": "Parsing failed or content is empty."}
        
    # Recalculate features (returns a tuple)
    sentence_count, readability = calculate_text_features(body_text)

    # Model Prediction Preparation
    X_core_features = pd.DataFrame({
        'word_count': [word_count],
        'sentence_count': [sentence_count],
        'flesch_reading_ease': [readability]
    })
    
    # TF-IDF vector
    new_tfidf_vector = realtime_vectorizer.transform([body_text])
    
    # Combine features (must match training feature count and column string type)
    X_predict = pd.concat([X_core_features.reset_index(drop=True), 
                           pd.DataFrame(new_tfidf_vector.toarray()).reset_index(drop=True)], axis=1)
    
    # Ensure column names are strings for prediction
    X_predict.columns = X_predict.columns.astype(str)

    # Quality Score
    quality_label = realtime_model.predict(X_predict)[0]
    
    # Duplicate Check
    corpus_content_df = existing_df[existing_df['body_text'].str.len() > 0].copy()
    corpus_tfidf = realtime_vectorizer.transform(corpus_content_df['body_text'])
    corpus_urls = corpus_content_df['url']
    
    # Calculate similarity 
    new_sims = cosine_similarity(new_tfidf_vector, corpus_tfidf)[0]
    
    similar_to = []
    for i, sim in enumerate(new_sims):
        if sim >= SIMILARITY_THRESHOLD and sim < 0.999: 
            similar_to.append({
                "url": corpus_urls.iloc[i], 
                "similarity": round(sim, 4)
            })

    # Final Output
    return {
        "url": url,
        "word_count": word_count,
        "readability": round(readability, 1),
        "quality_label": quality_label,
        "is_thin": bool(word_count < THIN_CONTENT_WORD_COUNT),
        "similar_to": similar_to
    }

print("\n--- Testing Real-Time Analysis Demo ---")
test_url = "https://en.wikipedia.org/wiki/Digital_marketing"
print(f"Analyzing URL: {test_url}")
result = analyze_url(test_url)
print(json.dumps(result, indent=2))


--- Testing Real-Time Analysis Demo ---
Analyzing URL: https://en.wikipedia.org/wiki/Digital_marketing
{
  "url": "https://en.wikipedia.org/wiki/Digital_marketing",
  "word_count": 11368,
  "readability": 35.7,
  "quality_label": "Medium",
  "is_thin": false,
  "similar_to": []
}
