# SEO Content Quality & Duplicate Detector

This notebook contains a complete pipeline template for the assignment.

In [22]:
!pip install beautifulsoup4 lxml scikit-learn pandas numpy textstat sentence-transformers joblib nltk



In [23]:
import os
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

print('Imports ok')

Imports ok


In [24]:
from pathlib import Path
DATA_DIR = Path('data')
DATA_DIR.mkdir(exist_ok=True)
DATA_CSV = DATA_DIR / 'data.csv'
EXTRACTED_CSV = DATA_DIR / 'extracted_content.csv'
FEATURES_CSV = DATA_DIR / 'features.csv'
DUPLICATES_CSV = DATA_DIR / 'duplicates.csv'

if DATA_CSV.exists():
    df = pd.read_csv(DATA_CSV)
    print('Loaded', DATA_CSV, 'shape=', df.shape)
else:
    print('Please add data/data.csv with url and html_content columns.')

Loaded data\data.csv shape= (81, 2)


In [25]:
def extract_title_and_body_from_html(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        title = soup.title.get_text(strip=True) if soup.title else ''
        main = soup.find('article') or soup.find('main')
        if main:
            parts = [p.get_text(separator=' ', strip=True) for p in main.find_all('p')]
            body = ' '.join(parts)
        else:
            parts = [p.get_text(separator=' ', strip=True) for p in soup.find_all('p')]
            body = ' '.join(parts)
        if not body:
            body = soup.get_text(separator=' ', strip=True)
        body = re.sub(r'\s+', ' ', body).strip()
        return title, body
    except Exception:
        return '', ''

In [26]:
if 'df' in globals() and 'html_content' in df.columns:
    rows = []
    for _, r in df.iterrows():
        url = r.get('url','')
        html = r.get('html_content','') or ''
        title, body = extract_title_and_body_from_html(html)
        rows.append({'url': url, 'title': title, 'body_text': body, 'word_count': len(body.split())})
    extracted_df = pd.DataFrame(rows)
    extracted_df.to_csv(EXTRACTED_CSV, index=False)
    print('Saved', EXTRACTED_CSV)
else:
    print('No html_content column found in data.csv; run scraping or provide html_content.')

Saved data\extracted_content.csv


In [28]:
# Feature engineering (readability + TF-IDF top keywords extraction)
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Optional readability library
try:
    import textstat
    _HAS_TEXTSTAT = True
except Exception:
    _HAS_TEXTSTAT = False

def estimate_syllables(word):
    word = word.lower()
    vowels = 'aeiouy'
    count = 0
    if word and word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count = max(1, count - 1)
    return max(1, count)

def compute_readability(text):
    """Calculate Flesch Reading Ease score (fallback if textstat unavailable)."""
    if _HAS_TEXTSTAT:
        return textstat.flesch_reading_ease(text)
    sents = max(1, len(sent_tokenize(text)))
    words = text.split()
    words_count = max(1, len(words))
    sylls = sum(estimate_syllables(w) for w in words)
    asl = words_count / sents
    asw = sylls / words_count
    score = 206.835 - 1.015 * asl - 84.6 * asw
    return score

# ---- Feature Extraction ----
if EXTRACTED_CSV.exists():
    ex = pd.read_csv(EXTRACTED_CSV)

    # Basic text metrics
    ex['sentence_count'] = ex['body_text'].fillna('').apply(lambda t: len(sent_tokenize(t)) if t.strip() else 0)
    ex['word_count'] = ex['body_text'].fillna('').apply(lambda t: len(t.split()))
    ex['flesch_reading_ease'] = ex['body_text'].fillna('').apply(
        lambda t: compute_readability(t) if t.strip() else 0.0
    )

    # Build TF-IDF model
    corpus = ex['body_text'].fillna('').astype(str).tolist()
    if any(corpus):
        vect = TfidfVectorizer(
            max_df=0.8, min_df=1, stop_words='english', ngram_range=(1, 2)
        )
        X = vect.fit_transform(corpus)
        feature_names = np.array(vect.get_feature_names_out())

        top_keywords_list = []
        for i in range(X.shape[0]):
            row = X[i].toarray().ravel()   # convert sparse matrix row to dense array
            top_idx = row.argsort()[-5:][::-1]
            keywords = '|'.join(feature_names[top_idx])
            top_keywords_list.append(keywords)

        ex['top_keywords'] = top_keywords_list
        tfidf_matrix = X
        tfidf_vect = vect
    else:
        ex['top_keywords'] = ''
        tfidf_matrix = None
        tfidf_vect = None

    # Save features
    ex.to_csv(FEATURES_CSV, index=False)
    print('✅ Saved features to', FEATURES_CSV)
    display(ex.head())
else:
    print('⚠️ Run the extraction step first (extracted_content.csv missing).')


✅ Saved features to data\features.csv


Unnamed: 0,url,title,body_text,word_count,sentence_count,flesch_reading_ease,top_keywords
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,326,6,-6.816181,cyber|alliance|cyber management|management all...
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Cybersecurity is gaining more importance globa...,1578,78,38.946453,varonis|access|data|security|app
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Cybersecurity is inextricably tied to the tech...,946,61,53.698274,password|passphrase|authentication|protect|device
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,Cyberspace is particularly difficult to secure...,489,22,9.65399,cisa|cybersecurity|cyber|nation|cybersecurity ...
4,https://www.qnbtrust.bank/Resources/Learning-C...,,,0,0,0.0,00|тhe gartner|тhe|čapek films|čapek


In [29]:
# Duplicate detection (cosine similarity)
if 'tfidf_matrix' in globals():
    sim = cosine_similarity(tfidf_matrix)
    threshold = 0.80
    pairs = []
    n = sim.shape[0]
    for i in range(n):
        for j in range(i+1, n):
            if sim[i,j] >= threshold:
                pairs.append({'url1': ex.loc[i,'url'], 'url2': ex.loc[j,'url'], 'similarity': float(sim[i,j])})
    pd.DataFrame(pairs).to_csv(DUPLICATES_CSV, index=False)
    print('Saved duplicates to', DUPLICATES_CSV)
else:
    print('TF-IDF not available; skip duplicates.')

Saved duplicates to data\duplicates.csv


In [31]:
# Labeling & Model training
if 'ex' in globals():
    def quality_label(row):
        wc = row['word_count']; fr = row['flesch_reading_ease']
        if (wc > 1500) and (50 <= fr <= 70):
            return 'High'
        if (wc < 500) or (fr < 30):
            return 'Low'
        return 'Medium'
    ex['quality_label'] = ex.apply(quality_label, axis=1)
    features = ex[['word_count','sentence_count','flesch_reading_ease']].fillna(0)
    le = LabelEncoder()
    y = le.fit_transform(ex['quality_label'])
    X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.3, random_state=42, stratify=y)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    os.makedirs("models", exist_ok=True)
    joblib.dump(clf, 'models/quality_model.pkl')
    joblib.dump(le, 'models/label_encoder.pkl')
    print('Saved model to models/quality_model.pkl')
else:
    print('Features missing; cannot train model.')

              precision    recall  f1-score   support

        High       0.67      1.00      0.80         2
         Low       0.94      1.00      0.97        15
      Medium       1.00      0.75      0.86         8

    accuracy                           0.92        25
   macro avg       0.87      0.92      0.87        25
weighted avg       0.94      0.92      0.92        25

Saved model to models/quality_model.pkl


# Live content analyzer for any webpage

In [36]:
import requests, json
from pathlib import Path
import joblib

def analyze_url(url):
    """Fetches a webpage, extracts features, and predicts content quality."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; SEO-Content-Detector/1.0)'}
        r = requests.get(url, headers=headers, timeout=15)

        if r.status_code != 200:
            return {'url': url, 'error': f'HTTP {r.status_code}'}

        # ---- Extract title and body ----
        title, body = extract_title_and_body_from_html(r.text)
        if not body.strip():
            return {'url': url, 'error': 'Empty body text extracted'}

        # ---- Compute features ----
        wc = len(body.split())
        sc = len([s for s in body.split('.') if s.strip()])
        fr = compute_readability(body)

        # ---- Predict using trained model ----
        model_path = Path('models/quality_model.pkl')
        encoder_path = Path('models/label_encoder.pkl')

        if model_path.exists() and encoder_path.exists():
            clf = joblib.load(model_path)
            le = joblib.load(encoder_path)
            pred = clf.predict([[wc, sc, fr]])
            label = le.inverse_transform(pred)[0]
        else:
            label = "Unknown (model not found)"

        # ---- Build result ----
        result = {
            'url': url,
            'title': title,
            'word_count': wc,
            'sentence_count': sc,
            'flesch_reading_ease': round(fr, 2),
            'quality_label': label
        }
        return result

    except Exception as e:
        return {'url': url, 'error': str(e)}

# ------------------------------
# Testing the live analyzer
# ------------------------------
test_url = "https://www.bbc.com/news/technology" 
result = analyze_url(test_url)

# Pretty print the output
print("\n Live SEO Content Analysis Result:")
print(json.dumps(result, indent=2))



 Live SEO Content Analysis Result:
{
  "url": "https://www.bbc.com/news/technology",
  "title": "BBC Innovation | Technology, Health, Environment, AI",
  "word_count": 997,
  "sentence_count": 63,
  "flesch_reading_ease": 54.81,
  "quality_label": "Medium"
}


