In [1]:
!pip install pandas numpy beautifulsoup4 lxml nltk textstat scikit-learn sentence-transformers matplotlib seaborn



In [37]:
# Full edited pipeline: improved parsing, percentile labeling, calibrated model, analyze_url()
import os
import json
import joblib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix

import textstat

# ---------------- Paths ----------------
# All inputs/outputs live under DATA_DIR; trained artifacts go to MODEL_DIR.
DATA_DIR = '../data'
RAW_CSV = os.path.join(DATA_DIR, 'data.csv')               # input (Kaggle)
EXTRACTED_CSV = os.path.join(DATA_DIR, 'extracted_content.csv')
FEATURES_CSV = os.path.join(DATA_DIR, 'features.csv')
DUPLICATES_CSV = os.path.join(DATA_DIR, 'duplicates.csv')
MODEL_DIR = '../models'
MODEL_PATH = os.path.join(MODEL_DIR, 'quality_model.pkl')
os.makedirs(MODEL_DIR, exist_ok=True)

# 1) Load raw dataset
df_raw = pd.read_csv(RAW_CSV)

# Fail early with a readable message: the steps below index df_raw by these two columns,
# and a missing one would otherwise surface later as a bare KeyError.
_missing_columns = [col for col in ('url', 'html_content') if col not in df_raw.columns]
if _missing_columns:
    raise ValueError(f"{RAW_CSV} is missing required column(s): {_missing_columns}")

print(f"Loaded {len(df_raw)} rows from {RAW_CSV}")

# 2) Improved HTML parsing: choose largest meaningful block
def parse_html_largest_block(html):
    """Pull the page title and the longest container text out of an HTML string.

    The text of every <article>, <main>, <section> and <div> element is extracted
    and the longest one wins (the first one encountered on ties). If none of them
    yields any text, the text of all <p> elements is joined instead.

    Parameters
    ----------
    html : str
        Raw HTML. Anything that is not a non-blank string yields the empty result.

    Returns
    -------
    tuple
        ``(title, clean_text, word_count)`` where ``clean_text`` has its whitespace
        collapsed to single spaces and ``word_count`` is the number of
        whitespace-separated tokens in it.
    """
    # Guard clause: non-strings and blank strings carry no content.
    if not (isinstance(html, str) and html.strip()):
        return "", "", 0

    soup = BeautifulSoup(html, 'lxml')

    title_tag = soup.title
    if title_tag and title_tag.string:
        title = title_tag.string.strip()
    else:
        title = ""

    def _text_of(node):
        # Text extraction is best-effort: a node that fails contributes nothing.
        try:
            return node.get_text(separator=' ', strip=True)
        except Exception:
            return ""

    containers = soup.find_all(['article', 'main', 'section', 'div'])
    # max() keeps the first of several equally long texts, like a strict ">" scan.
    body = max(map(_text_of, containers), key=len, default="")

    if not body:
        body = ' '.join(p.get_text(separator=' ', strip=True) for p in soup.find_all('p'))

    tokens = body.split()
    return title, ' '.join(tokens), len(tokens)

# Run the parser over every raw page; each returned tuple becomes one row with named columns.
parsed_columns = ['title','body_text','word_count']
parsed = df_raw['html_content'].apply(
    lambda page_html: pd.Series(parse_html_largest_block(page_html), index=parsed_columns)
)
url_frame = df_raw[['url']].reset_index(drop=True)
df = pd.concat([url_frame, parsed.reset_index(drop=True)], axis=1)
df.to_csv(EXTRACTED_CSV, index=False)
print(f"Saved parsed content -> {EXTRACTED_CSV}")

# 3) Text cleaning & basic features
def _count_sentences(text):
    """Number of non-blank pieces obtained by splitting the text on '.'."""
    return sum(1 for piece in text.split('.') if piece.strip())

def _reading_ease(text):
    """Flesch reading ease of the text, or 0.0 for blank text."""
    return textstat.flesch_reading_ease(text) if text.strip() else 0.0

body_as_str = df['body_text'].fillna('').astype(str)
df['clean_text'] = body_as_str.str.lower().str.replace(r'\s+', ' ', regex=True).str.strip()
df['sentence_count'] = df['clean_text'].apply(_count_sentences)
df['flesch_reading_ease'] = df['clean_text'].apply(_reading_ease)
# Pages with fewer than 500 words are flagged as thin content.
df['is_thin'] = df['word_count'].lt(500)

# 4) TF-IDF vectorization and top keywords
# The fitted vectorizer and matrix are reused later for duplicate detection and similarity lookups.
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'].fillna(''))
feature_names = vectorizer.get_feature_names_out()

def get_top_k_keywords(row_tfidf, k=5, vocab=None):
    """Return the highest-weighted terms of one TF-IDF row as a '|'-joined string.

    Parameters
    ----------
    row_tfidf : sparse matrix row
        A single row exposing ``.nnz`` and ``.toarray()``.
    k : int, default 5
        Maximum number of terms to return. Values <= 0 yield an empty string.
    vocab : sequence of str, optional
        Term for each column index. Defaults to the module-level ``feature_names``.

    Returns
    -------
    str
        Up to ``k`` terms with a strictly positive weight, highest weight first,
        joined by '|'; '' when the row is empty or ``k`` is not positive.
    """
    # k == 0 must be handled explicitly: the slice [-0:] would select the whole array.
    if k <= 0 or row_tfidf.nnz == 0:
        return ''
    terms = feature_names if vocab is None else vocab
    weights = row_tfidf.toarray().flatten()
    # argsort is ascending, so take the last k indices and reverse them.
    top_idx = np.argsort(weights)[-k:][::-1]
    # Drop zero-weight columns that only appear because the row has fewer than k terms.
    return '|'.join(terms[i] for i in top_idx if weights[i] > 0)

# Top TF-IDF keywords per document, then persist the feature table.
df['top_keywords'] = [get_top_k_keywords(tfidf_matrix[i], k=5) for i in range(tfidf_matrix.shape[0])]
df_features = df[['url','word_count','sentence_count','flesch_reading_ease','is_thin','top_keywords','clean_text']].copy()
df_features.to_csv(FEATURES_CSV, index=False)
print(f"Saved features -> {FEATURES_CSV}")

# 5) Duplicate detection
# Pairwise cosine similarity over the TF-IDF rows; every unordered pair (i < j) whose
# similarity reaches the threshold is reported once.
sim_matrix = cosine_similarity(tfidf_matrix)
threshold = 0.80
n = sim_matrix.shape[0]
# Indices of the strict upper triangle, in row-major order (same order as a nested i<j loop).
upper_i, upper_j = np.triu_indices(n, k=1)
is_duplicate = sim_matrix[upper_i, upper_j] >= threshold
urls = df['url'].to_numpy()
pairs = [
    {'url1': urls[i], 'url2': urls[j], 'similarity': float(sim_matrix[i, j])}
    for i, j in zip(upper_i[is_duplicate], upper_j[is_duplicate])
]
# Explicit columns keep the CSV header intact even when no pair reaches the threshold.
dup_df = pd.DataFrame(pairs, columns=['url1', 'url2', 'similarity'])
dup_df.to_csv(DUPLICATES_CSV, index=False)
print(f"Saved duplicates -> {DUPLICATES_CSV} (found {len(dup_df)} pairs at threshold {threshold})")

# 6) Create adaptive labels
# Thresholds are derived from this corpus: the 90th percentile of word count (at least 1)
# and the 60th percentile of Flesch reading ease.
wc_90 = max(1, int(df['word_count'].quantile(0.90)))
flesch_60 = df['flesch_reading_ease'].quantile(0.60)
print(f"Adaptive thresholds -> word_count >= {wc_90}, flesch >= {flesch_60:.2f}")

def assign_quality_percentile(wc, flesch, wc_high=None, flesch_high=None, wc_low=500, flesch_low=30):
    """Rule-based quality label from word count and Flesch reading ease.

    Parameters
    ----------
    wc : number
        Word count of the document.
    flesch : number
        Flesch reading-ease score of the document.
    wc_high, flesch_high : number, optional
        Minimum word count / reading ease for 'High'. Default to the module-level
        ``wc_90`` and ``flesch_60`` computed from the corpus.
    wc_low : number, default 500
        Documents with fewer words than this are 'Low'.
    flesch_low : number, default 30
        Documents with a reading ease below this are 'Low'.

    Returns
    -------
    str
        'High', 'Low' or 'Medium'. The 'High' rule is checked first, so it wins
        when both the 'High' and the 'Low' rule would match.
    """
    # Fall back to the corpus-derived thresholds only when the caller gave none.
    if wc_high is None:
        wc_high = wc_90
    if flesch_high is None:
        flesch_high = flesch_60

    if wc >= wc_high and flesch >= flesch_high:
        return 'High'
    if wc < wc_low or flesch < flesch_low:
        return 'Low'
    return 'Medium'

# Label every document with the rule-based scheme (uses the adaptive thresholds).
df['quality_label'] = df.apply(lambda r: assign_quality_percentile(r['word_count'], r['flesch_reading_ease']), axis=1)
print("Label distribution (adaptive):\n", df['quality_label'].value_counts())

# 7) Train calibrated RandomForest
feature_cols = ['word_count', 'sentence_count', 'flesch_reading_ease']
X = df[feature_cols].fillna(0).astype(float)
le = LabelEncoder()
y_enc = le.fit_transform(df['quality_label'])

# Prefer a stratified split so rare labels appear in both partitions; when stratification
# is rejected (ValueError), fall back to a plain random split and say so.
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.3, random_state=42, stratify=y_enc)
except ValueError as split_err:
    print(f"Stratified split failed ({split_err}); using an unstratified split instead")
    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.3, random_state=42)

base_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# dynamically choose cv based on minimum class count
train_class_sizes = Counter(y_train)
min_class_count = min(train_class_sizes.values())
cv_folds = min(5, min_class_count)

if cv_folds >= 2:
    print(f"Using cv={cv_folds} for calibration due to class sizes {train_class_sizes}")
    clf = CalibratedClassifierCV(estimator=base_clf, cv=cv_folds, method='sigmoid')
    model_kind = 'calibrated model'
else:
    # Cross-validated calibration needs at least 2 folds, i.e. at least 2 training samples
    # in every class; otherwise train the plain forest instead of crashing.
    print(f"Skipping calibration: smallest class has {min_class_count} training sample(s) in {train_class_sizes}")
    clf = base_clf
    model_kind = 'uncalibrated model'
clf.fit(X_train, y_train)


# Evaluate on the held-out split using the original string labels.
y_pred_enc = clf.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred_enc)
y_test_labels = le.inverse_transform(y_test)
print(f"Classification report ({model_kind}):")
print(classification_report(y_test_labels, y_pred_labels))
print("Confusion matrix:")
print(confusion_matrix(y_test_labels, y_pred_labels))

# Save artifacts
# Everything needed to score a new page later: model, encoder, vectorizer, feature order, thresholds.
joblib.dump({'model': clf, 'label_encoder': le, 'vectorizer': vectorizer, 'feature_cols': feature_cols,
             'wc_90': wc_90, 'flesch_60': flesch_60}, MODEL_PATH)
print(f"Saved model artifacts -> {MODEL_PATH}")

# 8) Real-time analyze_url
def analyze_url(url, sim_threshold=0.75, top_k_similar=3, prob_threshold_high=0.65, prob_threshold_low=0.55, max_confidence=0.95):
    """Fetch a page, score its content quality and list similar corpus documents.

    Parameters
    ----------
    url : str
        Page to download (GET, 10 s timeout).
    sim_threshold : float, default 0.75
        Minimum cosine similarity for a corpus document to be listed in
        ``similar_to``. Pass 0 to list the top matches regardless of score.
    top_k_similar : int, default 3
        Maximum number of similar documents to list; values <= 0 list none.
    prob_threshold_high, prob_threshold_low : float
        Minimum (capped) model confidence required to accept a 'High' / 'Low'
        prediction. Otherwise the rule-based label is used.
    max_confidence : float, default 0.95
        Upper cap applied to the reported model confidence.

    Returns
    -------
    dict
        On fetch failure: ``{'error': 'fetch_failed: ...', 'url': url}``.
        Otherwise the keys ``url``, ``word_count``, ``readability``,
        ``quality_label``, ``is_thin``, ``similar_to``, ``_model_confidence_pct``
        and ``title``.
    """
    # Any failure while fetching (connection, timeout, HTTP error status) is reported, not raised.
    try:
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        resp.raise_for_status()
        html = resp.text
    except Exception as e:
        return {'error': f'fetch_failed: {e}', 'url': url}

    # Same feature recipe as the training pipeline: lower-cased, whitespace-collapsed text.
    title, body_text, word_count = parse_html_largest_block(html)
    clean_text = ' '.join(str(body_text).lower().split())
    readability = textstat.flesch_reading_ease(clean_text) if clean_text else 0.0
    sentence_count = len([s for s in clean_text.split('.') if s.strip()])
    is_thin = word_count < 500

    feat_df = pd.DataFrame([[word_count, sentence_count, readability]], columns=feature_cols)

    # Model scoring is best-effort: on any failure the rule-based label below is used.
    try:
        probs = clf.predict_proba(feat_df)[0]
        # Columns of predict_proba follow clf.classes_, so pick the winning column by
        # position and map it back through classes_ instead of indexing by label value.
        best = int(np.argmax(probs))
        pred_label = le.inverse_transform([clf.classes_[best]])[0]
        pred_conf = min(float(probs[best]), float(max_confidence))
        quality_score_pct = round(pred_conf * 100, 2)
    except Exception:
        pred_label = None
        quality_score_pct = None
        pred_conf = 0.0

    # Trust the model only for confident 'High'/'Low' calls; everything else is decided by rule.
    if pred_label == 'High' and pred_conf >= prob_threshold_high:
        final_label = 'High'
    elif pred_label == 'Low' and pred_conf >= prob_threshold_low:
        final_label = 'Low'
    else:
        final_label = assign_quality_percentile(word_count, readability)

    similar_docs = []
    # top_k_similar <= 0 must be skipped explicitly: the slice [-0:] would select everything.
    if clean_text and top_k_similar > 0:
        vec = vectorizer.transform([clean_text])
        sims = cosine_similarity(vec, tfidf_matrix).flatten()
        idxs = np.argsort(sims)[-top_k_similar:][::-1]
        for idx in idxs:
            # Candidates are in descending order, so the first miss ends the scan.
            if sims[idx] < sim_threshold:
                break
            # include the same URL
            similar_docs.append({'url': df.loc[idx, 'url'], 'similarity': float(sims[idx])})

    return {
        'url': url,
        'word_count': int(word_count),
        'readability': round(float(readability), 2),
        'quality_label': final_label,
        'is_thin': bool(is_thin),
        'similar_to': similar_docs,
        '_model_confidence_pct': quality_score_pct,
        'title': title
    }

# 9) Smoke test: analyze one live page and pretty-print the resulting report
test_url = "https://www.cm-alliance.com/cybersecurity-blog"
out = analyze_url(url=test_url)
_report_json = json.dumps(out, indent=2)
print(_report_json)


Loaded 81 rows from ../data/data.csv
Saved parsed content -> ../data/extracted_content.csv
Saved features -> ../data/features.csv
Saved duplicates -> ../data/duplicates.csv (found 24 pairs at threshold 0.8)
Adaptive thresholds -> word_count >= 6570, flesch >= 36.81
Label distribution (adaptive):
 Low       43
Medium    32
High       6
Name: quality_label, dtype: int64
Using cv=4 for calibration due to class sizes Counter({1: 30, 2: 22, 0: 4})
Classification report (calibrated model):
              precision    recall  f1-score   support

        High       0.50      0.50      0.50         2
         Low       1.00      1.00      1.00        13
      Medium       0.90      0.90      0.90        10

    accuracy                           0.92        25
   macro avg       0.80      0.80      0.80        25
weighted avg       0.92      0.92      0.92        25

Confusion matrix:
[[ 1  0  1]
 [ 0 13  0]
 [ 1  0  9]]
Saved model artifacts -> ../models/quality_model.pkl
{
  "url": "https://ww