In [3]:
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def embed_texts(texts, max_features=512, normalize=True):
    """
    Simple text embedding using TF-IDF vectors.
    Returns a list of numpy arrays (one per text). Vectors are optionally L2-normalized.

    Parameters
    ----------
    texts : iterable of str or None
        Documents to embed. ``None`` entries are treated as empty strings;
        a ``None`` / empty collection yields an empty list.
    max_features : int
        Upper bound on the TF-IDF vocabulary size (= vector length).
    normalize : bool
        When True each vector has unit L2 norm (all-zero rows stay zero);
        when False the raw TF-IDF weights are returned.

    Returns
    -------
    list of numpy.ndarray
        One float32 vector per input text, in input order.

    Raises
    ------
    ValueError
        Propagated from ``TfidfVectorizer`` when no vocabulary can be built
        (e.g. every document is empty).
    """
    docs = ['' if t is None else str(t) for t in (texts if texts is not None else [])]
    if not docs:
        return []
    # TfidfVectorizer normalizes rows itself; norm=None disables that so the
    # `normalize` flag is honoured in both directions.
    vectorizer = TfidfVectorizer(max_features=max_features, norm='l2' if normalize else None)
    matrix = vectorizer.fit_transform(docs).toarray().astype(np.float32)
    return list(matrix)
# Create or update a joblib pre-trained embeddings file from DB (supabase)
import os
import joblib
import numpy as np
from dotenv import load_dotenv
load_dotenv()
# attempt SBERT first, fallback to TF-IDF
try:
    from sentence_transformers import SentenceTransformer
    sbert = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    # Best-effort: the package may be missing or the model load may fail.
    # Say so, otherwise the switch to TF-IDF happens without explanation.
    print('SBERT unavailable; embeddings will fall back to TF-IDF:', e)
    sbert = None

# supabase config from env (ensure these are set in your notebook env)
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
sb = None  # stays None whenever the client cannot be built; later code checks for that
if not (SUPABASE_URL and SUPABASE_KEY):
    print('SUPABASE_URL / SUPABASE_KEY not set; no rows will be fetched from Supabase.')
else:
    try:
        from supabase import create_client
        sb = create_client(SUPABASE_URL, SUPABASE_KEY)
    except Exception as e:
        print('Supabase client could not be created:', e)

# fetch past novelty entries from DB table 'novelty_reports' (fallback to storage not implemented here)
# For every row we keep up to 5 'unique_sections' plus the first 1000 characters of the raw text.
past_texts = []
if sb is not None:
    try:
        res = sb.table('novelty_reports').select('filename,result').limit(2000).execute()
        rows = getattr(res, 'data', []) or []
        seen_texts = set()  # O(1) duplicate check on exactly what gets stored
        for r in rows:
            content = r.get('result') or {}
            if not isinstance(content, dict):
                # A malformed row should not abort the whole collection.
                continue
            sections = content.get('unique_sections') or []
            if isinstance(sections, str):
                sections = [sections]  # a bare string would otherwise be iterated per character
            # Truncate first, then de-duplicate, so the check matches the stored snippet.
            snippet = (content.get('raw_text') or content.get('raw') or '')[:1000]
            for piece in [*sections[:5], snippet]:
                # Only non-empty strings are kept; later steps call .strip() on every entry.
                if isinstance(piece, str) and piece and piece not in seen_texts:
                    seen_texts.add(piece)
                    past_texts.append(piece)
    except Exception as e:
        print('Supabase read failed in notebook:', e)

# Embedding artifacts are kept in a 'pre-trained' folder under the current working directory;
# the folder is created on first use.
JOBLIB_PATH = os.path.join(os.getcwd(), 'pre-trained', 'novelty_embeddings.joblib')
PRETRAIN_DIR = os.path.dirname(JOBLIB_PATH)
os.makedirs(PRETRAIN_DIR, exist_ok=True)

def compute_and_save_embeddings(texts):
    """Embed `texts` and persist texts + vectors to JOBLIB_PATH.

    The stored object is a dict with keys:
      'texts'  - the list of input texts
      'embs'   - list-of-lists of float32 values (None when `texts` is empty)
      'method' - 'sbert', 'tfidf' or None, i.e. which backend produced 'embs'

    SBERT is used when the module-level `sbert` model is available; if it is
    missing or fails, TF-IDF (max 1024 features, English stop words, rows
    L2-normalized) is used instead. If both fail, nothing is written and a
    message is printed.

    Parameters
    ----------
    texts : iterable of str

    Returns
    -------
    None
    """
    texts = list(texts or [])
    if not texts:
        joblib.dump({'texts': [], 'embs': None, 'method': None}, JOBLIB_PATH)
        return
    if sbert is not None:
        try:
            embs = sbert.encode(texts, convert_to_tensor=False, show_progress_bar=True)
            embs = np.vstack([np.array(e).astype(np.float32) for e in embs])
            joblib.dump({'texts': texts, 'embs': embs.tolist(), 'method': 'sbert'}, JOBLIB_PATH)
            print('Saved SBERT embeddings to', JOBLIB_PATH)
            return
        except Exception as e:
            print('SBERT embedding failed:', e)
    # fallback to TF-IDF if SBERT not available
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(max_features=1024, stop_words='english')
        X = vec.fit_transform(texts).toarray()
        # normalize rows; all-zero rows keep norm 1 to avoid division by zero
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms==0] = 1
        X = (X / norms).astype(np.float32)
        # 'method' lets readers of the file tell these vectors apart from SBERT ones.
        joblib.dump({'texts': texts, 'embs': X.tolist(), 'method': 'tfidf'}, JOBLIB_PATH)
        print('Saved TF-IDF embeddings to', JOBLIB_PATH)
        return
    except Exception as e:
        print('TF-IDF fallback failed:', e)
    # Both backends failed: make it explicit that any file at JOBLIB_PATH is from an earlier run.
    print('No embeddings were written; any existing file at', JOBLIB_PATH, 'is unchanged.')

# Strip each text, drop blanks and repeats (first occurrence wins), and stop
# once 2000 entries are collected so the saved file stays manageable.
seen = set()
unique_texts = []
for raw_text in past_texts:
    cleaned = (raw_text or '').strip()
    if cleaned and cleaned not in seen:
        seen.add(cleaned)
        unique_texts.append(cleaned)
        if len(unique_texts) >= 2000:
            break

compute_and_save_embeddings(unique_texts)

# Read the file back and report how many texts it holds
try:
    data = joblib.load(JOBLIB_PATH)
    saved_texts = data.get('texts', [])
    print('joblib contains texts:', len(saved_texts))
except Exception as e:
    print('Failed to load saved joblib:', e)

joblib contains texts: 0


In [4]:
# Load libraries
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sns.set(style="whitegrid")

# Supabase client (reuse env vars)
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
try:
    from supabase import create_client
    sb = create_client(SUPABASE_URL, SUPABASE_KEY) if SUPABASE_URL and SUPABASE_KEY else None
except Exception as e:
    # Best-effort: report why the client is missing instead of failing silently.
    print('Supabase client unavailable:', e)
    sb = None

# Fetch labeled novelty rows
rows = []
if sb is not None:
    try:
        res = sb.table('novelty_reports').select('filename,novelty_percentage,result').limit(2000).execute()
        rows = getattr(res, 'data', []) or []
    except Exception as e:
        print('Failed to read novelty_reports from Supabase:', e)


def row_to_example(row):
    """Turn one 'novelty_reports' row into a ``(text, label)`` pair.

    Label: ``result['novelty_percentage']`` when present, otherwise the
    row-level ``novelty_percentage`` column; converted to an int (numeric
    strings such as "87.5" are truncated to 87).
    Text: the first entry of ``result['unique_sections']`` or, failing that,
    the first 1000 characters of ``result['raw_text']`` / ``result['raw']``.

    Returns None when either the text or the label is missing/unparseable.
    """
    result = row.get('result') or {}
    if not isinstance(result, dict):
        result = {}
    label = result.get('novelty_percentage')
    if label is None:
        # Fall back to the column that the query selects alongside 'result'.
        label = row.get('novelty_percentage')
    try:
        label = int(float(label)) if label is not None else None
    except (TypeError, ValueError, OverflowError):
        label = None
    sections = result.get('unique_sections') or []
    if sections:
        text = sections[0]
    else:
        text = (result.get('raw_text') or result.get('raw') or '')[:1000]
    if not text or label is None:
        return None
    return text, label


# Build parallel lists of text (representative) and label
texts = []
labels = []
for r in rows:
    example = row_to_example(r)
    if example is None:
        continue
    texts.append(example[0])
    labels.append(example[1])

# Evaluation settings
MAX_EVAL_ROWS = 1000  # upper bound on evaluated rows (the similarity matrix is n x n)
TOP_K = 5             # number of nearest neighbours whose similarity is averaged

n = min(len(texts), MAX_EVAL_ROWS)
if n == 0:
    print('No labeled rows found to evaluate.')
else:
    texts = texts[:n]
    labels = labels[:n]

    # Try loading precomputed joblib embeddings (if available)
    # NOTE(review): emb_store is loaded here but the heuristic below does not use it;
    # it only compares the evaluation texts with each other.
    JOBLIB_PATH = os.path.join(os.getcwd(), 'pre-trained', 'novelty_embeddings.joblib')
    emb_store = None
    try:
        data = joblib.load(JOBLIB_PATH)
        emb_store = data.get('embs', None)
        if emb_store is not None:
            emb_store = np.asarray(emb_store)
    except Exception:
        emb_store = None

    # Create embeddings for our evaluation texts (prefer SBERT, fallback TF-IDF)
    try:
        from sentence_transformers import SentenceTransformer
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        eval_embs = embedder.encode(texts, convert_to_tensor=False, show_progress_bar=False)
        eval_embs = np.vstack([np.array(e).astype(np.float32) for e in eval_embs])
    except Exception as e:
        print('SBERT not available; falling back to TF-IDF for evaluation:', e)
        vec = TfidfVectorizer(max_features=1024, stop_words='english')
        X = vec.fit_transform(texts).toarray()
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms==0] = 1
        eval_embs = (X / norms).astype(np.float32)

    # Heuristic prediction: novelty = (1 - mean(top-k similarity to other docs)) * 100
    # One n x n similarity matrix replaces the per-row np.delete copy of the whole embedding matrix.
    sim = np.asarray(cosine_similarity(eval_embs), dtype=np.float64)
    np.fill_diagonal(sim, -np.inf)  # a document must not count as its own neighbour
    k = min(TOP_K, sim.shape[0] - 1)
    if k == 0:
        # A single document has nothing to be compared with: fully novel.
        preds = [100.0] * sim.shape[0]
    else:
        topk = -np.sort(-sim, axis=1)[:, :k]
        # Negative similarities would push the score above 100; keep it on the label scale.
        preds = np.clip((1.0 - topk.mean(axis=1)) * 100.0, 0.0, 100.0).tolist()

    # Metrics
    mae = mean_absolute_error(labels, preds)
    mse = mean_squared_error(labels, preds)
    rmse = mse ** 0.5
    r2 = r2_score(labels, preds)
    print(f'Count: {len(labels)}; MAE: {mae:.2f}; RMSE: {rmse:.2f}; R2: {r2:.3f}')

    df = pd.DataFrame({'actual': labels, 'pred': preds})

    # Scatter: actual vs predicted (dashed line = perfect prediction)
    plt.figure(figsize=(7,6))
    sns.scatterplot(data=df, x='actual', y='pred', alpha=0.6)
    plt.plot([0,100],[0,100], 'r--')
    plt.xlabel('Actual Novelty (%)')
    plt.ylabel('Predicted Novelty (%)')
    plt.title('Actual vs Predicted Novelty')
    plt.tight_layout()
    plt.show()

    # Residuals histogram
    df['error'] = df['pred'] - df['actual']
    plt.figure(figsize=(7,4))
    sns.histplot(df['error'], bins=40, kde=True)
    plt.title('Prediction Error Distribution (pred - actual)')
    plt.xlabel('Error')
    plt.tight_layout()
    plt.show()

    # Calibration: bin by actual novelty and show mean predicted vs actual per-bin
    df['bin'] = pd.cut(df['actual'], bins=10)
    # observed=True keeps only bins that contain rows (no NaN means for empty bins).
    calib = (
        df.groupby('bin', observed=True)
        .agg(mean_actual=('actual','mean'), mean_pred=('pred','mean'), count=('actual','count'))
        .reset_index()
    )
    plt.figure(figsize=(8,4))
    sns.lineplot(data=calib, x='mean_actual', y='mean_pred', marker='o')
    plt.plot([0,100],[0,100],'r--')
    plt.xlabel('Mean Actual Novelty (bin)')
    plt.ylabel('Mean Predicted Novelty')
    plt.title('Calibration by Actual Novelty Bins')
    plt.tight_layout()
    plt.show()

No labeled rows found to evaluate.


In [5]:
# Export labeling samples for manual annotation
import os
import csv
import random
from collections import defaultdict
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
try:
    from supabase import create_client
    sb = create_client(SUPABASE_URL, SUPABASE_KEY) if SUPABASE_URL and SUPABASE_KEY else None
except Exception as e:
    # Best-effort: report why the client is missing instead of failing silently.
    print('Supabase client unavailable:', e)
    sb = None
rows = []
if sb is not None:
    try:
        res = sb.table('novelty_reports').select('filename,novelty_percentage,result').limit(5000).execute()
        rows = getattr(res, 'data', []) or []
    except Exception as e:
        print('Failed to read novelty_reports from Supabase:', e)

SAMPLE_SEED = 42  # fixed seed so the exported sample is reproducible; use None for a fresh draw


def row_to_candidate(row):
    """Build a labeling candidate dict from one 'novelty_reports' row.

    The existing label is ``result['novelty_percentage']`` when present, else the
    row-level ``novelty_percentage`` column, converted to int (None if absent or
    unparseable). The text is the first ``unique_sections`` entry or the first
    1000 characters of ``raw_text`` / ``raw``.

    Returns ``{'filename', 'text', 'existing_label'}`` or None when there is no text.
    """
    result = row.get('result') or {}
    if not isinstance(result, dict):
        result = {}
    label = result.get('novelty_percentage')
    if label is None:
        # Fall back to the column that the query selects alongside 'result'.
        label = row.get('novelty_percentage')
    try:
        label = int(float(label)) if label is not None else None
    except (TypeError, ValueError, OverflowError):
        label = None
    sections = result.get('unique_sections') or []
    if sections:
        text = sections[0]
    else:
        text = (result.get('raw_text') or result.get('raw') or '')[:1000]
    if not text:
        return None
    return {'filename': row.get('filename'), 'text': text, 'existing_label': label}


def label_bin(label):
    """Return the bin key for a label: 'none' for None, else the label floored to a multiple of 10."""
    if label is None:
        return 'none'
    return int(label // 10) * 10


def stratified_sample(candidates, sample_size, rng=None):
    """Pick candidates spread across label bins.

    Every non-empty bin contributes up to ``max(1, sample_size // number_of_bins)``
    rows; if that yields fewer than `sample_size` rows the rest is filled at
    random from the rows not picked yet. Because each bin contributes at least
    one row, the result can exceed `sample_size` when there are more bins than
    requested rows.

    `rng` is any object with a ``sample(population, k)`` method (defaults to the
    ``random`` module).
    """
    if rng is None:
        rng = random
    if not candidates:
        return []
    bins = defaultdict(list)
    for idx, cand in enumerate(candidates):
        bins[label_bin(cand['existing_label'])].append(idx)
    # Numeric bins ascending, 'none' last. Plain sorted() cannot compare int with str.
    keys = sorted(bins, key=lambda b: (b == 'none', 0 if b == 'none' else b))
    per_bin = max(1, sample_size // max(1, len(keys)))
    chosen = []
    for key in keys:
        group = bins[key]
        if len(group) <= per_bin:
            chosen.extend(group)
        else:
            chosen.extend(rng.sample(group, per_bin))
    # if undersampled, fill randomly (tracked by position, so equal-looking rows are not lost)
    if len(chosen) < sample_size:
        taken = set(chosen)
        remaining = [i for i in range(len(candidates)) if i not in taken]
        extra = min(len(remaining), sample_size - len(chosen))
        if extra > 0:
            chosen.extend(rng.sample(remaining, extra))
    return [candidates[i] for i in chosen]


def write_labeling_csv(samples, path):
    """Write `samples` to `path` with columns filename, text_snippet, existing_label, new_label, notes."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['filename','text_snippet','existing_label','new_label','notes'])
        for item in samples:
            snippet = str(item['text']).replace('\n',' ').strip()
            existing = item['existing_label'] if item['existing_label'] is not None else ''
            writer.writerow([item['filename'], snippet, existing, '', ''])


# Build candidates list
candidates = [c for c in map(row_to_candidate, rows) if c is not None]
print(f'Found {len(candidates)} candidate rows for labeling')
if len(candidates) == 0:
    print('No candidates found; ensure SUPABASE env vars are set and table exists')
else:
    # desired sample size (change as needed)
    sample_size = 500
    sampled = stratified_sample(candidates, sample_size, rng=random.Random(SAMPLE_SEED))
    # prepare output dir
    out_dir = os.path.join(os.getcwd(), 'pre-trained')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'novelty_labeling_samples.csv')
    write_labeling_csv(sampled, out_path)
    print('Wrote labeling CSV to', out_path)
    print('Open the CSV, add `new_label` values (0-100), then save. After labeling, run the evaluation cell to compute metrics against your new labels.')

Found 0 candidate rows for labeling
No candidates found; ensure SUPABASE env vars are set and table exists
