# BM25 + Domain Lexicon with Pseudo‑Relevance Feedback (PRF)

In [1]:
import math
import re
import pandas as pd
import numpy as np
from dataclasses import dataclass

STOPWORDS = {
    'a','an','the','and','or','but','if','while','for','to','of','in','on','at','by','from',
    'as','is','are','was','were','be','been','being','it','its','this','that','these','those',
    'with','we','you','he','she','they','them','his','her','their','our','us','i','me','my','mine',
    'your','yours','ours','do','does','did','doing','have','has','had','having','will','would',
    'can','could','should','shall','may','might','must','not','no','yes','so','such','than','then'
}

TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

def tokenize(text: str):
    if not isinstance(text, str):
        return []
    toks = [t.lower() for t in TOKEN_RE.findall(text)]
    return [t for t in toks if t not in STOPWORDS and len(t) > 1]

def normalize_emails(df: pd.DataFrame) -> pd.DataFrame:
    out = df.rename(columns={"message_id": "email_id", "date": "email_ts"}).copy()
    out["email_ts"] = pd.to_datetime(out["email_ts"], utc=True, errors="coerce")
    keep = {"email_id","email_ts","subject","body","from","to","file"}
    cols = [c for c in out.columns if c in keep]
    return out[cols].dropna(subset=["email_id","email_ts"]).reset_index(drop=True)


In [2]:
ELECTRICITY_LEXICON = {
    'caiso','ancillary','congestion','schedule','ricochet','death','star','fat','boy',
    'outage','bid','megawatt','load','rto','iso','lmp','price','cap','overcharge',
    'reserve','balancing','settlement','counterflow','day','ahead','real','time'
}

MULTIWORD_HINTS = {
    'death star': ['death','star'],
    'fat boy': ['fat','boy'],
    'price cap': ['price','cap'],
    'real-time': ['real','time'],
    'day-ahead': ['day','ahead']
}


In [3]:
@dataclass
class BM25Index:
    docs: list
    doc_lens: np.ndarray
    avgdl: float
    df: dict
    idf: dict
    k1: float = 1.5
    b: float = 0.75

    def score(self, query_terms, doc_id):
        tf = {}
        for t in self.docs[doc_id]:
            tf[t] = tf.get(t, 0) + 1
        score = 0.0
        dl = self.doc_lens[doc_id]
        for t in query_terms:
            if t not in self.idf:
                continue
            f = tf.get(t, 0)
            if f == 0:
                continue
            idf = self.idf[t]
            denom = f + self.k1*(1 - self.b + self.b*dl/self.avgdl)
            score += idf * (f*(self.k1+1))/denom
        return score

def build_bm25(corpus_tokens, k1=1.5, b=0.75):
    N = len(corpus_tokens)
    df = {}
    doc_lens = np.array([len(d) for d in corpus_tokens], dtype=float)
    avgdl = float(doc_lens.mean()) if N > 0 else 0.0
    for d in corpus_tokens:
        seen = set()
        for t in d:
            if t in seen: continue
            df[t] = df.get(t, 0) + 1
            seen.add(t)
    idf = {}
    for t, df_t in df.items():
        idf[t] = math.log((N - df_t + 0.5) / (df_t + 0.5) + 1e-12)
    return BM25Index(
        docs=corpus_tokens, doc_lens=doc_lens, avgdl=avgdl, df=df, idf=idf, k1=k1, b=b
    )


In [4]:
def top_terms_from_docs(corpus_tokens, doc_ids, top_n=10, exclude=None):
    exclude = set(exclude or [])
    counts = {}
    for d in doc_ids:
        for t in corpus_tokens[d]:
            if t in exclude or t in STOPWORDS:
                continue
            counts[t] = counts.get(t, 0) + 1
    ranked = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    return [t for t,_ in ranked[:top_n]]

def expand_query_with_prf(bm25, corpus_tokens, init_query_terms, top_k=5, expand_n=8):
    scores = [bm25.score(init_query_terms, i) for i in range(len(corpus_tokens))]
    top_doc_ids = list(np.argsort(scores)[::-1][:top_k])
    expansion = top_terms_from_docs(corpus_tokens, top_doc_ids, top_n=expand_n, exclude=init_query_terms)
    expanded_query = list(dict.fromkeys(init_query_terms + expansion))
    return expanded_query, scores, top_doc_ids


In [5]:
def build_corpus(df: pd.DataFrame):
    texts = (df['subject'].fillna('') + ' ' + df['body'].fillna('')).tolist()
    return [tokenize(t) for t in texts]

def bm25_rank(df: pd.DataFrame, query_terms, k1=1.5, b=0.75):
    corpus_tokens = build_corpus(df)
    index = build_bm25(corpus_tokens, k1=k1, b=b)
    scores = [index.score(query_terms, i) for i in range(len(corpus_tokens))]
    out = df.copy()
    out['bm25_score'] = scores
    return out.sort_values('bm25_score', ascending=False).reset_index(drop=True), index, corpus_tokens

def bm25_with_prf(df: pd.DataFrame, seed_terms, top_k=5, expand_n=8, k1=1.5, b=0.75):
    init_rank, index, corpus_tokens = bm25_rank(df, seed_terms, k1=k1, b=b)
    expanded_query, init_scores, top_doc_ids = expand_query_with_prf(index, corpus_tokens, seed_terms, top_k=top_k, expand_n=expand_n)
    final_scores = [index.score(expanded_query, i) for i in range(len(corpus_tokens))]
    final = df.copy()
    final['bm25_score'] = final_scores
    final['rank_before_prf'] = pd.Series(init_scores).rank(ascending=False, method='min').astype(int)
    final['rank_after_prf'] = pd.Series(final_scores).rank(ascending=False, method='min').astype(int)
    final = final.sort_values('bm25_score', ascending=False).reset_index(drop=True)
    return {
        'initial_ranking': init_rank,
        'expanded_query': expanded_query,
        'top_doc_ids': top_doc_ids,
        'final_ranking': final
    }


In [6]:
import pandas as pd

enron_df = pd.DataFrame([
    {
        "message_id": "<18782981.1075855378110.JavaMail.evans@thyme>",
        "date": "2001-05-14 23:39:00Z",
        "subject": "CAISO congestion and price cap discussion",
        "body": "Team, CAISO congestion charges may spike; discuss LMP and schedule impacts. Keep this internal.",
        "from": "allen-p@enron.com",
        "to": "energy-team@enron.com",
        "file": "allen-p/_sent_mail/1."
    },
    {
        "message_id": "<15464986.1075855378456.JavaMail.evans@thyme>",
        "date": "2001-05-04 20:51:00Z",
        "subject": "Ancillary services bids and balancing market",
        "body": "Please review ancillary bids vs balancing requirements; potential outage risk noted.",
        "from": "allen-p@enron.com",
        "to": "grid-ops@enron.com",
        "file": "allen-p/_sent_mail/10."
    },
    {
        "message_id": "<20010515.083000.iris@enron>",
        "date": "2001-05-15 08:30:00Z",
        "subject": "Press talking points for analysts",
        "body": "Keep messaging strong on capacity; avoid mentioning the transmission constraint.",
        "from": "ir@enron.com",
        "to": "exec-staff@enron.com",
        "file": "ir/_sent_mail/77."
    },
    {
        "message_id": "<20010515.081200.ops@enron>",
        "date": "2001-05-15 08:12:00Z",
        "subject": "Outage schedule and Ricochet routing",
        "body": "Routing via out-of-state tie may trigger ricochet. Monitor settlement exposure.",
        "from": "ops@enron.com",
        "to": "trading-floor@enron.com",
        "file": "ops/_sent_mail/42."
    },
    {
        "message_id": "<20010513.120000.random@enron>",
        "date": "2001-05-13 12:00:00Z",
        "subject": "Lunch plans",
        "body": "Anyone up for sushi near the office?",
        "from": "hr@enron.com",
        "to": "all@enron.com",
        "file": "hr/_sent_mail/5."
    },
    {
        "message_id": "<20010514.230000.ops2@enron>",
        "date": "2001-05-14 23:00:00Z",
        "subject": "Settlement check: balancing & reserve",
        "body": "Check settlement diffs on reserve and balancing; potential overcharge flagged.",
        "from": "ops2@enron.com",
        "to": "settlements@enron.com",
        "file": "ops/_sent_mail/55."
    },
])

core = normalize_emails(enron_df)
print("Loaded emails:", len(core))
core.head()


Loaded emails: 6


Unnamed: 0,email_id,email_ts,subject,body,from,to,file
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00+00:00,CAISO congestion and price cap discussion,"Team, CAISO congestion charges may spike; disc...",allen-p@enron.com,energy-team@enron.com,allen-p/_sent_mail/1.
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00+00:00,Ancillary services bids and balancing market,Please review ancillary bids vs balancing requ...,allen-p@enron.com,grid-ops@enron.com,allen-p/_sent_mail/10.
2,<20010515.083000.iris@enron>,2001-05-15 08:30:00+00:00,Press talking points for analysts,Keep messaging strong on capacity; avoid menti...,ir@enron.com,exec-staff@enron.com,ir/_sent_mail/77.
3,<20010515.081200.ops@enron>,2001-05-15 08:12:00+00:00,Outage schedule and Ricochet routing,Routing via out-of-state tie may trigger ricoc...,ops@enron.com,trading-floor@enron.com,ops/_sent_mail/42.
4,<20010513.120000.random@enron>,2001-05-13 12:00:00+00:00,Lunch plans,Anyone up for sushi near the office?,hr@enron.com,all@enron.com,hr/_sent_mail/5.


In [7]:
seed_terms = sorted(ELECTRICITY_LEXICON)
print('Seed terms (subset):', seed_terms[:10], '... (total', len(seed_terms), ')')

results = bm25_with_prf(core, seed_terms, top_k=3, expand_n=6, k1=1.5, b=0.75)
print('\nExpanded query terms:', results['expanded_query'])

display_cols = ['email_id','email_ts','from','to','subject','bm25_score','rank_before_prf','rank_after_prf']
results['final_ranking'][display_cols].head(10)


Seed terms (subset): ['ahead', 'ancillary', 'balancing', 'bid', 'boy', 'caiso', 'cap', 'congestion', 'counterflow', 'day'] ... (total 27 )

Expanded query terms: ['ahead', 'ancillary', 'balancing', 'bid', 'boy', 'caiso', 'cap', 'congestion', 'counterflow', 'day', 'death', 'fat', 'iso', 'lmp', 'load', 'megawatt', 'outage', 'overcharge', 'price', 'real', 'reserve', 'ricochet', 'rto', 'schedule', 'settlement', 'star', 'time', 'check', 'routing', 'charges', 'diffs', 'discuss', 'discussion']


Unnamed: 0,email_id,email_ts,from,to,subject,bm25_score,rank_before_prf,rank_after_prf
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00+00:00,allen-p@enron.com,energy-team@enron.com,CAISO congestion and price cap discussion,10.985069,1,1
1,<20010514.230000.ops2@enron>,2001-05-14 23:00:00+00:00,ops2@enron.com,settlements@enron.com,Settlement check: balancing & reserve,8.183344,2,2
2,<20010515.081200.ops@enron>,2001-05-15 08:12:00+00:00,ops@enron.com,trading-floor@enron.com,Outage schedule and Ricochet routing,5.3009,3,3
3,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00+00:00,allen-p@enron.com,grid-ops@enron.com,Ancillary services bids and balancing market,3.026754,4,4
4,<20010515.083000.iris@enron>,2001-05-15 08:30:00+00:00,ir@enron.com,exec-staff@enron.com,Press talking points for analysts,0.0,5,5
5,<20010513.120000.random@enron>,2001-05-13 12:00:00+00:00,hr@enron.com,all@enron.com,Lunch plans,0.0,5,5


### Notes
- BM25 uses TF, document length normalization, and IDF.
- PRF expands the query using the top‑k docs from the initial rank (frequency‑based here).
- For stronger precision, swap frequency for TF‑IDF within the PRF selector or manually whitelist expansion terms.
- Extend `ELECTRICITY_LEXICON` with plant names, CAISO acronyms, file artifacts (e.g., `.xls`, `.csv`).


In [10]:
# --- Export BM25-based text_score for Bayes fusion ---

final_rank = results['final_ranking'].copy()

bm25_vals = final_rank['bm25_score'].astype(float).values
bm25_min, bm25_max = float(np.min(bm25_vals)), float(np.max(bm25_vals))
rng = bm25_max - bm25_min if bm25_max > bm25_min else 1.0   # guard against zero-variance
final_rank['text_score'] = (final_rank['bm25_score'] - bm25_min) / rng

def _lex_hits(row):
    txt = f"{row.get('subject','')} {row.get('body','')}".lower()
    return sum(1 for w in ELECTRICITY_LEXICON if w in txt)

final_rank['lexicon_hits'] = final_rank.apply(_lex_hits, axis=1)

text_scores = final_rank[['email_id', 'text_score', 'bm25_score', 'lexicon_hits']].copy()
display(text_scores.head(10))

out_path = "scores/text_scores.csv"
text_scores.to_csv(out_path, index=False)
print("Saved text scores to:", out_path)

def attach_text_scores(candidates_df: pd.DataFrame) -> pd.DataFrame:
    """
    Left-join BM25 text_score into your existing candidates/anomaly dataframe.
    Expects a column 'email_id' in candidates_df.
    """
    return candidates_df.merge(text_scores, on='email_id', how='left')


Unnamed: 0,email_id,text_score,bm25_score,lexicon_hits
0,<18782981.1075855378110.JavaMail.evans@thyme>,1.0,10.985069,7
1,<20010514.230000.ops2@enron>,0.744952,8.183344,4
2,<20010515.081200.ops@enron>,0.482555,5.3009,4
3,<15464986.1075855378456.JavaMail.evans@thyme>,0.275533,3.026754,4
4,<20010515.083000.iris@enron>,0.0,0.0,1
5,<20010513.120000.random@enron>,0.0,0.0,0


Saved text scores to: scores/text_scores.csv
