In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Load your data
# --------------------------
# Replace 'claims.csv' with your file path
df = pd.read_csv('claims.csv')

# Merge notes.  Missing values become "" and both columns are cast to str so
# that a column parsed as numeric cannot break the string concatenation.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str) + " " +
    df["free_flow_opt_note"].fillna("").astype(str)
)

# --------------------------
# Cleaning
# --------------------------
def clean_text(text):
    """Normalize one note for vectorization.

    The input is stringified and lower-cased, every character other than
    a-z, 0-9 or whitespace becomes a space, and whitespace runs are
    collapsed to a single space with the ends trimmed.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Normalize every merged note element-wise.
df["clean_text"] = df["merged_text"].map(clean_text)

df.head()

In [None]:
# --------------------------
# Vectorize & cluster
# --------------------------
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, stop_words="english")
X = vectorizer.fit_transform(df["clean_text"])

# Choose cluster count (tweak as needed)
n_clusters = 8
# n_init is pinned so the number of restarts (and therefore the clustering)
# does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

df['cluster'].value_counts().sort_index()

In [None]:
# --------------------------
# Top terms per cluster
# --------------------------
import numpy as np

def top_terms_for_cluster(cluster_id, top_n=15):
    """Return the ``top_n`` (term, mean TF-IDF) pairs of one cluster.

    Reads the module-level ``df`` (``cluster`` column), the TF-IDF matrix
    ``X`` and the fitted ``vectorizer``.  Rows of ``X`` are selected by
    position rather than by index label, so the result is also correct when
    ``df`` does not carry a default 0..n-1 index.

    Returns an empty list when no row belongs to ``cluster_id``.
    """
    # Positional row numbers of the cluster members.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# One header line per cluster, followed by its ranked terms and scores.
for c in range(n_clusters):
    print(f"\nCluster {c} (size {len(df[df['cluster']==c])}):")
    ranked = top_terms_for_cluster(c)
    for pair in ranked:
        term, score = pair
        print(f"{term:20s} {score:.4f}")

In [None]:
# --------------------------
# Add evidence column: matched top terms from each note's cluster
# --------------------------
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` found in ``text`` on word boundaries.

    The result keeps the order of ``terms``.
    """
    return [
        candidate
        for candidate in terms
        if re.search(r'\b' + re.escape(candidate) + r'\b', text) is not None
    ]

# Build dictionary of cluster -> top terms
cluster_top_terms = {}
for c in range(n_clusters):
    cluster_top_terms[c] = [t for t, _ in top_terms_for_cluster(c)]

# Create column with matches: each note is checked against the top terms of
# the cluster it was assigned to.
df["matched_terms"] = [
    "; ".join(find_matches_in_text(note, cluster_top_terms[cluster]))
    for note, cluster in zip(df["clean_text"], df["cluster"])
]

df[["merged_text", "cluster", "matched_terms"]].head()

In [None]:
# --------------------------
# Show sample notes per cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
# Print the first few raw notes of every cluster for a quick eyeball check.
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print("-", note)

In [None]:
# --------------------------
# Interactive cluster tagging
# --------------------------
cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence the reviewer needs before asking for a label.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    answer = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = answer.strip().lower()

# Assign labels back to df
df['pattern_label'] = df['cluster'].map(cluster_labels)

# Save labeled dataset
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")

In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Normalize a note: drop IDs, numbers, punctuation and known codes.

    Steps, in order: lower-case; blank out ID-like tokens (up to three
    letters followed by three or more digits); blank out standalone 2-4
    digit numbers; replace every remaining non-letter with a space; drop
    the known meaningless codes; collapse whitespace.

    The code filter runs after punctuation is stripped so that tokens such
    as "drg," or "etc." are recognised and removed as well.
    """
    text = text.lower()
    # Remove IDs like n19878, abc123 (letters+digits combos)
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)
    # Remove years/numbers
    text = re.sub(r'\b\d{2,4}\b', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Remove known meaningless codes (expand this list as you see them)
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}
    # split()/join also collapses whitespace runs and trims the ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# Both columns are cast to str so a numeric column cannot break the
# concatenation or clean_text's .lower() call.
df['merged_text'] = (
    df['communication_notes'].fillna('').astype(str) + ' ' +
    df['free_flow_opt_note'].fillna('').astype(str)
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,
    min_df=5,
    ngram_range=(1, 2)  # single words + 2-word phrases
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned so results do not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return up to ``top_n`` (term, mean TF-IDF) pairs scoring >= ``min_score``.

    Reads the module-level ``df`` (``cluster`` column), the TF-IDF matrix
    ``X`` and the fitted ``vectorizer``.  Rows of ``X`` are selected by
    position rather than by index label, so a non-default ``df`` index
    cannot pick the wrong rows.

    Returns an empty list when no row belongs to ``cluster_id``.
    """
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    results = []
    for i in mean_tfidf.argsort()[::-1]:
        score = mean_tfidf[i]
        if score < min_score:
            # Scores are visited in descending order: nothing later qualifies.
            break
        results.append((terms[i], score))
        if len(results) >= top_n:
            break
    return results

# --------------------------
# Display results
# --------------------------
# Separator, cluster size, then the ranked terms with their scores.
for c in range(n_clusters):
    print(f"\n{'='*50}\nCluster {c} (size {len(df[df['cluster']==c])})")
    ranked = top_terms_for_cluster(c)
    for pair in ranked:
        term, score = pair
        print(f"{term:25s} {score:.4f}")


In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Normalize a note: drop IDs, numbers, punctuation and known codes.

    The known-code filter runs after non-letters are stripped, so codes
    followed by punctuation ("drg,", "etc.") are removed too.
    """
    text = text.lower()
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)   # IDs
    text = re.sub(r'\b\d{2,4}\b', ' ', text)            # years/numbers
    text = re.sub(r'[^a-z\s]', ' ', text)               # non-letters
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}      # expand this
    # split()/join also collapses whitespace runs and trims the ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# Both columns are cast to str so a numeric column cannot break the
# concatenation or clean_text's .lower() call.
df['merged_text'] = (
    df['communication_notes'].fillna('').astype(str) + ' ' +
    df['free_flow_opt_note'].fillna('').astype(str)
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,
    min_df=5,
    ngram_range=(1, 2)
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned so results do not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return up to ``top_n`` (term, mean TF-IDF) pairs scoring >= ``min_score``.

    Uses the module-level ``df``, ``X`` and ``vectorizer``.  Cluster rows
    are located by position so that ``X`` is never indexed with ``df``
    index labels.  Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    results = []
    for i in mean_tfidf.argsort()[::-1]:
        score = mean_tfidf[i]
        if score < min_score:
            # Descending order: every remaining score is below the cutoff.
            break
        results.append((terms[i], score))
        if len(results) >= top_n:
            break
    return results

# --------------------------
# Add evidence column
# --------------------------
def find_matches_in_text(text, terms):
    """Return, in input order, the ``terms`` that occur in ``text`` on word boundaries."""
    def _present(term):
        return re.search(r'\b' + re.escape(term) + r'\b', text) is not None

    return [term for term in terms if _present(term)]

cluster_top_terms = {}
for c in range(n_clusters):
    cluster_top_terms[c] = [t for t, _ in top_terms_for_cluster(c)]
# Each note is matched against the top terms of its own cluster.
df["matched_terms"] = [
    "; ".join(find_matches_in_text(note, cluster_top_terms[cluster]))
    for note, cluster in zip(df["merged_text"], df["cluster"])
]

# --------------------------
# Show sample notes per cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print("-", note)

# --------------------------
# Interactive cluster tagging
# --------------------------
cluster_labels = {}
for c in range(n_clusters):
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    answer = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = answer.strip().lower()

df['pattern_label'] = df['cluster'].map(cluster_labels)
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# ============================
# 1. Imports
# ============================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text

# ============================
# 2. Load your data
# ============================
# df = pd.read_csv("your_claims.csv")  # Replace with your file
# Example columns: communication_notes, free_flow_opt_note
# Both note columns as strings (missing -> ""), joined by a single space.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str)
    .add(" ")
    .add(df["free_flow_opt_note"].fillna("").astype(str))
)

# ============================
# 3. Clean text
# ============================
def clean_text(txt):
    """Lower-case ``txt``, drop the word 'nan' and punctuation, squeeze whitespace."""
    cleaned = str(txt).lower()
    substitutions = (
        r"\bnan\b",         # remove 'nan'
        r"[^a-z0-9\s]",     # remove punctuation
        r"\s+",             # collapse whitespace runs
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

df["clean_text"] = df["merged_text"].apply(clean_text)

# ============================
# 4. TF-IDF with bigrams & trigrams
# ============================
# Keep default stopwords but allow important domain words.
# TfidfVectorizer accepts only 'english', a list or None for stop_words, so
# the frozenset difference is converted to a (sorted, deterministic) list.
custom_stopwords = sorted(text.ENGLISH_STOP_WORDS - {
    "letter", "request", "sent", "medical", "record"
})

vectorizer = TfidfVectorizer(
    stop_words=custom_stopwords,
    ngram_range=(1, 3),        # unigrams, bigrams, trigrams
    min_df=5                   # must appear in at least 5 docs
)

X = vectorizer.fit_transform(df["clean_text"])

# ============================
# 5. KMeans clustering
# ============================
n_clusters = 5  # You can adjust this
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ============================
# 6. Function: Top phrases per cluster
# ============================
def top_terms_for_cluster(cluster_id, top_n=20):
    """Return the ``top_n`` (term, mean TF-IDF) pairs of one cluster.

    Uses the module-level ``df``, ``X`` and ``vectorizer``.  Cluster rows
    are located by position, so ``X`` is never indexed with ``df`` index
    labels.  Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# Show top terms per cluster
# Header with the cluster size, then its ranked phrases and scores.
for c in range(n_clusters):
    print(f"\n=== Cluster {c} (size {len(df[df['cluster'] == c])}) ===")
    ranked = top_terms_for_cluster(c)
    for pair in ranked:
        term, score = pair
        print(f"{term:40s} {score:.4f}")

# ============================
# 7. Evidence phrase extraction
# ============================
def find_matches_in_text(text, terms):
    """Return, in input order, every term that appears in ``text`` as a full phrase."""
    # Match full term as phrase (word boundary on both sides).
    return [
        phrase
        for phrase in terms
        if re.search(r"\b" + re.escape(phrase) + r"\b", text) is not None
    ]

# Dictionary: cluster → top phrases
cluster_top_terms = {
    c: [t for t, _ in top_terms_for_cluster(c)]
    for c in range(n_clusters)
}

# Create column with matched phrases
df["matched_terms"] = df.apply(
    lambda row: "; ".join(find_matches_in_text(row["clean_text"], cluster_top_terms[row["cluster"]])),
    axis=1
)

# ============================
# 8. Sample viewer per cluster
# ============================
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    print("Top phrases:", ", ".join(cluster_top_terms[c]))
    cluster_notes = df.loc[df['cluster'] == c, 'merged_text']
    # Seeded so that re-running the notebook shows the same sample notes.
    sample_notes = cluster_notes.sample(
        min(SAMPLES_PER_CLUSTER, len(cluster_notes)),
        random_state=42
    )
    for note in sample_notes:
        print("-", note)

# ============================
# 9. Interactive labeling
# ============================
cluster_labels = {}
for c in range(n_clusters):
    print(f"\nCluster {c} top phrases: {', '.join(cluster_top_terms[c][:10])}")
    sample_notes = df[df['cluster'] == c]['merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ").strip().lower()
    cluster_labels[c] = label

df['pattern_label'] = df['cluster'].map(cluster_labels)

# ============================
# 10. Save
# ============================
df.to_csv("claims_cluster_tagged.csv", index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# ============================
# 1. Imports
# ============================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text

# ============================
# 2. Load your data
# ============================
# df = pd.read_csv("your_claims.csv")
# Both note columns as strings (missing -> ""), joined by a single space.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str)
    .add(" ")
    .add(df["free_flow_opt_note"].fillna("").astype(str))
)

# ============================
# 3. Clean text
# ============================
def clean_text(txt):
    """Lower-case ``txt``, drop the word 'nan' and punctuation, squeeze whitespace."""
    cleaned = str(txt).lower()
    substitutions = (
        r"\bnan\b",         # remove 'nan'
        r"[^a-z0-9\s]",     # remove punctuation
        r"\s+",             # collapse whitespace runs
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

df["clean_text"] = df["merged_text"].apply(clean_text)

# ============================
# 4. TF-IDF with bigrams/trigrams
# ============================
# Domain words and negations are kept out of the stop list: without
# "no" / "nothing" / "without", phrases such as "no findings" could never
# show up among the cluster terms.
# TfidfVectorizer accepts only 'english', a list or None for stop_words, so
# the frozenset difference is converted to a (sorted, deterministic) list.
custom_stopwords = sorted(text.ENGLISH_STOP_WORDS - {
    "letter", "request", "sent", "medical", "record",
    "no", "not", "nothing", "without"
})

vectorizer = TfidfVectorizer(
    stop_words=custom_stopwords,
    ngram_range=(1, 3),        # unigrams, bigrams, trigrams
    min_df=5
)

X = vectorizer.fit_transform(df["clean_text"])

# ============================
# 5. KMeans clustering
# ============================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ============================
# 6. Top phrases per cluster
# ============================
def top_terms_for_cluster(cluster_id, top_n=20):
    """Return the ``top_n`` (term, mean TF-IDF) pairs of one cluster.

    Uses the module-level ``df``, ``X`` and ``vectorizer``.  Cluster rows
    are located by position, so ``X`` is never indexed with ``df`` index
    labels.  Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# Cluster id -> list of its top phrases (scores dropped).
cluster_top_terms = {}
for c in range(n_clusters):
    cluster_top_terms[c] = [t for t, _ in top_terms_for_cluster(c)]

# ============================
# 7. Evidence phrase extraction
# ============================
def find_matches_in_text(text, terms):
    """Collect, in input order, the terms found in ``text`` on word boundaries."""
    found = []
    for term in terms:
        pattern = r"\b" + re.escape(term) + r"\b"
        if re.search(pattern, text):
            found.append(term)
    return found

# Each note is matched against the top phrases of its own cluster.
df["matched_terms"] = [
    "; ".join(find_matches_in_text(note, cluster_top_terms[cluster]))
    for note, cluster in zip(df["clean_text"], df["cluster"])
]

# ============================
# 8. Auto-suggest cluster labels
# ============================
FINDING_KEYWORDS = [
    "overpayment", "offset", "uncheck", "refund", "rpa",
    "recoupment", "dispute", "recover", "repayment"
]
NOFINDING_KEYWORDS = [
    "no findings", "no issue", "valid", "compliant",
    "nothing found", "no error"
]
CANCELLED_KEYWORDS = [
    "cancelled", "withdrawn", "closed without action",
    "claim withdrawn"
]

def _phrase_has_keyword(phrase, keywords):
    """True if any keyword starts at a word boundary inside ``phrase``."""
    return any(re.search(r"\b" + re.escape(kw), phrase) for kw in keywords)

def auto_label_cluster(phrases):
    """Suggest a label for a cluster from its top phrases.

    Each phrase is checked on its own, and a keyword must start at a word
    boundary.  Inflected forms still match ("refunded", "recovered"), but a
    keyword buried inside another word does not ("valid" in "invalid",
    "repayment" in "prepayment"), and keywords are no longer matched across
    two neighbouring phrases.

    Priority is finding > nofinding > cancelled; "unknown" when nothing
    matches.
    """
    lowered = [p.lower() for p in phrases]
    rules = (
        ("finding", FINDING_KEYWORDS),
        ("nofinding", NOFINDING_KEYWORDS),
        ("cancelled", CANCELLED_KEYWORDS),
    )
    for label, keywords in rules:
        if any(_phrase_has_keyword(p, keywords) for p in lowered):
            return label
    return "unknown"

cluster_labels = {}
for c in range(n_clusters):
    top_phrases = cluster_top_terms[c]
    suggestion = auto_label_cluster(top_phrases)
    print(f"\n=== Cluster {c} ===")
    print("Top phrases:", ", ".join(top_phrases))
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    label = input(f"Suggested label: {suggestion} — Press Enter to accept or type new label: ").strip().lower()
    # An empty answer accepts the suggestion.
    cluster_labels[c] = label or suggestion

df['pattern_label'] = df['cluster'].map(cluster_labels)

# ============================
# 9. Save
# ============================
df.to_csv("claims_cluster_tagged.csv", index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# =====================================
# 1. Imports
# =====================================
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =====================================
# 2. Load your dataset
# =====================================
# df = pd.read_csv("your_claims_data.csv")  # Replace with your file path

# Merge the relevant text columns into one
# Both columns are cast to str so a numeric column cannot break the
# concatenation or the .str accessor used below.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str) + " " +
    df["free_flow_opt_note"].fillna("").astype(str)
)

# Basic cleaning: lowercase, remove special chars
df["clean_text"] = (
    df["merged_text"]
    .str.lower()
    .str.replace(r"[^a-z0-9\s]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# =====================================
# 3. Remove stopwords, codes, numbers
# =====================================
REMOVE_TERMS = {"md", "drg", "hrp", "ovptrxid", "nan"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_and_filter_tokens(text):
    """Drop unwanted whitespace-separated tokens from ``text``.

    A token is removed when its lower-case form is in ``REMOVE_TERMS``, when
    it consists only of digits, or when it is shorter than ``MIN_WORD_LEN``
    and not in ``WHITELIST``.  Surviving tokens are re-joined with single
    spaces.
    """
    def _keep(token):
        lowered = token.lower()
        if lowered in REMOVE_TERMS or token.isdigit():
            return False
        return len(token) >= MIN_WORD_LEN or lowered in WHITELIST

    return " ".join(filter(_keep, text.split()))

df["filtered_text"] = df["clean_text"].map(clean_and_filter_tokens)

# =====================================
# 4. TF-IDF Vectorization
# =====================================
# Tokens are runs of two or more letters; 1- to 3-grams seen in at least
# five documents are kept.
vectorizer = TfidfVectorizer(
    token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b',
    ngram_range=(1, 3),  # unigrams, bigrams, trigrams
    min_df=5,            # drop rare terms
    stop_words="english",
)
X = vectorizer.fit_transform(df["filtered_text"])

# =====================================
# 5. Clustering
# =====================================
n_clusters = 5  # You can adjust
km = KMeans(n_init=10, random_state=42, n_clusters=n_clusters)
df["cluster"] = km.fit_predict(X)

# =====================================
# 6. Function to get top terms per cluster
# =====================================
def top_terms_for_cluster(cluster_id, top_n=15):
    """Return the ``top_n`` (term, mean TF-IDF) pairs of one cluster.

    Uses the module-level ``df``, ``X`` and ``vectorizer``.  Cluster rows
    are located by position, so ``X`` is never indexed with ``df`` index
    labels.  Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# =====================================
# 7. Print top terms
# =====================================
# Header with the cluster size, then its ranked terms and scores.
for c in range(n_clusters):
    print(f"\nCluster {c} (size {len(df[df['cluster']==c])}):")
    ranked = top_terms_for_cluster(c)
    for pair in ranked:
        term, score = pair
        print(f"{term:40s} {score:.4f}")

# =====================================
# 8. Add evidence_terms column
# =====================================
def find_matches_in_text(text, terms):
    """Return, in input order, the ``terms`` that occur in ``text`` on word boundaries."""
    return [
        candidate
        for candidate in terms
        if re.search(r'\b' + re.escape(candidate) + r'\b', text) is not None
    ]

cluster_top_terms = {}
for c in range(n_clusters):
    cluster_top_terms[c] = [t for t, _ in top_terms_for_cluster(c)]

# Each note is matched against the top terms of its own cluster.
df["evidence_terms"] = [
    "; ".join(find_matches_in_text(note, cluster_top_terms[cluster]))
    for note, cluster in zip(df["filtered_text"], df["cluster"])
]

# =====================================
# 9. Show sample notes per cluster
# =====================================
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print("-", note)

# =====================================
# 10. Interactive cluster tagging
# =====================================
cluster_labels = {}
for c in range(n_clusters):
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:40s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    answer = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = answer.strip().lower()

df['pattern_label'] = df['cluster'].map(cluster_labels)

# =====================================
# 11. Save final labeled dataset
# =====================================
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load data
# df = pd.read_csv("claims_data.csv")
# Both columns are cast to str so a numeric column cannot break the
# concatenation or the later text.lower() call.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str) + " " +
    df["free_flow_opt_note"].fillna("").astype(str)
)

# === Step 1: Clean and remove generic terms ===
REMOVE_TERMS = {"md", "drg", "hrp", "nan"}
MIN_WORD_LEN = 3
# "no" is whitelisted so the short-word filter keeps negations; otherwise
# "no findings" / "no overpayment" collapse to "findings" / "overpayment".
WHITELIST = {"rpa", "offset", "no"}

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    A token is dropped when it is in ``REMOVE_TERMS`` or when it is shorter
    than ``MIN_WORD_LEN`` and not in ``WHITELIST``.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # remove numbers/special chars
    tokens = []
    for tok in text.split():
        if tok in REMOVE_TERMS:
            continue
        if len(tok) < MIN_WORD_LEN and tok not in WHITELIST:
            continue
        tokens.append(tok)
    return " ".join(tokens)

df["filtered_text"] = df["merged_text"].apply(clean_text)

# === Step 2: Vectorize with n-grams ===
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

vectorizer = TfidfVectorizer(
    # English stop words minus "no", so negated phrases survive as n-grams.
    # stop_words must be a list (not a set/frozenset).
    stop_words=sorted(ENGLISH_STOP_WORDS - {"no"}),
    ngram_range=(1, 3),
    min_df=5
)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# === Step 3: Cluster ===
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# === Step 4: Top terms per cluster (remove overlaps) ===
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer phrase.

    Containment is tested on whole words (a contiguous run of tokens), so
    "pay" is not treated as part of "overpayment".  The surviving phrases
    keep their input order, which preserves any ranking the caller
    established.
    """
    def _contains(longer, shorter):
        width = len(shorter)
        return any(
            longer[start:start + width] == shorter
            for start in range(len(longer) - width + 1)
        )

    tokenized = [tuple(p.split()) for p in phrases]
    kept = []
    for phrase, toks in zip(phrases, tokenized):
        covered = any(
            len(other) > len(toks) and _contains(other, toks)
            for other in tokenized
        )
        if not covered:
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases of cluster ``c``.

    Takes the ``top_n`` terms with the highest mean TF-IDF among the rows of
    cluster ``c`` and passes them through ``remove_redundant_phrases``.
    Uses the module-level ``df``, ``X`` and ``terms``; cluster rows are
    located by position, so ``X`` is never indexed with ``df`` index labels.
    Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == c).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# === Step 5: Evidence column ===
def find_matches(text, phrases):
    """Return, in input order, the ``phrases`` that occur in ``text`` on word boundaries."""
    return [
        candidate
        for candidate in phrases
        if re.search(r'\b' + re.escape(candidate) + r'\b', text) is not None
    ]

# One list of matched phrases per claim, checked against its own cluster.
df["evidence_terms"] = df.apply(
    lambda claim: find_matches(
        claim["filtered_text"],
        cluster_terms[claim["cluster"]],
    ),
    axis=1,
)

# === Step 6: Rule-based label suggestion ===
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from matched evidence phrases and the overpayment amount.

    Rules, first match wins:
      1. any cancelled phrase in ``evidence``            -> "cancelled"
      2. any finding phrase in ``evidence``              -> "finding"
      3. any no-finding phrase and ``amount == 0``       -> "nofinding"
      4. ``amount > 0``                                  -> "finding"
      5. otherwise                                       -> "unknown"
    """
    seen = set(evidence)
    if not seen.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not seen.isdisjoint(FINDING_PHRASES):
        return "finding"
    if not seen.isdisjoint(NOFINDING_PHRASES) and amount == 0:
        return "nofinding"
    return "finding" if amount > 0 else "unknown"

# Rule-based label per claim; the amount defaults to 0 when the column is absent.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(
        r["evidence_terms"],
        r.get("overpayment_amount", 0),
    ),
    axis=1,
)

# === Step 7: Save results ===
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# === Debug: Show clusters ===
for c in range(n_clusters):
    phrases = cluster_terms[c]
    print(f"\nCluster {c}: {phrases}")


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =========================
# Load & merge text columns
# =========================
# df = pd.read_csv("claims_data.csv")
# Both columns are cast to str so a numeric column cannot break the
# concatenation or the later text.lower() call.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str) + " " +
    df["free_flow_opt_note"].fillna("").astype(str)
)

# =========================
# Step 1: Clean text
# =========================
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
MIN_WORD_LEN = 3
# Short words you still want.  "no" is included so the short-word filter
# keeps negations ("no findings", "no overpayment").
WHITELIST = {"rpa", "offset", "no"}

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    A token is dropped when it is in ``REMOVE_TERMS`` or when it is shorter
    than ``MIN_WORD_LEN`` and not in ``WHITELIST``.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # keep only letters
    tokens = []
    for tok in text.split():
        if tok in REMOVE_TERMS:
            continue
        if len(tok) < MIN_WORD_LEN and tok not in WHITELIST:
            continue
        tokens.append(tok)
    return " ".join(tokens)

df["filtered_text"] = df["merged_text"].apply(clean_text)

# =========================
# Step 2: TF-IDF Vectorizer (1-3 grams)
# =========================
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

vectorizer = TfidfVectorizer(
    # English stop words minus "no", so negated phrases survive as n-grams.
    # stop_words must be a list (not a set/frozenset).
    stop_words=sorted(ENGLISH_STOP_WORDS - {"no"}),
    ngram_range=(1, 3),
    min_df=5
)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# =========================
# Step 3: KMeans clustering
# =========================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# =========================
# Step 4: Top terms per cluster (remove overlaps)
# =========================
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer phrase.

    Containment is tested on whole words (a contiguous run of tokens), so
    "pay" is not treated as part of "overpayment".  The surviving phrases
    keep their input order, which preserves any ranking the caller
    established.
    """
    def _contains(longer, shorter):
        width = len(shorter)
        return any(
            longer[start:start + width] == shorter
            for start in range(len(longer) - width + 1)
        )

    tokenized = [tuple(p.split()) for p in phrases]
    kept = []
    for phrase, toks in zip(phrases, tokenized):
        covered = any(
            len(other) > len(toks) and _contains(other, toks)
            for other in tokenized
        )
        if not covered:
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases of cluster ``c``.

    Takes the ``top_n`` terms with the highest mean TF-IDF among the rows of
    cluster ``c`` and passes them through ``remove_redundant_phrases``.
    Uses the module-level ``df``, ``X`` and ``terms``; cluster rows are
    located by position, so ``X`` is never indexed with ``df`` index labels.
    Returns ``[]`` for a cluster without rows.
    """
    row_pos = np.flatnonzero((df["cluster"] == c).to_numpy())
    if row_pos.size == 0:
        return []
    # np.asarray(...).ravel() flattens both np.matrix and ndarray results.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# =========================
# Step 5: Evidence terms per claim
# =========================
def find_matches(text, phrases):
    """Return, in input order, the ``phrases`` that occur in ``text`` on word boundaries."""
    def _present(phrase):
        return re.search(r'\b' + re.escape(phrase) + r'\b', text) is not None

    return [phrase for phrase in phrases if _present(phrase)]

# One list of matched phrases per claim, checked against its own cluster.
df["evidence_terms"] = df.apply(
    lambda claim: find_matches(
        claim["filtered_text"],
        cluster_terms[claim["cluster"]],
    ),
    axis=1,
)

# =========================
# Step 6: Rule-based label suggestion
# =========================
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from matched evidence phrases and the overpayment amount.

    Rules, first match wins:
      1. any cancelled phrase in ``evidence``            -> "cancelled"
      2. any finding phrase in ``evidence``              -> "finding"
      3. any no-finding phrase and ``amount == 0``       -> "nofinding"
      4. ``amount > 0``                                  -> "finding"
      5. otherwise                                       -> "unknown"
    """
    seen = set(evidence)
    if not seen.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not seen.isdisjoint(FINDING_PHRASES):
        return "finding"
    if not seen.isdisjoint(NOFINDING_PHRASES) and amount == 0:
        return "nofinding"
    return "finding" if amount > 0 else "unknown"

# Rule-based label per claim; the amount defaults to 0 when the column is absent.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(
        r["evidence_terms"],
        r.get("overpayment_amount", 0),
    ),
    axis=1,
)

# =========================
# Step 7: Save results
# =========================
df.to_csv("claims_with_labels.csv", index=False)
print("✅ Saved to claims_with_labels.csv")

# =========================
# Step 8: Show cluster summaries
# =========================
SAMPLES_PER_CLUSTER = 5

for c in range(n_clusters):
    cluster_df = df.loc[df['cluster'] == c]
    print("="*80)
    print(f"📌 Cluster {c} — size: {len(cluster_df)}")
    print("Top 20 terms/phrases:")
    for term in cluster_terms[c]:
        print(f"  {term}")

    # Overpayment stats (only when the column is available)
    if "overpayment_amount" in df.columns:
        pos_amount = cluster_df["overpayment_amount"].gt(0).mean() * 100
        print(f"\n💰 % with positive overpayment_amount: {pos_amount:.1f}%")

    print("\nSample claim notes:")
    n_show = min(SAMPLES_PER_CLUSTER, len(cluster_df))
    sample_rows = cluster_df.sample(n_show, random_state=42)
    for note in sample_rows["merged_text"]:
        print(f"- {note}")
    print("="*80)


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =========================
# Load data
# =========================
# df = pd.read_csv("claims_data.csv")
# Both columns are cast to str so a numeric column cannot break the
# concatenation or the later text.lower() call.
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str) + " " +
    df["free_flow_opt_note"].fillna("").astype(str)
)

# =========================
# Step 1: Clean text
# =========================
REMOVE_TERMS = {
    "md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record",
    "chart", "order", "verify", "reported", "please", "note", "per"
}
MIN_WORD_LEN = 3
# Short words you want to keep.  "no" is included so the short-word filter
# keeps negations ("no findings", "no overpayment").
WHITELIST = {"rpa", "offset", "no"}

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    A token is dropped when it is in ``REMOVE_TERMS`` or when it is shorter
    than ``MIN_WORD_LEN`` and not in ``WHITELIST``.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = []
    for tok in text.split():
        if tok in REMOVE_TERMS:
            continue
        if len(tok) < MIN_WORD_LEN and tok not in WHITELIST:
            continue
        tokens.append(tok)
    return " ".join(tokens)

df["filtered_text"] = df["merged_text"].apply(clean_text)

# =========================
# Step 2: TF-IDF Vectorizer (1-3 grams)
# =========================
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

vectorizer = TfidfVectorizer(
    # English stop words minus "no", so negated phrases survive as n-grams.
    # stop_words must be a list (not a set/frozenset).
    stop_words=sorted(ENGLISH_STOP_WORDS - {"no"}),
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.8  # remove phrases appearing in >80% of claims
)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# =========================
# Step 3: KMeans clustering
# =========================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# =========================
# Step 4: Top terms per cluster (remove overlaps)
# =========================
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer phrase.

    Containment is tested on whole words (a contiguous run of tokens), so
    "pay" is not treated as part of "overpayment".  The surviving phrases
    keep their input order, so a caller that passes score-ranked phrases
    and truncates the result keeps the best-scoring ones, not the longest.
    """
    def _contains(longer, shorter):
        width = len(shorter)
        return any(
            longer[start:start + width] == shorter
            for start in range(len(longer) - width + 1)
        )

    tokenized = [tuple(p.split()) for p in phrases]
    kept = []
    for phrase, toks in zip(phrases, tokenized):
        covered = any(
            len(other) > len(toks) and _contains(other, toks)
            for other in tokenized
        )
        if not covered:
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return up to ``top_n`` characteristic phrases for cluster ``c``.

    Phrases are ranked by mean TF-IDF weight over the cluster's rows. Phrases
    containing any ``REMOVE_TERMS`` token are discarded, overlapping phrases
    are collapsed with ``remove_redundant_phrases``, and the survivors are
    returned strongest first.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    top_n : int, default 20
        Maximum number of phrases to return.

    Returns
    -------
    list of str
        Empty when the cluster has no rows.
    """
    # X is addressed by row position, so select positions rather than index
    # labels; the two only coincide when df has a default 0..n-1 index.
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n * 2]  # take more, then filter
    raw_terms = [terms[i] for i in top_idx]
    # remove admin-like meaningless phrases again
    cleaned = [t for t in raw_terms if REMOVE_TERMS.isdisjoint(t.split())]
    # Restore TF-IDF rank order after de-duplication so the slice below keeps
    # the strongest phrases instead of whichever order the helper returns.
    rank = {t: i for i, t in enumerate(cleaned)}
    deduped = sorted(remove_redundant_phrases(cleaned), key=rank.__getitem__)
    return deduped[:top_n]

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# =========================
# Step 5: Evidence terms per claim
# =========================
def find_matches(text, phrases):
    """Return the phrases that occur in ``text`` as whole words.

    Each phrase is regex-escaped and anchored with word boundaries on both
    sides. The result keeps the order of ``phrases``.
    """
    return [
        candidate
        for candidate in phrases
        if re.search(rf"\b{re.escape(candidate)}\b", text)
    ]

# Per claim: the subset of its own cluster's top phrases that literally occur
# in the claim's filtered text (stored as a list per row).
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# =========================
# Step 6: Rule-based label suggestion
# =========================
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
# NOTE(review): clean_text in this cell drops tokens shorter than MIN_WORD_LEN,
# so evidence built from filtered text presumably never contains the "no ..."
# phrases below; confirm against real evidence_terms.
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a coarse outcome label from evidence phrases and the amount.

    Rules are evaluated in priority order: cancelled phrase, finding phrase,
    no-finding phrase with a zero amount, positive amount, else "unknown".
    Phrases must match an evidence entry exactly.

    Parameters
    ----------
    evidence : iterable of str
        Evidence phrases attached to the claim.
    amount : number or None
        Overpayment amount. ``None``/NaN is treated as 0, matching the
        ``fillna(0)`` convention used elsewhere in this notebook.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".
    """
    # A missing amount would otherwise fail both `== 0` and `> 0` (NaN) or
    # raise on comparison (None).
    if pd.isna(amount):
        amount = 0
    ev_set = set(evidence)
    if ev_set & CANCELLED_PHRASES:
        return "cancelled"
    if ev_set & FINDING_PHRASES:
        return "finding"
    if ev_set & NOFINDING_PHRASES and amount == 0:
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Row-wise label suggestion; r.get falls back to 0 when the row has no
# "overpayment_amount" entry.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# =========================
# Step 7: Create final descriptive column for clarity
# =========================
# Comma-joined top phrases of the row's cluster (identical for all rows of a cluster).
df["cluster_top_terms"] = df["cluster"].apply(lambda c: ", ".join(cluster_terms[c]))

# =========================
# Step 8: Save full enriched dataset
# =========================
df.to_csv("claims_with_clusters_and_labels.csv", index=False)
print("✅ Saved to claims_with_clusters_and_labels.csv")

# =========================
# Step 9: Show summaries
# =========================
SAMPLES_PER_CLUSTER = 5

for c in range(n_clusters):
    cluster_df = df[df['cluster'] == c]
    print("="*80)
    print(f"📌 Cluster {c} — size: {len(cluster_df)}")
    # The heading says 20, but the loop prints however many phrases cluster_terms[c] holds.
    print("Top 20 terms/phrases:")
    for term in cluster_terms[c]:
        print(f"  {term}")

    if "overpayment_amount" in df.columns:
        # Share of the cluster's rows whose amount is strictly positive.
        pos_amount = (cluster_df["overpayment_amount"] > 0).mean() * 100
        print(f"\n💰 % with positive overpayment_amount: {pos_amount:.1f}%")

    print("\nSample claim notes:")
    # Seeded sample, capped at the cluster size so small clusters do not raise.
    sample_rows = cluster_df.sample(
        min(SAMPLES_PER_CLUSTER, len(cluster_df)),
        random_state=42
    )
    for note in sample_rows["merged_text"]:
        print(f"- {note}")
    print("="*80)


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# ----------------------------------------
# Load data
# ----------------------------------------
# df = pd.read_csv("claims_data.csv")
# NOTE(review): the read_csv line above is commented out, so this statement relies on
# a `df` that already exists in the kernel.
# Join the two note columns into one text field; missing notes become empty strings.
df["merged_text"] = (
    df["communication_notes"].fillna("") + " " + df["free_flow_opt_note"].fillna("")
)

# ----------------------------------------
# Step 1: Clean and remove generic terms
# ----------------------------------------
# Tokens dropped outright because they carry no signal for clustering.
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Lower-case ``text``, keep letters only, and drop noise tokens.

    Parameters
    ----------
    text : str or None
        Raw note text. ``None`` and float NaN are treated as an empty note;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Space-joined tokens that are not in ``REMOVE_TERMS`` and are either at
        least ``MIN_WORD_LEN`` characters long or listed in ``WHITELIST``.
    """
    # Missing values must not crash on .lower() or leak the token "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # remove numbers/special chars
    return " ".join(
        tok
        for tok in text.split()
        if tok not in REMOVE_TERMS
        and (len(tok) >= MIN_WORD_LEN or tok in WHITELIST)
    )

# Apply the token filter to every merged note.
df["filtered_text"] = df["merged_text"].apply(clean_text)

# ----------------------------------------
# Step 2: Vectorize with n-grams
# ----------------------------------------
# Unigrams to trigrams that occur in at least 5 notes; no max_df cap in this cell.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5
)
# X has one row per row of df, in df's current row order.
X = vectorizer.fit_transform(df["filtered_text"])
# terms[i] is the phrase behind column i of X.
terms = vectorizer.get_feature_names_out()

# ----------------------------------------
# Step 3: Cluster
# ----------------------------------------
n_clusters = 5
# Fixed random_state so repeated runs use the same seed.
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ----------------------------------------
# Step 4: Top terms per cluster (remove overlaps)
# ----------------------------------------
def remove_redundant_phrases(phrases):
    """Drop phrases that are fully contained, word for word, in another phrase.

    Containment is checked on whole tokens, so "pay" is *not* treated as part
    of "overpayment", while "audit" is dropped when "audit completed" is
    present. Exact duplicates do not eliminate each other.

    Parameters
    ----------
    phrases : iterable of str
        Candidate phrases, typically ordered strongest first.

    Returns
    -------
    list of str
        The surviving phrases in their original input order.
    """
    phrases = list(phrases)
    # Pad with spaces so a substring test only matches on token boundaries.
    padded = [f" {' '.join(p.split())} " for p in phrases]
    kept = []
    for phrase, needle in zip(phrases, padded):
        if any(needle in other and needle != other for other in padded):
            continue
        kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases for cluster ``c``.

    The ``top_n`` phrases with the highest mean TF-IDF weight over the
    cluster's rows are taken and then passed through
    ``remove_redundant_phrases``, so fewer than ``top_n`` may be returned.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    top_n : int, default 20
        Number of phrases taken before de-duplication.

    Returns
    -------
    list of str
        Empty when the cluster has no rows.
    """
    # X is addressed by row position, so select positions rather than index
    # labels; the two only coincide when df has a default 0..n-1 index.
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# ----------------------------------------
# Step 5: Evidence column
# ----------------------------------------
def find_matches(text, phrases):
    """Return the phrases that occur in ``text`` as whole words.

    Each phrase is regex-escaped and anchored with word boundaries on both
    sides. The result keeps the order of ``phrases``.
    """
    return [
        candidate
        for candidate in phrases
        if re.search(rf"\b{re.escape(candidate)}\b", text)
    ]

# Per claim: the subset of its own cluster's top phrases that literally occur
# in the claim's filtered text (stored as a list per row).
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# ----------------------------------------
# Step 6: Rule-based label suggestion
# ----------------------------------------
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
# NOTE(review): clean_text in this cell drops tokens shorter than MIN_WORD_LEN,
# so evidence built from filtered text presumably never contains the "no ..."
# phrases below; confirm against real evidence_terms.
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a coarse outcome label from evidence phrases and the amount.

    Rules are evaluated in priority order: cancelled phrase, finding phrase,
    no-finding phrase with a zero amount, positive amount, else "unknown".
    Phrases must match an evidence entry exactly.

    Parameters
    ----------
    evidence : iterable of str
        Evidence phrases attached to the claim.
    amount : number or None
        Overpayment amount. ``None``/NaN is treated as 0.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".
    """
    # A missing amount would otherwise fail both `== 0` and `> 0` (NaN) or
    # raise on comparison (None).
    if pd.isna(amount):
        amount = 0
    ev_set = set(evidence)
    if ev_set & CANCELLED_PHRASES:
        return "cancelled"
    if ev_set & FINDING_PHRASES:
        return "finding"
    if ev_set & NOFINDING_PHRASES and amount == 0:
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Row-wise label suggestion; r.get falls back to 0 when the row has no
# "overpayment_amount" entry.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# ----------------------------------------
# Step 7: Save results
# ----------------------------------------
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# ----------------------------------------
# Step 8: Debug - Show clusters
# ----------------------------------------
for c in range(n_clusters):
    print(f"\nCluster {c}: {cluster_terms[c]}")

# ----------------------------------------
# EXTRA VIEW: Filter top terms by minimum TF-IDF weight
# ----------------------------------------
# Minimum cluster-mean TF-IDF weight a phrase needs to count as "strong".
MIN_TFIDF_WEIGHT = 0.05  # adjust threshold

def top_terms_above_threshold(c, min_weight=MIN_TFIDF_WEIGHT, top_n=20):
    """Return up to ``top_n`` phrases whose cluster-mean TF-IDF is at least ``min_weight``.

    Phrases containing any ``REMOVE_TERMS`` token are discarded, overlapping
    phrases are collapsed with ``remove_redundant_phrases``, and the survivors
    are returned strongest first.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    min_weight : float
        Lower bound on the mean TF-IDF weight within the cluster.
    top_n : int, default 20
        Maximum number of phrases to return.

    Returns
    -------
    list of str
        Empty when the cluster has no rows or nothing clears the threshold.
    """
    # X is addressed by row position, so select positions rather than index
    # labels; the two only coincide when df has a default 0..n-1 index.
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    filtered_idx = np.flatnonzero(mean_tfidf >= min_weight)
    # Order the surviving columns by descending weight.
    sorted_idx = filtered_idx[np.argsort(mean_tfidf[filtered_idx])[::-1]]
    raw_terms = [terms[i] for i in sorted_idx]
    cleaned = [t for t in raw_terms if REMOVE_TERMS.isdisjoint(t.split())]
    # Restore weight order after de-duplication so the slice keeps the
    # strongest phrases regardless of the order the helper returns.
    rank = {t: i for i, t in enumerate(cleaned)}
    strongest_first = sorted(remove_redundant_phrases(cleaned), key=rank.__getitem__)
    return strongest_first[:top_n]

strong_cluster_terms = {c: top_terms_above_threshold(c) for c in range(n_clusters)}

print("\n\n📊 Strong Signal Top Terms per Cluster (TF-IDF >= {:.2f})".format(MIN_TFIDF_WEIGHT))
for c in range(n_clusters):
    print("="*80)
    print(f"Cluster {c}")
    for term in strong_cluster_terms[c]:
        print(f"  {term}")

# ----------------------------------------
# Step 9: Add strong terms column for each row & save
# ----------------------------------------
# Each cell receives the cluster's phrase list itself (a Python list, not a joined string).
df["strong_terms"] = df["cluster"].map(strong_cluster_terms)

# Final export with everything
df.to_csv("claims_with_labels_and_strong_terms.csv", index=False)
print("Saved to claims_with_labels_and_strong_terms.csv")


In [None]:
# Full improved pipeline: clustering + phrase mining + weak-supervision + classifier + evidence
# Requirements: pandas, numpy, scikit-learn
# Optional but recommended: increase min_df if you have extremely large data

import re
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.utils import shuffle

# ---------------------------
# Configuration (tune here)
# ---------------------------
N_CLUSTERS = 5               # clustering (kept for exploration)
TF_MIN_DF = 8                # minimum docs an ngram must appear in
TF_MAX_DF = 0.85             # remove very common phrases
TF_NGRAM_RANGE = (1, 3)      # unigrams..trigrams
TF_MAX_FEATURES = 20000      # limit vocabulary
CHI2_TOP_K = 300             # candidate ngrams to treat as 'finding' seeds
MIN_CHI2_SCORE = 10.0        # filter weak chi2 signals
SEED_MIN_COUNT = 20          # only use ngrams that appear at least this often for seeds
PROB_THRESHOLD = 0.80        # assign final label only when model is confident
RANDOM_STATE = 42

# ---------------------------
# Load your data (edit)
# ---------------------------
# df = pd.read_csv("claims_data.csv")       # uncomment and set path
# For this snippet we assume df exists and has:
# - communication_notes
# - free_flow_opt_note
# - overpayment_amount

# ---------------------------
# Merge + basic cleaning
# ---------------------------
def preprocess_text(s):
    """Normalise one raw note into lower-case, letters-only, single-spaced text.

    Missing values yield an empty string. Substitutions run in a fixed order:
    short letter+digit codes and bare numbers, URLs, e-mail-like tokens, then
    every remaining character that is not a-z or whitespace.
    """
    if s is None or pd.isna(s):
        return ""
    cleaned = str(s).lower()
    for pattern in (
        r'\b[a-z]{0,3}\d{2,}\b',      # up to three letters followed by 2+ digits
        r'https?://\S+|www\.\S+',     # URLs
        r'\S+@\S+',                   # e-mail-like tokens
        r'[^a-z\s]',                  # anything that is not a letter or whitespace
    ):
        cleaned = re.sub(pattern, ' ', cleaned)
    # Collapse runs of whitespace left behind by the substitutions.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Join the two note columns (missing -> "") and store the *preprocessed* result,
# so in this cell merged_text is already lower-cased, letters-only text.
df['merged_text'] = (df['communication_notes'].fillna('') + ' ' +
                     df['free_flow_opt_note'].fillna('')).apply(preprocess_text)

# ---------------------------
# Token-level filtering (remove short noisy tokens, common codes)
# ---------------------------
# Administrative / boilerplate tokens removed before vectorisation.
NOISE_TOKENS = {
    'nan', 'md', 'drg', 'hrp', 'ovptrxid', 'omniclaim', 'claim', 'facility', 'provider', 'sent',
    'request', 'requested', 'please', 'note', 'order', 'chart', 'record', 'verify', 'reported',
}
MIN_TOKEN_LEN = 3
WHITELIST_SHORT = {'rpa', 'offset'}  # keep these short tokens if present

def filter_tokens(text):
    """Drop noise tokens and too-short tokens from whitespace-separated ``text``.

    A token survives when it is not in ``NOISE_TOKENS`` and is either at least
    ``MIN_TOKEN_LEN`` characters long or listed in ``WHITELIST_SHORT``.
    """
    def _keep(token):
        if token in NOISE_TOKENS:
            return False
        return len(token) >= MIN_TOKEN_LEN or token in WHITELIST_SHORT

    return ' '.join(filter(_keep, text.split()))

# Token-filtered version of the preprocessed notes; this is what gets vectorised.
df['filtered_text'] = df['merged_text'].apply(filter_tokens)

# ---------------------------
# Vectorization (TF-IDF for classifier + Count for chi2)
# ---------------------------
tfvec = TfidfVectorizer(stop_words='english',
                        ngram_range=TF_NGRAM_RANGE,
                        min_df=TF_MIN_DF,
                        max_df=TF_MAX_DF,
                        max_features=TF_MAX_FEATURES)

X_tfidf = tfvec.fit_transform(df['filtered_text'])
# feature_names[i] is the phrase behind column i of X_tfidf.
feature_names = np.array(tfvec.get_feature_names_out())

# Also build count matrix for chi2 (chi2 expects counts)
# NOTE(review): with an explicit vocabulary the min_df / max_df / max_features
# arguments presumably have no effect; confirm against the scikit-learn docs.
cntvec = CountVectorizer(stop_words='english',
                         ngram_range=TF_NGRAM_RANGE,
                         min_df=TF_MIN_DF,
                         max_df=TF_MAX_DF,
                         max_features=TF_MAX_FEATURES,
                         vocabulary=tfvec.vocabulary_)  # same vocab
X_count = cntvec.fit_transform(df['filtered_text'])

# ---------------------------
# OPTIONAL: Clustering for exploration
# ---------------------------
km = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
df['cluster'] = km.fit_predict(X_tfidf)

# Compute top phrases per cluster (mean TF-IDF)
def top_terms_for_cluster(cluster_id, top_n=20, min_weight=0.003):
    """Return ``(phrase, mean_tfidf)`` pairs for a cluster, strongest first.

    Columns are walked in descending mean TF-IDF order. The walk stops at the
    first weight below ``min_weight`` or once ``top_n`` pairs are collected.
    Phrases made up solely of ``NOISE_TOKENS`` are skipped. An empty cluster
    yields an empty list.
    """
    member_rows = np.where(df['cluster'] == cluster_id)[0]
    if member_rows.size == 0:
        return []
    weights = X_tfidf[member_rows].mean(axis=0).A1
    picked = []
    for col in np.argsort(weights)[::-1]:
        weight = weights[col]
        if weight < min_weight:
            # Everything after this point is weaker still.
            break
        phrase = feature_names[col]
        only_noise = all(tok in NOISE_TOKENS for tok in phrase.split())
        if only_noise:
            continue
        picked.append((phrase, weight))
        if len(picked) >= top_n:
            break
    return picked

# quick print of cluster top phrases (optional)
for c in range(N_CLUSTERS):
    t = top_terms_for_cluster(c, top_n=10)
    print(f"\nCluster {c} top (sample): {[x[0] for x in t]}")

# ---------------------------
# STEP: Find discriminative ngrams associated with positive amount (proxy for finding)
# ---------------------------
# create a noisy target: has_amount
df['has_amount'] = df['overpayment_amount'].fillna(0) > 0

# run chi2 to find terms correlated with has_amount
chi2_stats, pvals = chi2(X_count, df['has_amount'].astype(int))

# total occurrences of every term in the corpus (column order == feature_names order)
occ = np.asarray(X_count.sum(axis=0)).ravel()

# chi2 only measures strength of association, not its direction: a phrase that
# is typical of claims *without* an amount scores just as high. Keep only
# phrases that are more frequent per claim among has_amount rows.
_has_amount = df['has_amount'].to_numpy()
_pos_rows = np.flatnonzero(_has_amount)
_neg_rows = np.flatnonzero(~_has_amount)
_pos_rate = np.asarray(X_count[_pos_rows].sum(axis=0)).ravel() / max(len(_pos_rows), 1)
_neg_rate = np.asarray(X_count[_neg_rows].sum(axis=0)).ravel() / max(len(_neg_rows), 1)

chi2_df = pd.DataFrame({
    'term': feature_names,
    'chi2': chi2_stats,
    'pval': pvals,
    'count': occ,
    'pos_assoc': _pos_rate > _neg_rate
}).sort_values('chi2', ascending=False)

# filter top candidates: strong chi2, frequent enough, and positively associated
candidate_finding_terms = chi2_df[
    (chi2_df['chi2'] >= MIN_CHI2_SCORE)
    & (chi2_df['count'] >= SEED_MIN_COUNT)
    & chi2_df['pos_assoc']
]['term'].tolist()[:CHI2_TOP_K]

print(f"\n{len(candidate_finding_terms)} candidate 'finding' phrases from chi2 (amount-associated). Example:", candidate_finding_terms[:20])

# ---------------------------
# STEP: Define explicit high-precision patterns for cancelled / nofinding (regex list)
# ---------------------------
CANCEL_PATTERNS = [
    r'\bcancelled\b', r'\bcanceled\b', r'\bvoided\b', r'\bwithdrawn\b', r'\brescinded\b',
    r'\bclosed without action\b', r'\bclose without action\b'
]
NOFIND_PATTERNS = [
    r'\bno (?:finding|findings|overpayment|issue)\b', r'\bno discrepancies\b', r'\bvalid charge\b',
    r'\bpaid correctly\b', r'\bnot an overpayment\b', r'\bno case to answer\b'
]

# One alternation per category, case-insensitive.
CANCEL_RE = re.compile('|'.join(CANCEL_PATTERNS), flags=re.I)
NOFIND_RE = re.compile('|'.join(NOFIND_PATTERNS), flags=re.I)

# also define a small manual list of strong finding seed terms
MANUAL_FINDING_TERMS = [
    'overpayment', 'refund', 'recoup', 'recoupment', 'recover overpayment',
    'amount owed', 'refund request', 'duplicate claim', 'duplicate payment', 'audit finding'
]

# combine manual + chi2 candidates (dedup, manual terms first, order preserved)
FINDING_SEED_TERMS = list(dict.fromkeys(MANUAL_FINDING_TERMS + candidate_finding_terms))

print("\nSome final candidate finding seeds (top 30):", FINDING_SEED_TERMS[:30])

# ---------------------------
# Create weak (seed) labels per row (high precision rules)
# Priority: cancelled > nofinding (explicit negation) > finding
# ---------------------------
def seed_label_for_row(text, raw_text=None):
    """Assign a high-precision seed label to one claim, or ``None``.

    Parameters
    ----------
    text : str
        Token-filtered text; used for the cancel patterns and finding seeds.
    raw_text : str, optional
        Preprocessed but unfiltered text, used for the no-finding patterns.
        Token filtering removes short words such as "no", "an" and "to", which
        most negation patterns need. Falls back to ``text`` when omitted.

    Returns
    -------
    str or None
        'cancelled_seed', 'nofind_seed', 'finding_seed', or ``None`` when the
        text is empty / not a string or no rule fires.
    """
    if not isinstance(text, str) or text.strip() == '':
        return None
    if CANCEL_RE.search(text):
        return 'cancelled_seed'
    # Explicit negations ("no overpayment", "not an overpayment") must win over
    # bare finding keywords such as "overpayment", otherwise they are mislabelled.
    negation_source = raw_text if isinstance(raw_text, str) else text
    if NOFIND_RE.search(negation_source):
        return 'nofind_seed'
    if any(re.search(r'\b' + re.escape(t) + r'\b', text) for t in FINDING_SEED_TERMS):
        return 'finding_seed'
    return None

df['seed_label'] = [
    seed_label_for_row(filtered, raw)
    for filtered, raw in zip(df['filtered_text'], df['merged_text'])
]
print("\nSeed label counts:\n", df['seed_label'].value_counts(dropna=False))

# ---------------------------
# Prepare supervised training set (rows that got seed labels)
# ---------------------------
# Rows that received a seed label, copied so the new column does not touch df.
seed_df = df[df['seed_label'].notnull()].copy()
# map seed names to short classes
seed_map = {'finding_seed':'finding', 'nofind_seed':'nofinding', 'cancelled_seed':'cancelled'}
# y is the training target used by the classifier below.
seed_df['y'] = seed_df['seed_label'].map(seed_map)

# If too few seed rows for any class, you might want to hand-label or relax seeds
print("\nSeed training counts:\n", seed_df['y'].value_counts())

# If we have at least some training rows, train classifier
if seed_df.shape[0] < 50:
    print("WARNING: Very few seed-labeled rows (<50). Consider hand-labeling a small set to bootstrap.")
else:
    # features: TF-IDF + scaled overpayment_amount (numeric)
    # Row *positions* of the seed rows. X_tfidf is addressed by position, so
    # index labels would select the wrong rows if df lacks a default 0..n-1 index.
    seed_idx = np.flatnonzero(df['seed_label'].notnull().to_numpy())
    X_seed_tfidf = X_tfidf[seed_idx]
    # numeric feature
    amount = df['overpayment_amount'].iloc[seed_idx].fillna(0).values.reshape(-1,1)
    scaler = StandardScaler()
    amount_scaled = scaler.fit_transform(np.log1p(amount))
    # combine sparse + dense -> sparse by hstack
    X_seed = hstack([X_seed_tfidf, csr_matrix(amount_scaled)])
    # train/test split
    X_tr, X_val, y_tr, y_val = train_test_split(X_seed, seed_df['y'].values, test_size=0.2, random_state=RANDOM_STATE, stratify=seed_df['y'].values)
    # classifier
    clf = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga', multi_class='multinomial', random_state=RANDOM_STATE)
    clf.fit(X_tr, y_tr)
    print("\nTrained classifier on seed labels.")
    # evaluate
    y_pred = clf.predict(X_val)
    print("\nValidation classification report:")
    print(classification_report(y_val, y_pred, digits=3))
    # show per-class precision/recall
    prfs = precision_recall_fscore_support(y_val, y_pred, labels=['finding','nofinding','cancelled'])
    print("Precision,Recall,F1 by class (finding,nofinding,cancelled):")
    print(np.round(prfs[0],3), np.round(prfs[1],3), np.round(prfs[2],3))

    # ---------------------------
    # Predict on full dataset
    # ---------------------------
    # prepare full feature matrix (tfidf + amount_scaled), reusing the scaler fitted on seed rows
    amount_all = df['overpayment_amount'].fillna(0).values.reshape(-1,1)
    amount_all_scaled = scaler.transform(np.log1p(amount_all))
    X_all = hstack([X_tfidf, csr_matrix(amount_all_scaled)])
    probs = clf.predict_proba(X_all)
    pred = clf.classes_[np.argmax(probs, axis=1)]
    pred_prob = np.max(probs, axis=1)
    df['pred_label_model'] = pred
    df['pred_prob'] = pred_prob
    # apply thresholding: low-confidence predictions become 'unknown'
    df['final_label'] = np.where(df['pred_prob'] >= PROB_THRESHOLD, df['pred_label_model'], 'unknown')

    # ---------------------------
    # Evidence extraction per row (seed match or feature contributions)
    # ---------------------------
    feature_count = X_tfidf.shape[1]
    coef = clf.coef_   # columns = TF features + 1 numeric amount feature (last)
    classes = clf.classes_

    def top_contributing_ngrams_for_row(row_idx, top_k=3):
        """Return up to ``top_k`` n-grams pushing row ``row_idx`` (a position) toward its predicted class."""
        # get nonzero indices and values of the row's tfidf vector
        row_coo = X_tfidf[row_idx].tocoo()
        cols = row_coo.col
        vals = row_coo.data
        if len(cols) == 0:
            return []
        # class predicted (positional lookup, consistent with X_tfidf)
        cl = df['pred_label_model'].iat[row_idx]
        cl_idx = list(classes).index(cl)
        if coef.shape[0] == 1:
            # Two-class model: a single coefficient row for classes[1];
            # the other class gets the negated weights.
            class_coef = coef[0] if cl_idx == 1 else -coef[0]
        else:
            class_coef = coef[cl_idx]
        # coefficients for TF features only (exclude last amount column)
        coef_cl = class_coef[:feature_count]
        # contribution per nonzero ngram: coef * tfidf
        contribs = vals * coef_cl[cols]
        # pick top positive contributions
        order = np.argsort(contribs)[::-1]
        top = []
        for ind in order[:top_k]:
            if contribs[ind] <= 0:
                continue
            top.append(feature_names[cols[ind]])
        return top

    # Compile each finding seed once instead of once per row.
    finding_seed_res = [
        (s, re.compile(r'\b' + re.escape(s) + r'\b')) for s in FINDING_SEED_TERMS
    ]
    # Fallback phrases per cluster, computed lazily and reused for every row.
    cluster_fallback_terms = {}

    def build_evidence_for_row(i):
        """Build the '; '-joined evidence string for the row at position ``i``."""
        text = df['filtered_text'].iat[i]
        # Negation patterns need short words ("no", "an") that token filtering
        # removed, so they are searched in the unfiltered preprocessed text.
        raw_text = df['merged_text'].iat[i]
        # seed match first
        seed_matches = []
        if CANCEL_RE.search(text):
            seed_matches.append('cancelled_seed_match')
        if NOFIND_RE.search(raw_text):
            seed_matches.append('nofinding_seed_match')
        # finding seeds
        for s, pattern in finding_seed_res:
            if pattern.search(text):
                seed_matches.append(s)
        if seed_matches:
            return '; '.join(seed_matches)
        # else fallback: top contributing ngrams
        top_ngrams = top_contributing_ngrams_for_row(i, top_k=4)
        if top_ngrams:
            return '; '.join(top_ngrams)
        # fallback to cluster terms
        cl = df['cluster'].iat[i]
        if cl not in cluster_fallback_terms:
            cluster_fallback_terms[cl] = [t for t,_ in top_terms_for_cluster(cl, top_n=6)][:3]
        return '; '.join(cluster_fallback_terms[cl])

    # plain loop over row positions (ok for up to few 100k rows)
    evidences = []
    for i in range(len(df)):
        evidences.append(build_evidence_for_row(i))
    df['evidence_phrases'] = evidences

    # ---------------------------
    # Save results
    # ---------------------------
    out_cols = list(df.columns)  # keep all columns
    df.to_csv("claims_with_model_labels_and_evidence.csv", index=False)
    print("\nSaved results to claims_with_model_labels_and_evidence.csv")

    # OPTIONAL: show cluster-level summaries combining language + amount + predicted label distribution
    for c in range(N_CLUSTERS):
        sub = df[df['cluster'] == c]
        print("\n--- Cluster", c, "size", len(sub), "---")
        # show top strong terms using earlier function
        top_c = top_terms_for_cluster(c, top_n=12)
        print("Top phrases:", [t for t,_ in top_c])
        if 'overpayment_amount' in df.columns:
            pct_pos = (sub['overpayment_amount'].fillna(0) > 0).mean() * 100
            print(f"% with positive overpayment_amount: {pct_pos:.1f}%")
        print("Predicted label counts:\n", sub['final_label'].value_counts(normalize=True))


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# ===================================
# Load and prepare merged text
# ===================================
# df = pd.read_csv("claims_data.csv")
# NOTE(review): the read_csv line above is commented out, so this statement relies on
# a `df` that already exists in the kernel.
# Join the two note columns into one text field; missing notes become empty strings.
df["merged_text"] = (
    df["communication_notes"].fillna("") + " " + df["free_flow_opt_note"].fillna("")
)

# ===================================
# Step 1: Clean and remove generic terms
# ===================================
# Tokens dropped outright because they carry no signal for clustering.
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Lower-case ``text``, keep letters only, and drop noise tokens.

    Parameters
    ----------
    text : str or None
        Raw note text. ``None`` and float NaN are treated as an empty note;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Space-joined tokens that are not in ``REMOVE_TERMS`` and are either at
        least ``MIN_WORD_LEN`` characters long or listed in ``WHITELIST``.
    """
    # Missing values must not crash on .lower() or leak the token "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # remove numbers/special chars
    return " ".join(
        tok
        for tok in text.split()
        if tok not in REMOVE_TERMS
        and (len(tok) >= MIN_WORD_LEN or tok in WHITELIST)
    )

# Apply the token filter to every merged note.
df["filtered_text"] = df["merged_text"].apply(clean_text)

# ===================================
# Step 2: Vectorize with n-grams
# ===================================
# Unigrams to trigrams that occur in at least 5 notes; no max_df cap in this cell.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5
)
# X has one row per row of df, in df's current row order.
X = vectorizer.fit_transform(df["filtered_text"])
# terms[i] is the phrase behind column i of X.
terms = vectorizer.get_feature_names_out()

# ===================================
# Step 3: Cluster
# ===================================
n_clusters = 5
# Fixed random_state so repeated runs use the same seed.
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ===================================
# Step 4: Top terms per cluster (remove overlaps)
# ===================================
def remove_redundant_phrases(phrases):
    """Drop phrases that are fully contained, word for word, in another phrase.

    Containment is checked on whole tokens, so "pay" is *not* treated as part
    of "overpayment", while "audit" is dropped when "audit completed" is
    present. Exact duplicates do not eliminate each other.

    Parameters
    ----------
    phrases : iterable of str
        Candidate phrases, typically ordered strongest first.

    Returns
    -------
    list of str
        The surviving phrases in their original input order.
    """
    phrases = list(phrases)
    # Pad with spaces so a substring test only matches on token boundaries.
    padded = [f" {' '.join(p.split())} " for p in phrases]
    kept = []
    for phrase, needle in zip(phrases, padded):
        if any(needle in other and needle != other for other in padded):
            continue
        kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases for cluster ``c``.

    The ``top_n`` phrases with the highest mean TF-IDF weight over the
    cluster's rows are taken and then passed through
    ``remove_redundant_phrases``, so fewer than ``top_n`` may be returned.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    top_n : int, default 20
        Number of phrases taken before de-duplication.

    Returns
    -------
    list of str
        Empty when the cluster has no rows.
    """
    # X is addressed by row position, so select positions rather than index
    # labels; the two only coincide when df has a default 0..n-1 index.
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# ===================================
# Step 5: Evidence column
# ===================================
def find_matches(text, phrases):
    """Return the phrases that occur in ``text`` as whole words.

    Each phrase is regex-escaped and anchored with word boundaries on both
    sides. The result keeps the order of ``phrases``.
    """
    return [
        candidate
        for candidate in phrases
        if re.search(rf"\b{re.escape(candidate)}\b", text)
    ]

# Per claim: the subset of its own cluster's top phrases that literally occur
# in the claim's filtered text (stored as a list per row).
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# ===================================
# Step 6: Rule-based label suggestion
# ===================================
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed", "audit findings"}
# NOTE(review): clean_text in this cell drops tokens shorter than MIN_WORD_LEN,
# so evidence built from filtered text presumably never contains the "no ..."
# phrases below; confirm against real evidence_terms.
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a coarse outcome label from evidence phrases and the amount.

    Rules are evaluated in priority order: cancelled phrase, finding phrase,
    no-finding phrase with a zero amount, positive amount, else "unknown".
    Phrases must match an evidence entry exactly.

    Parameters
    ----------
    evidence : iterable of str
        Evidence phrases attached to the claim.
    amount : number or None
        Overpayment amount. ``None``/NaN is treated as 0.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".
    """
    # A missing amount would otherwise fail both `== 0` and `> 0` (NaN) or
    # raise on comparison (None).
    if pd.isna(amount):
        amount = 0
    ev_set = set(evidence)
    if ev_set & CANCELLED_PHRASES:
        return "cancelled"
    if ev_set & FINDING_PHRASES:
        return "finding"
    if ev_set & NOFINDING_PHRASES and amount == 0:
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Row-wise label suggestion; r.get falls back to 0 when the row has no
# "overpayment_amount" entry.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# ===================================
# Step 7: Save results
# ===================================
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# ===================================
# Step 8: Extra view - samples and top terms
# ===================================
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    print("Top terms:", cluster_terms[c])
    # First rows of the cluster in df order (not a random sample).
    sample_notes = df[df['cluster'] == c]['merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print("-", note)

# ===================================
# Step 9: Final review DataFrame
# ===================================
# Work on a copy so the extra string column does not end up in df.
review_df = df.copy()
# Flatten each evidence list into a "; "-separated string.
review_df["evidence_terms_str"] = review_df["evidence_terms"].apply(lambda x: "; ".join(x))
review_df.to_csv("claims_cluster_review.csv", index=False)
review_df.head()


In [None]:
# -------------------------------
# High-accuracy pipeline (single cell)
# - seed mining (chi2) + manual seed list
# - weak supervision -> classifier (TF-IDF + overpayment_amount)
# - evidence extraction (seed match OR top contributing ngrams)
# - cluster diagnostics + top phrases per cluster
# - final CSVs: detailed outputs for review
# -------------------------------

import re
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

# -------------------------------
# CONFIG - tune these for your data
# -------------------------------
N_CLUSTERS = 6                 # for exploration only
TF_MIN_DF = 6                  # lower -> more phrases, higher -> more reliable
TF_MAX_DF = 0.85               # passed as max_df to the vectorizers
TF_NGRAM_RANGE = (1, 3)        # unigrams..trigrams
TF_MAX_FEATURES = 30000        # passed as max_features to the TF-IDF vectorizer
CHI2_TOP_K = 500               # cap on the number of chi2 candidate terms kept
MIN_CHI2_SCORE = 8.0           # minimum chi2 statistic for a candidate term
SEED_MIN_COUNT = 15            # minimum corpus count for a candidate term
PROB_THRESHOLD = 0.80          # model confidence threshold to auto-accept label
AMOUNT_OVERRIDE_THRESHOLD = 0  # if amount > this and no contradicting evidence, treat as signal
RANDOM_STATE = 42              # seed passed to KMeans

# -------------------------------
# 0) LOAD DATA - put your path here
# -------------------------------
# df = pd.read_csv("claims_data.csv")
# Must include: communication_notes, free_flow_opt_note, overpayment_amount, claim_number (optional)
# Fail fast with a clear message instead of a NameError further down.
if 'df' not in globals():
    raise RuntimeError("Please load your DataFrame into the variable `df` before running this cell (e.g. df = pd.read_csv(...)).")

# -------------------------------
# 1) Merge fields and basic preprocess
# -------------------------------
def preprocess_text(s):
    """Normalise one raw note into lower-case, letters-only, single-spaced text.

    Missing values yield an empty string. Substitutions run in a fixed order:
    short letter+digit codes and bare numbers, URLs, e-mail-like tokens, then
    every remaining character that is not a-z or whitespace.
    """
    if s is None or pd.isna(s):
        return ""
    cleaned = str(s).lower()
    for pattern in (
        r'\b[a-z]{0,3}\d{2,}\b',      # up to three letters followed by 2+ digits
        r'https?://\S+|www\.\S+',     # URLs
        r'\S+@\S+',                   # e-mail-like tokens
        r'[^a-z\s]',                  # keep letters and spaces
    ):
        cleaned = re.sub(pattern, ' ', cleaned)
    # Collapse runs of whitespace left behind by the substitutions.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Join the two note columns; df.get substitutes an all-empty Series when a column is absent.
df['merged_text'] = (df.get('communication_notes', pd.Series('', index=df.index)).fillna('') + ' ' +
                     df.get('free_flow_opt_note', pd.Series('', index=df.index)).fillna(''))
# Overwrite with the preprocessed text, so merged_text is lower-cased, letters-only from here on.
df['merged_text'] = df['merged_text'].apply(preprocess_text)

# -------------------------------
# 2) Token-level noise filtering (customize REMOVE_TOKENS)
# -------------------------------
# Administrative / boilerplate tokens removed before vectorisation.
REMOVE_TOKENS = {
    'nan', 'md', 'drg', 'hrp', 'ovptrxid', 'claimid', 'omniclaim',
    'please', 'thank', 'regards', 'sent', 'request', 'requested', 'note', 'chart', 'order', 'record', 'per',
}
MIN_TOKEN_LEN = 3
WHITELIST_SHORT = {'rpa', 'offset', 'icd'}  # keep meaningful short tokens if needed

def filter_tokens(txt):
    """Drop noise tokens and too-short tokens from whitespace-separated ``txt``.

    A token survives when it is not in ``REMOVE_TOKENS`` and is either at least
    ``MIN_TOKEN_LEN`` characters long or listed in ``WHITELIST_SHORT``.
    """
    def _keep(token):
        if token in REMOVE_TOKENS:
            return False
        return len(token) >= MIN_TOKEN_LEN or token in WHITELIST_SHORT

    return ' '.join(filter(_keep, txt.split()))

# Token-filtered version of the preprocessed notes; this is what gets vectorised.
df['filtered_text'] = df['merged_text'].apply(filter_tokens)

# -------------------------------
# 3) TF-IDF and Count vectorizers (same vocab)
# -------------------------------
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=TF_NGRAM_RANGE,
                        min_df=TF_MIN_DF,
                        max_df=TF_MAX_DF,
                        max_features=TF_MAX_FEATURES)
X_tfidf = tfidf.fit_transform(df['filtered_text'])
# feature_names[i] is the phrase behind column i of X_tfidf (and of X_count).
feature_names = np.array(tfidf.get_feature_names_out())

# The count vectorizer must tokenise exactly like the TF-IDF one. Stop words
# are removed *before* n-grams are formed, so without stop_words='english' the
# multi-word phrases in the shared vocabulary would be built from different
# token sequences and their counts would come out too low.
countvec = CountVectorizer(stop_words='english',
                           ngram_range=TF_NGRAM_RANGE, min_df=TF_MIN_DF,
                           max_df=TF_MAX_DF, vocabulary=tfidf.vocabulary_)
X_count = countvec.fit_transform(df['filtered_text'])

# -------------------------------
# 4) Clustering for exploration (optional)
# -------------------------------
km = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
df['cluster'] = km.fit_predict(X_tfidf)

def top_terms_for_cluster(cluster_id, top_n=20, min_weight=0.002):
    """Return the strongest TF-IDF terms of one cluster.

    Terms are ranked by their mean TF-IDF weight over the cluster's rows.
    Ranking stops at the first term whose mean weight is below `min_weight`;
    terms made up solely of REMOVE_TOKENS entries are skipped.

    Returns a list of (term, mean_weight) pairs, strongest first; an empty
    list when the cluster has no rows.
    """
    member_rows = np.flatnonzero((df['cluster'] == cluster_id).to_numpy())
    if member_rows.size == 0:
        return []
    mean_weights = X_tfidf[member_rows].mean(axis=0).A1
    ranked = []
    for col in np.argsort(mean_weights)[::-1]:
        weight = mean_weights[col]
        if weight < min_weight:
            # everything after this point is weaker still
            break
        term = feature_names[col]
        is_pure_noise = all(tok in REMOVE_TOKENS for tok in term.split())
        if is_pure_noise:
            continue
        ranked.append((term, weight))
        if len(ranked) >= top_n:
            break
    return ranked

# print small sample for debug
for c in range(min(N_CLUSTERS, 8)):
    sample_terms = [term for term, _weight in top_terms_for_cluster(c, top_n=8)]
    print(f"Cluster {c} sample top: {sample_terms}")

# -------------------------------
# 5) Data-driven seed discovery (chi2) - find ngrams associated with positive overpayment
# -------------------------------
# Positive class = rows with an overpayment amount > 0 (missing column/value -> 0).
df['has_amount'] = df.get('overpayment_amount', pd.Series(0, index=df.index)).fillna(0) > 0
has_amount = df['has_amount'].to_numpy(dtype=bool)
chi2_stats, pvals = chi2(X_count, has_amount.astype(int))
chi2_df = pd.DataFrame({'term': feature_names, 'chi2': chi2_stats, 'pval': pvals})
# attach counts
counts = np.asarray(X_count.sum(axis=0)).ravel()
chi2_df['count'] = counts
# chi2 measures dependence only, not its direction: a term typical of rows
# WITHOUT an overpayment scores just as high as one typical of rows with it.
# Keep only terms that occur at a higher per-row rate in the positive class.
pos_counts = np.asarray(X_count[has_amount].sum(axis=0)).ravel()
n_pos = int(has_amount.sum())
n_neg = int(has_amount.size - n_pos)
pos_rate = pos_counts / max(n_pos, 1)          # max(..., 1) avoids division by zero
neg_rate = (counts - pos_counts) / max(n_neg, 1)
chi2_df['positive_assoc'] = pos_rate > neg_rate
chi2_df = chi2_df.sort_values('chi2', ascending=False)

# Strongest positively-associated terms that are also frequent enough.
candidates = chi2_df[
    (chi2_df['chi2'] >= MIN_CHI2_SCORE)
    & (chi2_df['count'] >= SEED_MIN_COUNT)
    & chi2_df['positive_assoc']
]['term'].tolist()[:CHI2_TOP_K]
print(f"Found {len(candidates)} candidate finding terms via chi2. Example slice: {candidates[:30]}")

# -------------------------------
# 6) Manual seed lists (add domain phrases you know are strong)
# -------------------------------
# Hand-picked phrases treated as finding seeds.
# NOTE(review): 'refund request' cannot match text produced by filter_tokens,
# because 'request' is listed in REMOVE_TOKENS — confirm whether that is intended.
MANUAL_FINDING = [
    'overpayment', 'overpayment identified', 'overpayment recovered', 'refund request',
    'recoup', 'recoupment', 'recover overpayment', 'audit finding', 'audit findings',
    'duplicate claim', 'duplicate payment', 'amount owed', 'refund due', 'appeal upheld',
    'appeal denied', 'dispute upheld', 'dispute denied'
]
# Combine chi2 + manual seeds (unique, ordered): manual phrases first, then the
# chi2 candidates; dict keys keep first-seen order and drop repeats.
finding_seeds = list(dict.fromkeys(MANUAL_FINDING + candidates))

# Cancellation & no-find regex phrases
CANCEL_PATTERNS = [r'\bcancelled\b', r'\bcanceled\b', r'\bwithdrawn\b', r'\bvoided\b', r'\brescinded\b']
NOFIND_PATTERNS = [r'\bno (?:finding|findings|overpayment|issue)\b', r'\bno discrepancy', r'\bvalid charge\b', r'\bpaid correctly\b']
# Each list is collapsed into one case-insensitive alternation.
CANCEL_RE = re.compile('|'.join(CANCEL_PATTERNS), flags=re.IGNORECASE)
NOFIND_RE = re.compile('|'.join(NOFIND_PATTERNS), flags=re.IGNORECASE)

print("\nSample final finding seeds (top 40):", finding_seeds[:40])

# -------------------------------
# 7) Build high-precision seed labels (weak labels)
# Priority: cancelled_seed > finding_seed > nofind_seed
# -------------------------------
def seed_label_for_row(text):
    """Assign a weak seed label to one note.

    Checks run in priority order: cancellation regex, then any finding seed
    (matched on word boundaries), then the no-finding regex.

    Returns 'cancelled_seed', 'finding_seed', 'nofind_seed', or None when the
    input is not a string, is blank, or nothing matches.
    """
    if not isinstance(text, str) or text.strip() == '':
        return None
    if CANCEL_RE.search(text):
        return 'cancelled_seed'
    # exact token-boundary match against every seed phrase
    has_finding_seed = any(
        re.search(r'\b' + re.escape(seed) + r'\b', text) for seed in finding_seeds
    )
    if has_finding_seed:
        return 'finding_seed'
    return 'nofind_seed' if NOFIND_RE.search(text) else None

# Weak label per row, derived from the filtered text.
df['seed_label'] = df['filtered_text'].apply(seed_label_for_row)
print("\nSeed counts:", df['seed_label'].value_counts(dropna=False))

# -------------------------------
# 8) Prepare training data (rows with seed labels)
# -------------------------------
# Only rows that received a seed label are used for training; the seed names
# are mapped onto the final class names.
seed_df = df.loc[df['seed_label'].notnull()].copy()
seed_df['y'] = seed_df['seed_label'].map({
    'finding_seed': 'finding',
    'nofind_seed': 'nofinding',
    'cancelled_seed': 'cancelled',
})

print("\nTraining counts from seeds:\n", seed_df['y'].value_counts())

# If too few seeds, consider hand-labeling some rows (recommended)
if len(seed_df) < 100:
    print("WARNING: fewer than 100 seed-labeled rows. Consider adding some manual labels for better accuracy.")

# -------------------------------
# 9) Train classifier on TF-IDF + numeric overpayment
# -------------------------------
model_trained = False
# Row POSITIONS of the seed-labelled rows. X_tfidf is indexed by position, so
# index labels (seed_df.index) are only correct when df has a default
# RangeIndex; positions are always correct. seed_df keeps df's row order, so
# seed_df['y'] lines up with these positions.
seed_idx = np.flatnonzero(df['seed_label'].notnull().to_numpy())
y_seed = seed_df['y'].values
n_seed_classes = len(set(y_seed))
if seed_df.shape[0] >= 50 and n_seed_classes < 2:
    # A classifier cannot be fitted on a single class.
    print("Only one seed class present; skipping classifier training.")
if seed_df.shape[0] >= 50 and n_seed_classes >= 2:
    X_seed_tfidf = X_tfidf[seed_idx]
    # numeric feature: log1p(amount). The amount column is optional (same
    # fallback as the chi2 step); negatives are clipped to 0 because log1p is
    # undefined for values <= -1 and would inject NaN/-inf into the features.
    amount_col = df.get('overpayment_amount', pd.Series(0, index=df.index))
    amount_seed = (amount_col.fillna(0).clip(lower=0)
                   .to_numpy(dtype=float)[seed_idx].reshape(-1, 1))
    scaler = StandardScaler()
    amount_seed_scaled = scaler.fit_transform(np.log1p(amount_seed))
    # Final feature matrix: TF-IDF columns followed by one scaled amount column.
    X_seed = hstack([X_seed_tfidf, csr_matrix(amount_seed_scaled)]).tocsr()
    # split — stratify only when every class has at least two rows, otherwise
    # train_test_split raises.
    class_sizes = pd.Series(y_seed).value_counts()
    stratify_on = y_seed if class_sizes.min() >= 2 else None
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_seed, y_seed, test_size=0.2, random_state=RANDOM_STATE, stratify=stratify_on)
    # multi_class is deprecated in recent scikit-learn; the default already
    # fits a multinomial model for three or more classes with the saga solver.
    clf = LogisticRegression(max_iter=2000, class_weight='balanced', solver='saga',
                             random_state=RANDOM_STATE)
    clf.fit(X_tr, y_tr)
    model_trained = True
    # eval — note the labels come from the same text the model sees, so this
    # report measures agreement with the seed rules, not true accuracy.
    y_pred = clf.predict(X_val)
    print("\nValidation report on seed-labeled holdout:")
    print(classification_report(y_val, y_pred, digits=3))

# -------------------------------
# 10) Score entire dataset with classifier + build final label logic
# -------------------------------
# Positional snapshots of the columns the labelling rules read. The rules are
# evaluated for i in range(len(df)), i.e. by row POSITION; df.at[i, ...] would
# look rows up by index LABEL, which is only equivalent for a default RangeIndex.
_label_texts = df['filtered_text'].tolist()
# The amount column is optional (same fallback as the chi2 step); missing -> 0.
_label_amounts = (df.get('overpayment_amount', pd.Series(0, index=df.index))
                  .fillna(0).to_numpy(dtype=float))
_label_seeds = df['seed_label'].to_numpy()

if model_trained:
    # Same transform as in training; negatives are clipped because log1p is
    # undefined for values <= -1.
    amount_all = np.clip(_label_amounts, 0, None).reshape(-1, 1)
    amount_all_scaled = scaler.transform(np.log1p(amount_all))
    X_all = hstack([X_tfidf, csr_matrix(amount_all_scaled)])
    probs = clf.predict_proba(X_all)
    pred = clf.classes_[np.argmax(probs, axis=1)]
    pred_prob = np.max(probs, axis=1)
    df['pred_label_model'] = pred
    df['pred_prob'] = pred_prob

    # Final label logic:
    # - If cancel regex match -> cancelled
    # - Else if seed finding matched -> finding
    # - Else if model predicts with high prob -> model's label
    # - Else if amount > AMOUNT_OVERRIDE_THRESHOLD -> finding
    # - else unknown
    def final_label_row(i):
        """Return the final label for the row at position `i` (model available)."""
        # cancel explicit
        if CANCEL_RE.search(_label_texts[i]):
            return 'cancelled'
        # seed finding
        if _label_seeds[i] == 'finding_seed':
            return 'finding'
        # model confident
        if pred_prob[i] >= PROB_THRESHOLD:
            return str(pred[i])
        # amount tie-breaker
        if _label_amounts[i] > AMOUNT_OVERRIDE_THRESHOLD:
            return 'finding'
        return 'unknown'

    df['final_label'] = [final_label_row(i) for i in range(len(df))]
else:
    print("Model not trained due to insufficient seeds; falling back to rule-only labeling.")
    # simple rule-only final label
    def final_label_rule(i):
        """Return the final label for the row at position `i` (rules only)."""
        text = _label_texts[i]
        amt = _label_amounts[i]
        if CANCEL_RE.search(text):
            return 'cancelled'
        if any(re.search(r'\b' + re.escape(s) + r'\b', text) for s in finding_seeds):
            return 'finding'
        # a no-finding phrase only counts when no amount is recorded
        if NOFIND_RE.search(text) and amt == 0:
            return 'nofinding'
        if amt > AMOUNT_OVERRIDE_THRESHOLD:
            return 'finding'
        return 'unknown'
    df['final_label'] = [final_label_rule(i) for i in range(len(df))]
# -------------------------------
# 11) Evidence extraction per row (seed match or top contributing ngrams)
# -------------------------------
# compute feature_count
# Number of TF-IDF columns; the classifier's extra trailing column is the amount.
feature_count = X_tfidf.shape[1]
# Model weights and class order when a model exists, inert placeholders otherwise.
coef, classes = (clf.coef_, clf.classes_) if model_trained else (None, [])

def top_contrib_ngrams(row_idx, top_k=3):
    """Return the n-grams that push one row most towards its predicted class.

    Each non-zero TF-IDF entry of the row is multiplied by the classifier
    weight of that feature for the predicted class; features with a positive
    product are returned, largest first.

    Parameters
    ----------
    row_idx : int
        Row position (0-based) in X_tfidf / df.
    top_k : int
        Maximum number of n-grams to return.

    Returns
    -------
    list of str
        Empty when no model was trained or the row has no non-zero features.
    """
    if not model_trained:
        return []
    row_coo = X_tfidf[row_idx].tocoo()
    cols = row_coo.col
    vals = row_coo.data
    if len(cols) == 0:
        return []
    # .iat is positional, matching how X_tfidf is indexed.
    cl = df['pred_label_model'].iat[row_idx]
    cl_idx = list(classes).index(cl)
    if coef.shape[0] == 1:
        # Two-class model: coef_ holds a single row describing classes[1];
        # the weights for classes[0] are its negation. Indexing coef[1] here
        # would raise IndexError.
        class_weights = coef[0] if cl_idx == 1 else -coef[0]
    else:
        class_weights = coef[cl_idx]
    # Drop the trailing amount column so indices line up with feature_names.
    coef_cl = class_weights[:feature_count]
    contribs = vals * coef_cl[cols]
    top = []
    for ind in np.argsort(contribs)[::-1]:
        if contribs[ind] <= 0:
            # sorted descending: nothing positive can follow
            break
        top.append(feature_names[cols[ind]])
        if len(top) >= top_k:
            break
    return top

# Fallback evidence per cluster, computed once. Without this cache the cluster
# mean would be recomputed for every row that reaches the fallback.
_cluster_evidence_cache = {}

def build_evidence(i):
    """Return a short evidence string for the row at position `i`.

    Sources are tried in order: cancellation regex ('cancelled_seed'), the first
    matching finding seed (the seed itself), the no-finding regex
    ('nofind_seed'), the model's top contributing n-grams, and finally the top
    three terms of the row's cluster. Multiple terms are joined with '; '.
    """
    # .iat is positional, matching the range(len(df)) driver and X_tfidf rows.
    text = df['filtered_text'].iat[i]
    # cancel seed
    if CANCEL_RE.search(text):
        return 'cancelled_seed'
    # finding seed match
    for s in finding_seeds:
        if re.search(r'\b' + re.escape(s) + r'\b', text):
            return s
    # nofind regex
    if NOFIND_RE.search(text):
        return 'nofind_seed'
    # model contributions fallback
    contribs = top_contrib_ngrams(i, top_k=4)
    if contribs:
        return '; '.join(contribs)
    # cluster terms fallback
    cl = df['cluster'].iat[i]
    if cl not in _cluster_evidence_cache:
        cl_terms = [t for t, _ in top_terms_for_cluster(cl, top_n=6)]
        _cluster_evidence_cache[cl] = '; '.join(cl_terms[:3])
    return _cluster_evidence_cache[cl]

# One evidence string per row, in row order.
df['evidence_phrases'] = list(map(build_evidence, range(len(df))))

# -------------------------------
# 12) Cluster-level diagnostics (top terms + amount distribution + predicted labels)
# -------------------------------
cluster_summary = []
for c in range(N_CLUSTERS):
    sub = df.loc[df['cluster'] == c]
    if sub.empty:
        continue
    top = [term for term, _weight in top_terms_for_cluster(c, top_n=15)]
    # share of rows in the cluster that carry a positive amount, in percent
    pct_amount = sub['overpayment_amount'].fillna(0).gt(0).mean() * 100
    # relative frequency of each final label within the cluster
    label_counts = sub['final_label'].value_counts(normalize=True).to_dict()
    print(f"\nCluster {c} | size {len(sub)} | %with_amount {pct_amount:.1f}%")
    print("Top phrases:", top[:12])
    print("Label distribution:", label_counts)
    cluster_summary.append({
        'cluster': c,
        'size': len(sub),
        'top_phrases': top,
        '%with_amount': pct_amount,
        'label_distribution': label_counts
    })

# -------------------------------
# 13) Final exports for review
# -------------------------------
# create review dataframe with clear columns
review_cols = df.columns.tolist()  # keep all
# Key review columns go first; the model columns exist only when a model was trained.
compact_cols = ['final_label']
if model_trained:
    compact_cols += ['pred_label_model', 'pred_prob']
compact_cols += ['evidence_phrases', 'cluster', 'filtered_text', 'merged_text', 'overpayment_amount']
# Same columns as df, reordered so the key columns come up-front.
remaining_cols = [col for col in df.columns if col not in compact_cols]
df_review = df[compact_cols + remaining_cols]

# save files
df.to_csv("claims_full_results.csv", index=False)
df_review.to_csv("claims_review_compact.csv", index=False)
import json
with open("cluster_summary.json", "w") as f:
    # default=str serialises any value json cannot encode natively
    json.dump(cluster_summary, f, default=str)

print("\nSaved outputs:")
for saved_line in (
    " - claims_full_results.csv  (complete)",
    " - claims_review_compact.csv  (compact review)",
    " - cluster_summary.json      (cluster diagnostics)",
):
    print(saved_line)

# -------------------------------
# 14) Recommendations (printed)
# -------------------------------
print("\nRECOMMENDATIONS:")
for recommendation in (
    "1) Inspect 'claims_review_compact.csv' and the 'cluster_summary.json' to validate top phrases and label distribution.",
    "2) If you can manually label 300-500 rows (high-quality), retrain the classifier on those labels — accuracy will usually jump substantially.",
    "3) Tweak TF_MIN_DF, MIN_CHI2_SCORE, PROB_THRESHOLD to trade recall/precision.",
    "4) If you want, I can provide a quick notebook cell to sample 'unknown' rows for manual labeling.",
):
    print(recommendation)


In [None]:
###13
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# === Load data ===
# df = pd.read_csv("claims_data.csv")
# Concatenate the two note columns into one string per row; missing notes are
# treated as empty text. This reuses the `df` already in the kernel (the
# read_csv line above is commented out) and raises KeyError if either column
# is absent.
df["merged_text"] = (
    df["communication_notes"].fillna("") + " " + df["free_flow_opt_note"].fillna("")
)

# === Step 1: Clean and remove generic terms ===
# Generic tokens that carry no signal and are removed outright.
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
# Tokens shorter than this are dropped unless whitelisted.
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Lower-case `text`, keep letters only, and drop noise tokens.

    Every character other than a-z or whitespace becomes a space. A token is
    then discarded when it is in REMOVE_TERMS, or when it is shorter than
    MIN_WORD_LEN and not in WHITELIST. Returns the remaining tokens joined by
    single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())  # remove numbers/special chars
    return " ".join(
        tok for tok in letters_only.split()
        if tok not in REMOVE_TERMS
        and (len(tok) >= MIN_WORD_LEN or tok in WHITELIST)
    )

# Cleaned, noise-filtered text used for every later step in this cell.
df["filtered_text"] = df["merged_text"].apply(clean_text)

# === Step 2: Vectorize with n-grams ===
# Uni- to tri-grams, English stop words removed, terms kept only when they
# occur in at least 5 rows.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5
)
# X has one row per df row, in df's current row order.
X = vectorizer.fit_transform(df["filtered_text"])
# Column position -> term, aligned with the columns of X.
terms = vectorizer.get_feature_names_out()

# === Step 3: Cluster ===
# Fixed cluster count; random_state pins the k-means initialisation.
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# === Step 4: Remove redundant phrases ===
def remove_redundant_phrases(phrases):
    """Drop phrases that are a word-level sub-phrase of a longer kept phrase.

    Phrases are processed longest first (by character length). A phrase is
    dropped when its words appear as a contiguous run of words inside a phrase
    already kept. Containment is checked on whole words, so "pay" is NOT
    considered part of "overpayment audit", whereas "audit" is.

    Parameters
    ----------
    phrases : iterable of str
        Space-separated n-grams.

    Returns
    -------
    list of str
        The surviving phrases, longest first.
    """
    def _is_word_run(shorter, longer):
        # True when the word list `shorter` occurs contiguously inside `longer`.
        width = len(shorter)
        return any(longer[start:start + width] == shorter
                   for start in range(len(longer) - width + 1))

    kept = []
    kept_words = []
    for phrase in sorted(phrases, key=len, reverse=True):
        words = phrase.split()
        redundant = any(
            phrase != other and _is_word_run(words, other_words)
            for other, other_words in zip(kept, kept_words)
        )
        if not redundant:
            kept.append(phrase)
            kept_words.append(words)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top TF-IDF terms of cluster `c`.

    The `top_n` terms with the highest mean TF-IDF weight over the cluster's
    rows are taken and then passed through remove_redundant_phrases. Returns an
    empty list when the cluster has no rows.
    """
    # X is indexed by row POSITION. df is not reloaded in this cell, so its
    # index labels are whatever earlier cells left behind; selecting by
    # df[...].index would pick wrong rows for any non-default index.
    member_rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if member_rows.size == 0:
        return []
    mean_tfidf = X[member_rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

# Cluster id -> its representative phrases.
cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# === Step 5: Evidence column ===
def find_matches(text, phrases):
    """Return the phrases that occur in `text` on word boundaries.

    The result keeps the order of `phrases`; phrases are matched literally
    (regex metacharacters are escaped).
    """
    return [
        phrase
        for phrase in phrases
        if re.search(r'\b' + re.escape(phrase) + r'\b', text)
    ]

# Evidence = the row's cluster phrases that literally occur in its text.
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# === Step 6: Aggregate by claim_id before labeling ===
if "claim_id" in df.columns:
    def _join_unique(values):
        """Join the distinct non-null values of a column, in first-seen order."""
        return " | ".join(dict.fromkeys(map(str, values.dropna())))

    def _merge_evidence(values):
        """Union the per-row evidence lists into one list, in first-seen order."""
        merged = {}
        for row_terms in values:
            merged.update(dict.fromkeys(row_terms))
        return list(merged)

    # Columns that need their own treatment:
    # - claim_id is the grouping key and is emitted by the groupby itself;
    # - evidence_terms holds lists, which cannot go through set()/str-join
    #   (lists are unhashable) and must stay lists for suggest_label;
    # - cluster must stay a single numeric id so later `df["cluster"] == c`
    #   filters keep working; the first row's cluster represents the claim;
    # - overpayment_amount takes the max across the claim's lines.
    special_cols = {"claim_id", "overpayment_amount", "evidence_terms", "cluster"}
    agg_funcs = {col: _join_unique for col in df.columns if col not in special_cols}
    agg_funcs["evidence_terms"] = _merge_evidence
    agg_funcs["cluster"] = "first"
    if "overpayment_amount" in df.columns:
        agg_funcs["overpayment_amount"] = "max"  # take max amount if multiple lines
    df = df.groupby("claim_id", as_index=False).agg(agg_funcs)

# === Step 7: Rule-based label suggestion ===
# Phrase sets driving the rule-based label; matched as substrings of the
# space-joined evidence terms.
# NOTE(review): clean_text drops tokens shorter than MIN_WORD_LEN (e.g. "no")
# and the vectorizer removes English stop words, so the "no ..." phrases
# probably never reach the evidence — confirm.
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed", "appeal denied", "audit findings"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from evidence phrases and the overpayment amount.

    Priority: cancelled phrase > finding phrase > no-finding phrase with a zero
    amount > positive amount ("finding") > "unknown".

    Parameters
    ----------
    evidence : list of str or str
        Evidence phrases; a single string is treated as one phrase.
    amount : number or None
        Overpayment amount; None or NaN is treated as 0.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".
    """
    if isinstance(evidence, str):
        # joining a bare string would interleave spaces between its characters
        evidence = [evidence]
    # NaN is the only value not equal to itself; a missing amount means "no
    # amount recorded", so it must behave like 0 in the comparisons below.
    if amount is None or amount != amount:
        amount = 0
    ev_text = " ".join(evidence).lower()
    if any(p in ev_text for p in CANCELLED_PHRASES):
        return "cancelled"
    if any(p in ev_text for p in FINDING_PHRASES):
        return "finding"
    if any(p in ev_text for p in NOFINDING_PHRASES) and amount == 0:
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Rule-based label per row; a missing amount column counts as 0.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# === Step 8: Extra business rules ===
# When any row of a claim is labelled "finding", every row of that claim gets
# "finding"; otherwise the claim takes the label of its earliest-posted row.
# The rule groups by claim_id, so that column must exist as well.
if {"claim_id", "posted_date", "business_area"}.issubset(df.columns):
    df = df.sort_values("posted_date")
    df["final_label"] = df.groupby("claim_id")["pattern_label"].transform(
        lambda x: "finding" if "finding" in x.values else x.iloc[0]
    )
else:
    df["final_label"] = df["pattern_label"]

# === Step 9: Save results ===
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# === Step 10: Show clusters and sample rows ===
# claim_id is optional in this cell (see Step 6), so only show the preview
# columns that actually exist instead of failing with a KeyError.
preview_cols = [
    col for col in ["claim_id", "filtered_text", "evidence_terms", "final_label"]
    if col in df.columns
]
for c in range(n_clusters):
    print(f"\nCluster {c} top terms: {cluster_terms[c]}")
    sample_rows = df[df["cluster"] == c].head(5)
    print(sample_rows[preview_cols])


In [None]:
# dynamic clusters
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --------------------------
# Load and prepare data
# --------------------------
# df = pd.read_csv("claims_data.csv")
# Assumes you have columns:
# claim_id, communication_notes, free_flow_opt_note, overpayment_amount, posted_date, business_area

# Step 1: Merge text per claim_id (multiple lines -> single row)
# Concatenate the two note columns; missing notes count as empty text.
df["merged_text"] = (
    df["communication_notes"].fillna("") + " " + df["free_flow_opt_note"].fillna("")
)
# Collapse rows that share all four key columns into one row, joining their
# text with spaces. dropna=False keeps groups whose key contains a missing
# value; the default silently discards every claim with a missing amount,
# posted date or business area.
# NOTE(review): a claim with several distinct amounts/dates still yields
# several rows, because those columns are part of the key — confirm intent.
df = df.groupby(
    ["claim_id", "overpayment_amount", "posted_date", "business_area"],
    as_index=False,
    dropna=False,
).agg({"merged_text": " ".join})

# --------------------------
# Step 2: Text cleaning
# --------------------------
# Generic tokens that carry no signal and are removed outright.
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
# Tokens shorter than this are dropped unless whitelisted.
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Lower-case `text`, keep letters only, and drop noise tokens.

    Every character other than a-z or whitespace becomes a space. A token is
    then discarded when it is in REMOVE_TERMS, or when it is shorter than
    MIN_WORD_LEN and not in WHITELIST. Returns the remaining tokens joined by
    single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())  # remove numbers/special chars
    return " ".join(
        tok for tok in letters_only.split()
        if tok not in REMOVE_TERMS
        and (len(tok) >= MIN_WORD_LEN or tok in WHITELIST)
    )

# Cleaned, noise-filtered text used for every later step in this cell.
df["filtered_text"] = df["merged_text"].apply(clean_text)

# --------------------------
# Step 3: Vectorization
# --------------------------
# English stop words removed; terms kept only when they occur in at least 3 rows.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # capture up to trigrams
    min_df=3
)
# X has one row per df row, in df's current row order.
X = vectorizer.fit_transform(df["filtered_text"])
# Column position -> term, aligned with the columns of X.
terms = vectorizer.get_feature_names_out()

# --------------------------
# Step 4: Auto-pick n_clusters
# --------------------------
# Try k = 3..9 and keep the k with the best silhouette score.
sil_scores = {}
n_samples = X.shape[0]
# KMeans needs k <= n_samples and the silhouette score needs at most
# n_samples - 1 distinct labels, so cap the search on small data sets.
max_k = min(9, n_samples - 1)
for k in range(3, max_k + 1):
    km_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = km_temp.fit_predict(X)
    # k-means may return fewer distinct labels than requested
    if 1 < len(set(labels_temp)) < n_samples:
        sil = silhouette_score(X, labels_temp)
        sil_scores[k] = sil

if sil_scores:
    best_k = max(sil_scores, key=sil_scores.get)
else:
    # No k could be scored (too few rows); fall back to the largest feasible
    # count up to 3 instead of failing on max() of an empty dict.
    best_k = max(1, min(3, n_samples))
    print("WARNING: too few rows to score cluster counts; using fallback.")
print("Silhouette scores:", sil_scores)
print("Best cluster count:", best_k)

# --------------------------
# Step 5: Final clustering
# --------------------------
km = KMeans(n_clusters=best_k, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# --------------------------
# Step 6: Top terms per cluster
# --------------------------
def remove_redundant_phrases(phrases):
    """Drop phrases that are a word-level sub-phrase of a longer kept phrase.

    Phrases are processed longest first (by character length). A phrase is
    dropped when its words appear as a contiguous run of words inside a phrase
    already kept. Containment is checked on whole words, so "pay" is NOT
    considered part of "overpayment audit", whereas "audit" is.

    Parameters
    ----------
    phrases : iterable of str
        Space-separated n-grams.

    Returns
    -------
    list of str
        The surviving phrases, longest first.
    """
    def _is_word_run(shorter, longer):
        # True when the word list `shorter` occurs contiguously inside `longer`.
        width = len(shorter)
        return any(longer[start:start + width] == shorter
                   for start in range(len(longer) - width + 1))

    kept = []
    kept_words = []
    for phrase in sorted(phrases, key=len, reverse=True):
        words = phrase.split()
        redundant = any(
            phrase != other and _is_word_run(words, other_words)
            for other, other_words in zip(kept, kept_words)
        )
        if not redundant:
            kept.append(phrase)
            kept_words.append(words)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top TF-IDF terms of cluster `c`.

    The `top_n` terms with the highest mean TF-IDF weight over the cluster's
    rows are taken and then passed through remove_redundant_phrases. Returns an
    empty list when the cluster has no rows.
    """
    in_cluster = (df["cluster"] == c).to_numpy()
    member_index = df.index[in_cluster]
    if len(member_index) == 0:
        return []
    mean_tfidf = X[member_index].mean(axis=0).A1
    strongest = mean_tfidf.argsort()[::-1][:top_n]
    return remove_redundant_phrases([terms[col] for col in strongest])

# Cluster id -> its representative phrases.
cluster_terms = {}
for c in range(best_k):
    cluster_terms[c] = top_terms_for_cluster(c)

# --------------------------
# Step 7: Evidence terms per claim
# --------------------------
def find_matches(text, phrases):
    """Return the phrases that occur in `text` on word boundaries.

    The result keeps the order of `phrases`; phrases are matched literally
    (regex metacharacters are escaped).
    """
    return [
        phrase
        for phrase in phrases
        if re.search(r'\b' + re.escape(phrase) + r'\b', text)
    ]

# Evidence = the claim's cluster phrases that literally occur in its text.
df["evidence_terms"] = [
    find_matches(claim_text, cluster_terms[cluster_id])
    for claim_text, cluster_id in zip(df["filtered_text"], df["cluster"])
]

# --------------------------
# Step 8: Label suggestion rules
# --------------------------
# Phrase sets driving the rule-based label.
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed", "appeal denied", "audit findings"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from evidence phrases and the overpayment amount.

    A rule phrase counts as present when it occurs, on whole-word boundaries,
    inside any evidence phrase. Exact equality is not enough: evidence comes
    from remove_redundant_phrases, which keeps only the longest n-gram, so the
    bigram "appeal denied" typically survives only inside a trigram such as
    "appeal denied overpayment".

    Priority: cancelled phrase > finding phrase > no-finding phrase with a zero
    amount > positive amount ("finding") > "unknown".

    Parameters
    ----------
    evidence : iterable of str
        Evidence phrases (space-separated n-grams).
    amount : number or None
        Overpayment amount; None or NaN is treated as 0.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".
    """
    # NaN is the only value not equal to itself; a missing amount means "no
    # amount recorded", so it must behave like 0 in the comparisons below.
    if amount is None or amount != amount:
        amount = 0
    # Pad with spaces so a substring test only succeeds on whole words.
    padded_evidence = [f" {phrase} " for phrase in evidence]

    def _mentions(rule_phrases):
        return any(f" {rule} " in ev for rule in rule_phrases for ev in padded_evidence)

    if _mentions(CANCELLED_PHRASES):
        return "cancelled"
    if _mentions(FINDING_PHRASES):
        return "finding"
    if _mentions(NOFINDING_PHRASES) and amount == 0:
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Rule-based label per claim; a missing amount column counts as 0.
df["pattern_label"] = df.apply(
    lambda claim: suggest_label(claim["evidence_terms"], claim.get("overpayment_amount", 0)),
    axis=1,
)

# --------------------------
# Step 9: Preview results
# --------------------------
print("\n=== Top terms per cluster ===")
for c in range(best_k):
    print(f"\nCluster {c} ({len(df[df['cluster']==c])} claims):")
    print(cluster_terms[c])

print("\n=== Sample claims per cluster ===")
SAMPLES_PER_CLUSTER = 3
for c in range(best_k):
    print(f"\n--- Cluster {c} ---")
    # first few raw (uncleaned) notes of the cluster
    cluster_notes = df.loc[df["cluster"] == c, "merged_text"]
    for note in cluster_notes.head(SAMPLES_PER_CLUSTER):
        print("-", note)

# --------------------------
# Step 10: Save final data
# --------------------------
df.to_csv("claims_with_labels.csv", index=False)
print("\nSaved to claims_with_labels.csv")
