In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Load your data
# --------------------------
# Replace 'claims.csv' with your file path
df = pd.read_csv('claims.csv')

# Merge notes: both note columns joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# --------------------------
# Cleaning
# --------------------------
def clean_text(text):
    """Normalise a note for vectorisation.

    The input is stringified and lower-cased, every character outside
    a-z / 0-9 / whitespace becomes a space, runs of whitespace collapse to a
    single space, and the result is stripped.
    """
    cleaned = str(text).lower()
    for pattern in (r'[^a-z0-9\s]', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

# Cleaned copy of every merged note, kept next to the raw text.
df["clean_text"] = df["merged_text"].map(clean_text)

df.head()

In [None]:
# --------------------------
# Vectorize & cluster
# --------------------------
# Unigram + bigram TF-IDF; a term must occur in at least 3 notes and in at
# most 90% of them.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, stop_words="english")
X = vectorizer.fit_transform(df["clean_text"])

# Choose cluster count (tweak as needed)
n_clusters = 8
# n_init is pinned so the result does not depend on the installed
# scikit-learn version's default; 10 matches the later cells of this notebook.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Number of notes per cluster, ordered by cluster id.
df['cluster'].value_counts().sort_index()

In [None]:
# --------------------------
# Top terms per cluster
# --------------------------
import numpy as np

def top_terms_for_cluster(cluster_id, top_n=15):
    """Return the ``top_n`` terms with the highest mean TF-IDF in one cluster.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    # asarray(...).ravel() flattens the (1, n_terms) mean to 1-D.
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

for c in range(n_clusters):
    # Cluster size = number of rows carrying this cluster id.
    cluster_size = int((df["cluster"] == c).sum())
    print(f"\nCluster {c} (size {cluster_size}):")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:20s} {score:.4f}")

In [None]:
# --------------------------
# Add evidence column: matched top terms from each note's cluster
# --------------------------
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` found in ``text``, in input order.

    A term counts as found only when it is delimited by word boundaries, so
    "draw" does not match inside "withdrawn".
    """
    def _occurs(term):
        return re.search(r'\b' + re.escape(term) + r'\b', text) is not None

    return [term for term in terms if _occurs(term)]

# Cluster id -> list of its top term strings (scores dropped).
cluster_top_terms = {
    c: [pair[0] for pair in top_terms_for_cluster(c)]
    for c in range(n_clusters)
}


def _matched_terms_for_row(row):
    """'; '-joined top terms of the row's cluster that occur in its clean_text."""
    hits = find_matches_in_text(row["clean_text"], cluster_top_terms[row["cluster"]])
    return "; ".join(hits)


# Evidence column: which of the cluster's top terms each note contains.
df["matched_terms"] = df.apply(_matched_terms_for_row, axis=1)

df[["merged_text", "cluster", "matched_terms"]].head()

In [None]:
# --------------------------
# Show sample notes per cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    # First few notes of the cluster, in DataFrame order.
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print(f"- {note}")

In [None]:
# --------------------------
# Interactive cluster tagging
# --------------------------
cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence (top terms + three notes) before asking for a label.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print(f"- {note}")
    # The typed text is stored as-is (trimmed, lower-cased); it is not validated.
    raw_label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = raw_label.strip().lower()

# Assign labels back to df
df['pattern_label'] = df['cluster'].map(cluster_labels)

# Save labeled dataset
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")

In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Lower-case ``text`` and strip IDs, numbers, punctuation and known codes.

    Steps, in order: remove ID-like tokens (up to three letters followed by
    three or more digits), remove standalone 2-4 digit numbers, replace every
    non-letter with a space, then drop the custom stop-words. The result is
    single-space separated with no leading/trailing whitespace.

    Punctuation is stripped *before* the stop-word filter so that codes
    written as "drg," or "(hrp)" are removed too.
    """
    text = text.lower()
    # Remove IDs like n19878, abc123 (letters+digits combos)
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)
    # Remove years/numbers
    text = re.sub(r'\b\d{2,4}\b', ' ', text)
    # Remove special characters so tokens are bare words
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Remove known meaningless codes (expand this list as you see them)
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}
    # split()/join also collapses whitespace and trims the ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# merged_text holds the *cleaned* concatenation of both note columns here.
df['merged_text'] = (
    df['communication_notes'].fillna('') + ' ' + df['free_flow_opt_note'].fillna('')
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,        # drop terms present in more than 85% of notes
    min_df=5,           # drop terms present in fewer than 5 notes
    ngram_range=(1, 2)  # single words + 2-word phrases
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned so the result does not depend on the installed
# scikit-learn version's default; 10 matches the later cells of this notebook.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return up to ``top_n`` terms of a cluster whose mean TF-IDF >= ``min_score``.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    results = []
    for i in mean_tfidf.argsort()[::-1]:
        # Scores are visited in descending order, so the first one below the
        # threshold ends the scan instead of walking the whole vocabulary.
        if len(results) >= top_n or mean_tfidf[i] < min_score:
            break
        results.append((terms[i], mean_tfidf[i]))
    return results

# --------------------------
# Display results
# --------------------------
for c in range(n_clusters):
    # Cluster size = number of rows carrying this cluster id.
    cluster_size = int((df['cluster'] == c).sum())
    print(f"\n{'=' * 50}\nCluster {c} (size {cluster_size})")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:25s} {score:.4f}")


In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Lower-case ``text`` and strip IDs, numbers, punctuation and known codes.

    Punctuation is stripped *before* the stop-word filter so that codes
    written as "drg," or "(hrp)" are removed too. The result is single-space
    separated with no leading/trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)   # IDs
    text = re.sub(r'\b\d{2,4}\b', ' ', text)            # years/numbers
    text = re.sub(r'[^a-z\s]', ' ', text)               # non-letters
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}      # expand this
    # split()/join also collapses whitespace and trims the ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# merged_text holds the *cleaned* concatenation of both note columns here, so
# the sample notes printed further down in this cell show cleaned text.
df['merged_text'] = (
    df['communication_notes'].fillna('') + ' ' + df['free_flow_opt_note'].fillna('')
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,        # drop terms present in more than 85% of notes
    min_df=5,           # drop terms present in fewer than 5 notes
    ngram_range=(1, 2)
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned so the result does not depend on the installed
# scikit-learn version's default; 10 matches the later cells of this notebook.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return up to ``top_n`` terms of a cluster whose mean TF-IDF >= ``min_score``.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    results = []
    for i in mean_tfidf.argsort()[::-1]:
        # Scores are visited in descending order, so the first one below the
        # threshold ends the scan instead of walking the whole vocabulary.
        if len(results) >= top_n or mean_tfidf[i] < min_score:
            break
        results.append((terms[i], mean_tfidf[i]))
    return results

# --------------------------
# Add evidence column
# --------------------------
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` that occur in ``text`` as whole words/phrases.

    Matching is delimited by word boundaries; the output keeps the order of
    ``terms``.
    """
    return [
        term
        for term in terms
        if re.search(r'\b' + re.escape(term) + r'\b', text) is not None
    ]

# Cluster id -> list of its top term strings (scores dropped).
cluster_top_terms = {
    c: [pair[0] for pair in top_terms_for_cluster(c)]
    for c in range(n_clusters)
}


def _matched_terms_for_row(row):
    """'; '-joined top terms of the row's cluster that occur in its merged_text."""
    hits = find_matches_in_text(row["merged_text"], cluster_top_terms[row["cluster"]])
    return "; ".join(hits)


df["matched_terms"] = df.apply(_matched_terms_for_row, axis=1)

# --------------------------
# Show sample notes per cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print(f"- {note}")

# --------------------------
# Interactive cluster tagging
# --------------------------
cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence (top terms + three notes) before asking for a label.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print(f"- {note}")
    # The typed text is stored as-is (trimmed, lower-cased); it is not validated.
    raw_label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = raw_label.strip().lower()

# Copy each cluster's label onto its rows, then persist the whole frame.
df['pattern_label'] = df['cluster'].map(cluster_labels)
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# ============================
# 1. Imports
# ============================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text

# ============================
# 2. Load your data
# ============================
# df = pd.read_csv("your_claims.csv")  # Replace with your file
# Example columns: communication_notes, free_flow_opt_note
# Both note columns as strings, joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str)
    .add(" ")
    .add(df["free_flow_opt_note"].fillna("").astype(str))
)

# ============================
# 3. Clean text
# ============================
def clean_text(txt):
    """Normalise a note: lower-case, drop the word 'nan', punctuation and extra spaces.

    Each pattern below is replaced by a single space, in this order: the
    standalone word "nan", any character outside a-z / 0-9 / whitespace, and
    runs of whitespace. The result is stripped.
    """
    cleaned = str(txt).lower()
    for pattern in (r"\bnan\b", r"[^a-z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

df["clean_text"] = df["merged_text"].apply(clean_text)

# ============================
# 4. TF-IDF with bigrams & trigrams
# ============================
# Keep default stopwords but allow important domain words.
# The set difference is turned into a list because TfidfVectorizer's
# stop_words only accepts 'english', a list or None; recent scikit-learn
# releases reject a frozenset at fit time.
custom_stopwords = sorted(text.ENGLISH_STOP_WORDS - {
    "letter", "request", "sent", "medical", "record"
})

vectorizer = TfidfVectorizer(
    stop_words=custom_stopwords,
    ngram_range=(1, 3),        # unigrams, bigrams, trigrams
    min_df=5                   # must appear in at least 5 docs
)

X = vectorizer.fit_transform(df["clean_text"])

# ============================
# 5. KMeans clustering
# ============================
n_clusters = 5  # You can adjust this
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ============================
# 6. Function: Top phrases per cluster
# ============================
def top_terms_for_cluster(cluster_id, top_n=20):
    """Return the ``top_n`` terms with the highest mean TF-IDF in one cluster.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# Show top terms per cluster
for c in range(n_clusters):
    # Cluster size = number of rows carrying this cluster id.
    cluster_size = int((df['cluster'] == c).sum())
    print(f"\n=== Cluster {c} (size {cluster_size}) ===")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:40s} {score:.4f}")

# ============================
# 7. Evidence phrase extraction
# ============================
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` that occur in ``text`` as full phrases.

    A phrase must be delimited by word boundaries on both sides; the output
    keeps the order of ``terms``.
    """
    def _occurs(term):
        return re.search(r"\b" + re.escape(term) + r"\b", text) is not None

    return list(filter(_occurs, terms))

# Dictionary: cluster → top phrases
# Cluster id -> list of its top phrase strings (scores dropped).
cluster_top_terms = {
    c: [t for t, _ in top_terms_for_cluster(c)]
    for c in range(n_clusters)
}

# Create column with matched phrases
df["matched_terms"] = df.apply(
    lambda row: "; ".join(find_matches_in_text(row["clean_text"], cluster_top_terms[row["cluster"]])),
    axis=1
)

# ============================
# 8. Sample viewer per cluster
# ============================
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    print("Top phrases:", ", ".join(cluster_top_terms[c]))
    cluster_notes = df.loc[df['cluster'] == c, 'merged_text']
    # Seeded so the same notes are shown on every run of the notebook.
    sample_notes = cluster_notes.sample(
        min(SAMPLES_PER_CLUSTER, len(cluster_notes)),
        random_state=42
    )
    for note in sample_notes:
        print("-", note)

# ============================
# 9. Interactive labeling
# ============================
cluster_labels = {}
for c in range(n_clusters):
    print(f"\nCluster {c} top phrases: {', '.join(cluster_top_terms[c][:10])}")
    sample_notes = df[df['cluster'] == c]['merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    # The typed text is stored as-is (trimmed, lower-cased); it is not validated.
    label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ").strip().lower()
    cluster_labels[c] = label

df['pattern_label'] = df['cluster'].map(cluster_labels)

# ============================
# 10. Save
# ============================
df.to_csv("claims_cluster_tagged.csv", index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# ============================
# 1. Imports
# ============================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text

# ============================
# 2. Load your data
# ============================
# df = pd.read_csv("your_claims.csv")
# Both note columns as strings, joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("").astype(str)
    .add(" ")
    .add(df["free_flow_opt_note"].fillna("").astype(str))
)

# ============================
# 3. Clean text
# ============================
def clean_text(txt):
    """Normalise a note: lower-case, drop the word 'nan', punctuation and extra spaces.

    Each pattern below is replaced by a single space, in this order: the
    standalone word "nan", any character outside a-z / 0-9 / whitespace, and
    runs of whitespace. The result is stripped.
    """
    cleaned = str(txt).lower()
    for pattern in (r"\bnan\b", r"[^a-z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

df["clean_text"] = df["merged_text"].apply(clean_text)

# ============================
# 4. TF-IDF with bigrams/trigrams
# ============================
# The set difference is turned into a list because TfidfVectorizer's
# stop_words only accepts 'english', a list or None; recent scikit-learn
# releases reject a frozenset at fit time.
custom_stopwords = sorted(text.ENGLISH_STOP_WORDS - {
    "letter", "request", "sent", "medical", "record"
})

vectorizer = TfidfVectorizer(
    stop_words=custom_stopwords,
    ngram_range=(1, 3),        # unigrams, bigrams, trigrams
    min_df=5
)

X = vectorizer.fit_transform(df["clean_text"])

# ============================
# 5. KMeans clustering
# ============================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# ============================
# 6. Top phrases per cluster
# ============================
def top_terms_for_cluster(cluster_id, top_n=20):
    """Return the ``top_n`` terms with the highest mean TF-IDF in one cluster.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# Cluster id -> list of its top phrase strings (scores dropped).
cluster_top_terms = dict(
    (c, [pair[0] for pair in top_terms_for_cluster(c)])
    for c in range(n_clusters)
)

# ============================
# 7. Evidence phrase extraction
# ============================
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` found in ``text`` as whole words/phrases, in input order."""
    found = []
    for term in terms:
        # Word boundaries on both sides keep "draw" from matching "withdrawn".
        whole_phrase = r"\b" + re.escape(term) + r"\b"
        if re.search(whole_phrase, text):
            found.append(term)
    return found

def _matched_terms_for_row(row):
    """'; '-joined top phrases of the row's cluster that occur in its clean_text."""
    hits = find_matches_in_text(row["clean_text"], cluster_top_terms[row["cluster"]])
    return "; ".join(hits)


df["matched_terms"] = df.apply(_matched_terms_for_row, axis=1)

# ============================
# 8. Auto-suggest cluster labels
# ============================
FINDING_KEYWORDS = [
    "overpayment", "offset", "uncheck", "refund", "rpa",
    "recoupment", "dispute", "recover", "repayment"
]
# NOTE(review): keywords containing common English stop-words ("no findings",
# "nothing found", "closed without action") may never appear among the
# vectorizer's phrases if those words are filtered as stop-words - confirm.
NOFINDING_KEYWORDS = [
    "no findings", "no issue", "valid", "compliant",
    "nothing found", "no error"
]
CANCELLED_KEYWORDS = [
    "cancelled", "withdrawn", "closed without action",
    "claim withdrawn"
]


def _mentions_any(haystack, keywords):
    """True if any keyword occurs in ``haystack`` starting at a word boundary."""
    return any(re.search(r"\b" + re.escape(kw), haystack) for kw in keywords)


def auto_label_cluster(phrases):
    """Suggest a label for a cluster from its top phrases.

    The phrases are joined with spaces and lower-cased, then checked against
    the keyword lists in priority order finding > nofinding > cancelled.
    Returns "unknown" when nothing matches.

    A keyword must start at a word boundary, so inflections still match
    ("refunded", "recovery") while "invalid" or "noncompliant" no longer
    count as the no-finding keywords "valid" / "compliant".
    """
    joined = " ".join(phrases).lower()
    if _mentions_any(joined, FINDING_KEYWORDS):
        return "finding"
    if _mentions_any(joined, NOFINDING_KEYWORDS):
        return "nofinding"
    if _mentions_any(joined, CANCELLED_KEYWORDS):
        return "cancelled"
    return "unknown"

cluster_labels = {}
for c in range(n_clusters):
    suggestion = auto_label_cluster(cluster_top_terms[c])
    print(f"\n=== Cluster {c} ===")
    print("Top phrases:", ", ".join(cluster_top_terms[c]))
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print(f"- {note}")
    label = input(f"Suggested label: {suggestion} — Press Enter to accept or type new label: ").strip().lower()
    # An empty answer accepts the suggestion.
    cluster_labels[c] = label or suggestion

df['pattern_label'] = df['cluster'].map(cluster_labels)

# ============================
# 9. Save
# ============================
df.to_csv("claims_cluster_tagged.csv", index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
# =====================================
# 1. Imports
# =====================================
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =====================================
# 2. Load your dataset
# =====================================
# df = pd.read_csv("your_claims_data.csv")  # Replace with your file path

# Merge the relevant text columns into one
# Both note columns joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# Basic cleaning: lowercase, then special characters and whitespace runs each
# become a single space, then trim.
_normalized = df["merged_text"].str.lower()
for _pattern in (r"[^a-z0-9\s]", r"\s+"):
    _normalized = _normalized.str.replace(_pattern, " ", regex=True)
df["clean_text"] = _normalized.str.strip()

# =====================================
# 3. Remove stopwords, codes, numbers
# =====================================
REMOVE_TERMS = {"md", "drg", "hrp", "ovptrxid", "nan"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_and_filter_tokens(text):
    """Drop unwanted whitespace-separated tokens from ``text``.

    A token is removed when its lower-cased form is in REMOVE_TERMS, when it
    consists only of digits, or when it is shorter than MIN_WORD_LEN and not
    in WHITELIST. Surviving tokens are re-joined with single spaces.
    """
    def _keep(token):
        lowered = token.lower()
        if lowered in REMOVE_TERMS or token.isdigit():
            return False
        return len(token) >= MIN_WORD_LEN or lowered in WHITELIST

    return " ".join(tok for tok in text.split() if _keep(tok))

df["filtered_text"] = df["clean_text"].map(clean_and_filter_tokens)

# =====================================
# 4. TF-IDF Vectorization
# =====================================
# 1-3 grams over tokens of two or more letters; terms in fewer than 5 notes
# are dropped.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5,
    token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b',
)
X = vectorizer.fit_transform(df["filtered_text"])

# =====================================
# 5. Clustering
# =====================================
n_clusters = 5  # You can adjust
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
df["cluster"] = km.labels_

# =====================================
# 6. Function to get top terms per cluster
# =====================================
def top_terms_for_cluster(cluster_id, top_n=15):
    """Return the ``top_n`` terms with the highest mean TF-IDF in one cluster.

    Reads the notebook globals ``df`` (needs a "cluster" column), ``X`` (the
    TF-IDF matrix, one row per row of ``df``) and ``vectorizer``.

    Returns a list of ``(term, mean_tfidf)`` tuples in descending score order,
    or ``[]`` when no row belongs to ``cluster_id``.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# =====================================
# 7. Print top terms
# =====================================
for c in range(n_clusters):
    # Cluster size = number of rows carrying this cluster id.
    cluster_size = int((df['cluster'] == c).sum())
    print(f"\nCluster {c} (size {cluster_size}):")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:40s} {score:.4f}")

# =====================================
# 8. Add evidence_terms column
# =====================================
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` found in ``text`` as whole words/phrases, in input order."""
    def _occurs(term):
        return re.search(r'\b' + re.escape(term) + r'\b', text) is not None

    return [term for term in terms if _occurs(term)]

# Cluster id -> list of its top term strings (scores dropped).
cluster_top_terms = {
    c: [pair[0] for pair in top_terms_for_cluster(c)]
    for c in range(n_clusters)
}


def _evidence_for_row(row):
    """'; '-joined top terms of the row's cluster that occur in its filtered_text."""
    hits = find_matches_in_text(row["filtered_text"], cluster_top_terms[row["cluster"]])
    return "; ".join(hits)


df["evidence_terms"] = df.apply(_evidence_for_row, axis=1)

# =====================================
# 9. Show sample notes per cluster
# =====================================
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(SAMPLES_PER_CLUSTER)
    for note in sample_notes:
        print(f"- {note}")

# =====================================
# 10. Interactive cluster tagging
# =====================================
cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence (top terms + three notes) before asking for a label.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:40s} {score:.4f}")
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print(f"- {note}")
    # The typed text is stored as-is (trimmed, lower-cased); it is not validated.
    raw_label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ")
    cluster_labels[c] = raw_label.strip().lower()

df['pattern_label'] = df['cluster'].map(cluster_labels)

# =====================================
# 11. Save final labeled dataset
# =====================================
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load data
# df = pd.read_csv("claims_data.csv")
# Both note columns joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# === Step 1: Clean and remove generic terms ===
REMOVE_TERMS = {"md", "drg", "hrp", "nan"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    Every character outside a-z / whitespace becomes a space. A token is then
    dropped when it is in REMOVE_TERMS, or when it is shorter than
    MIN_WORD_LEN and not in WHITELIST. Survivors are joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())  # remove numbers/special chars

    def _keep(tok):
        if tok in REMOVE_TERMS:
            return False
        return len(tok) >= MIN_WORD_LEN or tok in WHITELIST

    return " ".join(filter(_keep, letters_only.split()))

df["filtered_text"] = df["merged_text"].map(clean_text)

# === Step 2: Vectorize with n-grams ===
# 1-3 grams; terms present in fewer than 5 notes are dropped.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# === Step 3: Cluster ===
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
df["cluster"] = km.labels_

# === Step 4: Top terms per cluster (remove overlaps) ===
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer kept phrase.

    Phrases are processed from longest to shortest (by character count) and
    returned in that order. Containment is checked on whole words, so
    "overpayment" is dropped when "overpayment closing" is kept, but "pay" is
    not dropped merely because it is a substring of "overpayment".
    """
    kept = []
    for phrase in sorted(phrases, key=len, reverse=True):
        # Padding both sides with spaces turns substring search into a
        # whole-word (contiguous token sequence) containment test.
        padded = f" {phrase} "
        if not any(phrase != longer and padded in f" {longer} " for longer in kept):
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases of cluster ``c``.

    The ``top_n`` terms with the highest mean TF-IDF over the cluster's rows
    are taken and passed through ``remove_redundant_phrases``. Reads the
    notebook globals ``df``, ``X`` and ``terms``. Returns ``[]`` when no row
    belongs to the cluster.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == c).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

# Cluster id -> its de-duplicated top phrases.
cluster_terms = dict((c, top_terms_for_cluster(c)) for c in range(n_clusters))

# === Step 5: Evidence column ===
def find_matches(text, phrases):
    """Return the entries of ``phrases`` found in ``text`` as whole words/phrases, in input order."""
    def _occurs(phrase):
        return re.search(r'\b' + re.escape(phrase) + r'\b', text) is not None

    return [phrase for phrase in phrases if _occurs(phrase)]

def _evidence_for_row(row):
    """Phrases of the row's cluster that occur in its filtered_text (as a list)."""
    return find_matches(row["filtered_text"], cluster_terms[row["cluster"]])


df["evidence_terms"] = df.apply(_evidence_for_row, axis=1)

# === Step 6: Rule-based label suggestion ===
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
# NOTE(review): clean_text drops tokens shorter than MIN_WORD_LEN unless
# whitelisted, so "no" never reaches the evidence; "no findings" and
# "no overpayment" look unreachable with the current cleaning - confirm.
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from matched evidence phrases and the overpayment amount.

    Rules, first match wins: any cancelled phrase -> "cancelled"; any finding
    phrase -> "finding"; a no-finding phrase with amount == 0 -> "nofinding";
    amount > 0 -> "finding"; otherwise "unknown".

    ``amount`` is coerced with ``float()``; values that cannot be converted
    (None, non-numeric text) are treated as NaN, which is neither zero nor
    positive, instead of raising on comparison.
    """
    ev_set = set(evidence)
    try:
        amount = float(amount)
    except (TypeError, ValueError):
        amount = float("nan")
    if not ev_set.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not ev_set.isdisjoint(FINDING_PHRASES):
        return "finding"
    if amount == 0 and not ev_set.isdisjoint(NOFINDING_PHRASES):
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

def _label_for_row(r):
    """Suggested label from the row's evidence terms and overpayment amount (0 if the column is absent)."""
    return suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0))


df["pattern_label"] = df.apply(_label_for_row, axis=1)

# === Step 7: Save results ===
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# === Debug: Show clusters ===
for c in range(n_clusters):
    phrases = cluster_terms[c]
    print(f"\nCluster {c}: {phrases}")


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =========================
# Load & merge text columns
# =========================
# df = pd.read_csv("claims_data.csv")
# Both note columns joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# =========================
# Step 1: Clean text
# =========================
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}  # short words you still want

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    Every character outside a-z / whitespace becomes a space. A token is then
    dropped when it is in REMOVE_TERMS, or when it is shorter than
    MIN_WORD_LEN and not in WHITELIST. Survivors are joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())  # keep only letters

    def _keep(tok):
        if tok in REMOVE_TERMS:
            return False
        return len(tok) >= MIN_WORD_LEN or tok in WHITELIST

    return " ".join(filter(_keep, letters_only.split()))

df["filtered_text"] = df["merged_text"].map(clean_text)

# =========================
# Step 2: TF-IDF Vectorizer (1-3 grams)
# =========================
# Terms present in fewer than 5 notes are dropped.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# =========================
# Step 3: KMeans clustering
# =========================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
df["cluster"] = km.labels_

# =========================
# Step 4: Top terms per cluster (remove overlaps)
# =========================
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer kept phrase.

    Phrases are processed from longest to shortest (by character count) and
    returned in that order. Containment is checked on whole words, so
    "overpayment" is dropped when "overpayment closing" is kept, but "pay" is
    not dropped merely because it is a substring of "overpayment".
    """
    kept = []
    for phrase in sorted(phrases, key=len, reverse=True):
        # Padding both sides with spaces turns substring search into a
        # whole-word (contiguous token sequence) containment test.
        padded = f" {phrase} "
        if not any(phrase != longer and padded in f" {longer} " for longer in kept):
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return the de-duplicated top phrases of cluster ``c``.

    The ``top_n`` terms with the highest mean TF-IDF over the cluster's rows
    are taken and passed through ``remove_redundant_phrases``. Reads the
    notebook globals ``df``, ``X`` and ``terms``. Returns ``[]`` when no row
    belongs to the cluster.
    """
    # X is addressed by row *position*, whereas df.index holds *labels*; the
    # two only coincide for a default RangeIndex. Use positions from a mask.
    row_pos = np.flatnonzero((df["cluster"] == c).to_numpy())
    if row_pos.size == 0:
        return []
    mean_tfidf = np.asarray(X[row_pos].mean(axis=0)).ravel()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

# Cluster id -> its de-duplicated top phrases.
cluster_terms = dict((c, top_terms_for_cluster(c)) for c in range(n_clusters))

# =========================
# Step 5: Evidence terms per claim
# =========================
def find_matches(text, phrases):
    """Return the entries of ``phrases`` found in ``text`` as whole words/phrases, in input order."""
    return [
        phrase
        for phrase in phrases
        if re.search(r'\b' + re.escape(phrase) + r'\b', text) is not None
    ]

def _evidence_for_row(row):
    """Phrases of the row's cluster that occur in its filtered_text (as a list)."""
    return find_matches(row["filtered_text"], cluster_terms[row["cluster"]])


df["evidence_terms"] = df.apply(_evidence_for_row, axis=1)

# =========================
# Step 6: Rule-based label suggestion
# =========================
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
# NOTE(review): clean_text drops tokens shorter than MIN_WORD_LEN unless
# whitelisted, so "no" never reaches the evidence; "no findings" and
# "no overpayment" look unreachable with the current cleaning - confirm.
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a label from matched evidence phrases and the overpayment amount.

    Rules, first match wins: any cancelled phrase -> "cancelled"; any finding
    phrase -> "finding"; a no-finding phrase with amount == 0 -> "nofinding";
    amount > 0 -> "finding"; otherwise "unknown".

    ``amount`` is coerced with ``float()``; values that cannot be converted
    (None, non-numeric text) are treated as NaN, which is neither zero nor
    positive, instead of raising on comparison.
    """
    ev_set = set(evidence)
    try:
        amount = float(amount)
    except (TypeError, ValueError):
        amount = float("nan")
    if not ev_set.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not ev_set.isdisjoint(FINDING_PHRASES):
        return "finding"
    if amount == 0 and not ev_set.isdisjoint(NOFINDING_PHRASES):
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

def _label_for_row(r):
    """Suggested label from the row's evidence terms and overpayment amount (0 if the column is absent)."""
    return suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0))


df["pattern_label"] = df.apply(_label_for_row, axis=1)

# =========================
# Step 7: Save results
# =========================
df.to_csv("claims_with_labels.csv", index=False)
print("✅ Saved to claims_with_labels.csv")

# =========================
# Step 8: Show cluster summaries
# =========================
SAMPLES_PER_CLUSTER = 5

for c in range(n_clusters):
    cluster_df = df.loc[df['cluster'] == c]
    rule = "=" * 80
    print(rule)
    print(f"📌 Cluster {c} — size: {len(cluster_df)}")
    print("Top 20 terms/phrases:")
    for term in cluster_terms[c]:
        print(f"  {term}")

    # Overpayment stats: share of the cluster's rows with a positive amount.
    if "overpayment_amount" in df.columns:
        pos_amount = cluster_df["overpayment_amount"].gt(0).mean() * 100
        print(f"\n💰 % with positive overpayment_amount: {pos_amount:.1f}%")

    print("\nSample claim notes:")
    # Never ask for more rows than the cluster has; seeded for repeatability.
    n_samples = min(SAMPLES_PER_CLUSTER, len(cluster_df))
    sample_rows = cluster_df.sample(n_samples, random_state=42)
    for note in sample_rows["merged_text"]:
        print(f"- {note}")
    print(rule)


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# =========================
# Load data
# =========================
# df = pd.read_csv("claims_data.csv")
# Both note columns joined by one space; missing notes count as "".
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# =========================
# Step 1: Clean text
# =========================
REMOVE_TERMS = {
    "md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record",
    "chart", "order", "verify", "reported", "please", "note", "per"
}
MIN_WORD_LEN = 3
WHITELIST = {"rpa", "offset"}  # short words you want to keep

def clean_text(text):
    """Lower-case ``text``, keep letters only and drop unwanted tokens.

    Every character outside a-z / whitespace becomes a space. A token is then
    dropped when it is in REMOVE_TERMS, or when it is shorter than
    MIN_WORD_LEN and not in WHITELIST. Survivors are joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())

    def _keep(tok):
        if tok in REMOVE_TERMS:
            return False
        return len(tok) >= MIN_WORD_LEN or tok in WHITELIST

    return " ".join(filter(_keep, letters_only.split()))

df["filtered_text"] = df["merged_text"].map(clean_text)

# =========================
# Step 2: TF-IDF Vectorizer (1-3 grams)
# =========================
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5,    # drop phrases appearing in fewer than 5 claims
    max_df=0.8,  # remove phrases appearing in >80% of claims
)
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# =========================
# Step 3: KMeans clustering
# =========================
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
df["cluster"] = km.labels_

# =========================
# Step 4: Top terms per cluster (remove overlaps)
# =========================
def remove_redundant_phrases(phrases):
    """Drop phrases that are contained, word for word, in a longer kept phrase.

    Phrases are processed from longest to shortest (by character count) and
    returned in that order. Containment is checked on whole words, so
    "overpayment" is dropped when "overpayment closing" is kept, but "pay" is
    not dropped merely because it is a substring of "overpayment".
    """
    kept = []
    for phrase in sorted(phrases, key=len, reverse=True):
        # Padding both sides with spaces turns substring search into a
        # whole-word (contiguous token sequence) containment test.
        padded = f" {phrase} "
        if not any(phrase != longer and padded in f" {longer} " for longer in kept):
            kept.append(phrase)
    return kept

def top_terms_for_cluster(c, top_n=20):
    """Return up to ``top_n`` characteristic terms for cluster ``c``, best first.

    Terms are ranked by their mean TF-IDF weight over the cluster's rows of
    ``X``. Terms containing any REMOVE_TERMS token are discarded, then
    phrases contained in a longer candidate are removed.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    top_n : int, default 20
        Maximum number of terms to return.

    Returns
    -------
    list of str
        Terms in descending mean TF-IDF order; [] for an empty cluster.
    """
    # Use positional row numbers, not index labels: X is addressed by
    # position, and df's index need not be 0..n-1 (e.g. after filtering).
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n * 2]  # take more, then filter
    ranked = [terms[i] for i in top_idx]
    # remove admin-like meaningless phrases again
    ranked = [t for t in ranked if REMOVE_TERMS.isdisjoint(t.split())]
    # Re-apply the TF-IDF ranking after de-duplication so the truncation
    # keeps the best-scoring terms regardless of the order the helper returns.
    survivors = set(remove_redundant_phrases(ranked))
    return [t for t in ranked if t in survivors][:top_n]

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# =========================
# Step 5: Evidence terms per claim
# =========================
def find_matches(text, phrases):
    """Return the phrases that occur in ``text`` on word boundaries.

    Each phrase is matched literally (regex-escaped) between ``\\b``
    anchors. The result keeps the order of ``phrases``.
    """
    return [
        candidate
        for candidate in phrases
        if re.search(r'\b' + re.escape(candidate) + r'\b', text) is not None
    ]

# For every claim, list which of ITS OWN cluster's top terms literally occur
# in the claim's filtered text (row-wise apply; result is a list per row).
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# =========================
# Step 6: Rule-based label suggestion
# =========================
# Evidence phrases that drive the rule-based label below.
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a coarse outcome label from evidence phrases and an amount.

    Rules, first match wins:
      1. any CANCELLED_PHRASES in evidence             -> "cancelled"
      2. any FINDING_PHRASES in evidence               -> "finding"
      3. any NOFINDING_PHRASES in evidence, amount==0  -> "nofinding"
      4. amount > 0                                    -> "finding"
      5. otherwise                                     -> "unknown"

    Parameters
    ----------
    evidence : iterable of str
        Phrases found for the claim (compared by exact equality).
    amount : number, numeric string, or None
        Overpayment amount. Values that cannot be converted to float
        (None, "n/a", ...) are treated as missing and never satisfy the
        amount-based rules.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".

    NOTE(review): in this pipeline evidence comes from text processed by
    clean_text, which drops tokens shorter than MIN_WORD_LEN (3) unless
    whitelisted. "no findings" and "no overpayment" therefore look
    unreachable as evidence; confirm and adjust the phrases or the cleaning.
    """
    ev_set = set(evidence)
    # Coerce defensively: comparing None or a string with 0 raises TypeError.
    try:
        amount = float(amount)
    except (TypeError, ValueError):
        amount = float("nan")  # NaN fails both `== 0` and `> 0`
    if not ev_set.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not ev_set.isdisjoint(FINDING_PHRASES):
        return "finding"
    if amount == 0 and not ev_set.isdisjoint(NOFINDING_PHRASES):
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Row-wise label suggestion. `r.get(..., 0)` falls back to 0 when the
# frame has no "overpayment_amount" column.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# =========================
# Step 7: Create final descriptive column for clarity
# =========================
# Comma-separated copy of the claim's cluster top terms.
df["cluster_top_terms"] = df["cluster"].apply(lambda c: ", ".join(cluster_terms[c]))

# =========================
# Step 8: Save full enriched dataset
# =========================
# Writes to the current working directory, without the index column.
df.to_csv("claims_with_clusters_and_labels.csv", index=False)
print("✅ Saved to claims_with_clusters_and_labels.csv")

# =========================
# Step 9: Show summaries
# =========================
# Maximum number of example notes printed per cluster.
SAMPLES_PER_CLUSTER = 5

# Print, for each cluster: its size, its top terms, the share of claims with
# a positive overpayment amount (if that column exists), and sample notes.
for c in range(n_clusters):
    cluster_df = df[df['cluster'] == c]
    print("="*80)
    print(f"📌 Cluster {c} — size: {len(cluster_df)}")
    # The heading text is fixed; the list printed below it can hold fewer
    # than 20 entries.
    print("Top 20 terms/phrases:")
    for term in cluster_terms[c]:
        print(f"  {term}")

    if "overpayment_amount" in df.columns:
        # Mean of a boolean Series = fraction of True values.
        pos_amount = (cluster_df["overpayment_amount"] > 0).mean() * 100
        print(f"\n💰 % with positive overpayment_amount: {pos_amount:.1f}%")

    print("\nSample claim notes:")
    # Cap the sample size at the cluster size; fixed seed keeps the sample
    # the same across runs on the same data.
    sample_rows = cluster_df.sample(
        min(SAMPLES_PER_CLUSTER, len(cluster_df)),
        random_state=42
    )
    # Show the raw merged notes (not the filtered text) for readability.
    for note in sample_rows["merged_text"]:
        print(f"- {note}")
    print("="*80)


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# ----------------------------------------
# Load data
# ----------------------------------------
# NOTE: the load below is commented out, so this cell relies on a `df` that is
# already defined in the kernel (an earlier cell reads 'claims.csv').
# df = pd.read_csv("claims_data.csv")
# Concatenate the two free-text note columns into one string per claim;
# missing notes are replaced by "" before concatenation.
df["merged_text"] = (
    df["communication_notes"].fillna("") + " " + df["free_flow_opt_note"].fillna("")
)

# ----------------------------------------
# Step 1: Clean and remove generic terms
# ----------------------------------------
# Tokens dropped outright during cleaning (generic / boilerplate words).
REMOVE_TERMS = {"md", "drg", "hrp", "nan", "request", "letter", "sent", "medical", "record"}
# Tokens shorter than this are dropped unless they are whitelisted.
MIN_WORD_LEN = 3
# Both current entries are already >= MIN_WORD_LEN characters, so the
# whitelist has no effect until shorter words are added to it.
WHITELIST = {"rpa", "offset"}

def clean_text(text):
    """Normalise one note into a space-separated string of kept tokens.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space (this removes digits and punctuation), then drop tokens
    that are in REMOVE_TERMS or that are shorter than MIN_WORD_LEN and not
    in WHITELIST.

    Parameters
    ----------
    text : object
        The raw note. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as an empty note.

    Returns
    -------
    str
        The kept tokens joined by single spaces ("" when nothing survives).
    """
    # Missing values would otherwise raise AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = re.sub(r"[^a-z\s]", " ", str(text).lower())  # remove numbers/special chars
    return " ".join(
        tok
        for tok in lowered.split()
        if tok not in REMOVE_TERMS
        and (len(tok) >= MIN_WORD_LEN or tok in WHITELIST)
    )

# Apply clean_text to every merged note; the result feeds both the
# vectorizer and the later evidence matching.
df["filtered_text"] = df["merged_text"].apply(clean_text)

# ----------------------------------------
# Step 2: Vectorize with n-grams
# ----------------------------------------
# Unigrams to trigrams, scikit-learn's built-in English stop-word list,
# and a lower document-frequency bound only (no max_df is set here).
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=5
)
# X has one row per row of df (same order); `terms` are its column names.
X = vectorizer.fit_transform(df["filtered_text"])
terms = vectorizer.get_feature_names_out()

# ----------------------------------------
# Step 3: Cluster
# ----------------------------------------
n_clusters = 5
# Fixed random_state so repeated runs on the same data give the same clusters.
km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
# Store the cluster id assigned to each claim.
df["cluster"] = km.fit_predict(X)

# ----------------------------------------
# Step 4: Top terms per cluster (remove overlaps)
# ----------------------------------------
def remove_redundant_phrases(phrases):
    """Drop phrases that are wholly contained in another, longer phrase.

    Containment is tested on whole tokens, so "overpayment" is redundant
    next to "recover overpayment" but "pay" is not redundant next to
    "overpayment". The surviving phrases are returned in their ORIGINAL
    order, so a list that arrives ranked by relevance stays ranked and a
    caller can safely truncate it with ``[:top_n]``.

    Parameters
    ----------
    phrases : iterable of str
        Candidate phrases, typically ordered best-first.

    Returns
    -------
    list of str
        The non-redundant phrases, in input order.
    """
    phrases = list(phrases)
    # Pad with spaces so that substring containment equals token containment.
    padded = [" " + " ".join(p.split()) + " " for p in phrases]
    kept_padded = []
    kept_positions = set()
    # Decide longest-first: a phrase can only be contained in a longer one.
    by_length = sorted(range(len(phrases)), key=lambda i: len(padded[i]), reverse=True)
    for pos in by_length:
        candidate = padded[pos]
        contained = any(
            candidate in bigger and candidate != bigger for bigger in kept_padded
        )
        if not contained:
            kept_padded.append(candidate)
            kept_positions.add(pos)
    return [p for pos, p in enumerate(phrases) if pos in kept_positions]

def top_terms_for_cluster(c, top_n=20):
    """Return the characteristic terms of cluster ``c``.

    The ``top_n`` terms with the highest mean TF-IDF weight over the
    cluster's rows of ``X`` are selected, then phrases contained in a longer
    selected phrase are removed (so fewer than ``top_n`` may be returned).

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    top_n : int, default 20
        Number of highest-weighted terms considered.

    Returns
    -------
    list of str
        The de-duplicated terms, in the order produced by
        ``remove_redundant_phrases``; [] for an empty cluster.
    """
    # Use positional row numbers, not index labels: X is addressed by
    # position, and df's index need not be 0..n-1 (e.g. after filtering).
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    raw_terms = [terms[i] for i in top_idx]
    return remove_redundant_phrases(raw_terms)

cluster_terms = {c: top_terms_for_cluster(c) for c in range(n_clusters)}

# ----------------------------------------
# Step 5: Evidence column
# ----------------------------------------
def find_matches(text, phrases):
    """Collect, in input order, every phrase found in ``text`` as whole words.

    A phrase counts as found when its regex-escaped form matches between
    ``\\b`` word-boundary anchors.
    """
    hits = []
    for candidate in phrases:
        pattern = r'\b' + re.escape(candidate) + r'\b'
        if re.search(pattern, text) is None:
            continue
        hits.append(candidate)
    return hits

# For every claim, list which of ITS OWN cluster's top terms literally occur
# in the claim's filtered text (row-wise apply; result is a list per row).
df["evidence_terms"] = df.apply(
    lambda row: find_matches(row["filtered_text"], cluster_terms[row["cluster"]]),
    axis=1
)

# ----------------------------------------
# Step 6: Rule-based label suggestion
# ----------------------------------------
# Evidence phrases that drive the rule-based label below.
FINDING_PHRASES = {"overpayment closing", "recover overpayment", "audit completed"}
NOFINDING_PHRASES = {"no findings", "no overpayment", "reconsideration denied"}
CANCELLED_PHRASES = {"withdrawn", "cancelled", "rescinded"}

def suggest_label(evidence, amount):
    """Suggest a coarse outcome label from evidence phrases and an amount.

    Rules, first match wins:
      1. any CANCELLED_PHRASES in evidence             -> "cancelled"
      2. any FINDING_PHRASES in evidence               -> "finding"
      3. any NOFINDING_PHRASES in evidence, amount==0  -> "nofinding"
      4. amount > 0                                    -> "finding"
      5. otherwise                                     -> "unknown"

    Parameters
    ----------
    evidence : iterable of str
        Phrases found for the claim (compared by exact equality).
    amount : number, numeric string, or None
        Overpayment amount. Values that cannot be converted to float
        (None, "n/a", ...) are treated as missing and never satisfy the
        amount-based rules.

    Returns
    -------
    str
        One of "cancelled", "finding", "nofinding", "unknown".

    NOTE(review): in this pipeline evidence comes from text processed by
    clean_text, which drops tokens shorter than MIN_WORD_LEN (3) unless
    whitelisted. "no findings" and "no overpayment" therefore look
    unreachable as evidence; confirm and adjust the phrases or the cleaning.
    """
    ev_set = set(evidence)
    # Coerce defensively: comparing None or a string with 0 raises TypeError.
    try:
        amount = float(amount)
    except (TypeError, ValueError):
        amount = float("nan")  # NaN fails both `== 0` and `> 0`
    if not ev_set.isdisjoint(CANCELLED_PHRASES):
        return "cancelled"
    if not ev_set.isdisjoint(FINDING_PHRASES):
        return "finding"
    if amount == 0 and not ev_set.isdisjoint(NOFINDING_PHRASES):
        return "nofinding"
    if amount > 0:
        return "finding"
    return "unknown"

# Row-wise label suggestion. `r.get(..., 0)` falls back to 0 when the
# frame has no "overpayment_amount" column.
df["pattern_label"] = df.apply(
    lambda r: suggest_label(r["evidence_terms"], r.get("overpayment_amount", 0)), axis=1
)

# ----------------------------------------
# Step 7: Save results
# ----------------------------------------
# Writes to the current working directory, without the index column.
df.to_csv("claims_with_labels.csv", index=False)
print("Saved to claims_with_labels.csv")

# ----------------------------------------
# Step 8: Debug - Show clusters
# ----------------------------------------
# One line per cluster with the Python list of its top terms.
for c in range(n_clusters):
    print(f"\nCluster {c}: {cluster_terms[c]}")

# ----------------------------------------
# EXTRA VIEW: Filter top terms by minimum TF-IDF weight
# ----------------------------------------
MIN_TFIDF_WEIGHT = 0.05  # adjust threshold

def top_terms_above_threshold(c, min_weight=MIN_TFIDF_WEIGHT, top_n=20):
    """Return up to ``top_n`` strong-signal terms for cluster ``c``, best first.

    Only terms whose mean TF-IDF weight over the cluster's rows of ``X`` is
    at least ``min_weight`` are considered. Terms containing any
    REMOVE_TERMS token are discarded, then phrases contained in a longer
    candidate are removed.

    Parameters
    ----------
    c : int
        Cluster id as stored in ``df["cluster"]``.
    min_weight : float, default MIN_TFIDF_WEIGHT
        Minimum mean TF-IDF weight for a term to qualify.
    top_n : int, default 20
        Maximum number of terms to return.

    Returns
    -------
    list of str
        Terms in descending mean TF-IDF order; [] for an empty cluster.
    """
    # Use positional row numbers, not index labels: X is addressed by
    # position, and df's index need not be 0..n-1 (e.g. after filtering).
    rows = np.flatnonzero((df["cluster"] == c).to_numpy())
    if rows.size == 0:
        return []
    mean_tfidf = X[rows].mean(axis=0).A1
    strong_idx = np.flatnonzero(mean_tfidf >= min_weight)
    ranked_idx = strong_idx[np.argsort(mean_tfidf[strong_idx])[::-1]]
    ranked = [terms[i] for i in ranked_idx]
    ranked = [t for t in ranked if REMOVE_TERMS.isdisjoint(t.split())]
    # Re-apply the TF-IDF ranking after de-duplication so the truncation
    # keeps the best-scoring terms regardless of the order the helper returns.
    survivors = set(remove_redundant_phrases(ranked))
    return [t for t in ranked if t in survivors][:top_n]

strong_cluster_terms = {c: top_terms_above_threshold(c) for c in range(n_clusters)}

# Print the thresholded term list of every cluster under a banner.
print("\n\n📊 Strong Signal Top Terms per Cluster (TF-IDF >= {:.2f})".format(MIN_TFIDF_WEIGHT))
for c in range(n_clusters):
    print("="*80)
    print(f"Cluster {c}")
    for term in strong_cluster_terms[c]:
        print(f"  {term}")

# ----------------------------------------
# Step 9: Add strong terms column for each row & save
# ----------------------------------------
# Each row receives the Python list of its cluster's strong terms.
# NOTE(review): the lists are presumably written to the CSV below as their
# string representation, not as a delimited field; confirm this is intended.
df["strong_terms"] = df["cluster"].map(strong_cluster_terms)

# Final export with everything
# This is a second, separate file in addition to "claims_with_labels.csv".
df.to_csv("claims_with_labels_and_strong_terms.csv", index=False)
print("Saved to claims_with_labels_and_strong_terms.csv")
