# Claim Notes Pattern Discovery & Cluster Tagging
This notebook helps you:
1. Load your claims data.
2. Merge and clean the `communication_notes` and `free_flow_opt_note` columns.
3. Cluster similar claim notes.
4. View **top keywords** and **sample notes** for each cluster.
5. Tag clusters as `finding`, `nofinding`, `cancelled`, or `unknown` interactively.

You can then use these tags to create regex patterns or train a classifier.

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Load the claims export
# --------------------------
# Point this at your own file if it is not 'claims.csv'
df = pd.read_csv('claims.csv')

# Join both note columns into a single string per claim;
# a missing note contributes an empty string on its side of the separator.
df["merged_text"] = (
    df["communication_notes"].fillna("")
    .add(" ")
    .add(df["free_flow_opt_note"].fillna(""))
)

# --------------------------
# Cleaning
# --------------------------
def clean_text(text):
    """Lowercase a note and reduce it to letters, digits and single spaces.

    Args:
        text: Value to normalise; it is converted with ``str`` first.

    Returns:
        The lowercased text with every character outside ``a-z``, ``0-9`` and
        whitespace replaced by a space, whitespace runs collapsed to one
        space, and leading/trailing whitespace removed.
    """
    normalized = str(text).lower()
    # First pass blanks out punctuation/symbols, second pass squeezes whitespace.
    for pattern in (r'[^a-z0-9\s]', r'\s+'):
        normalized = re.sub(pattern, ' ', normalized)
    return normalized.strip()

# Store the normalised text in its own column; "merged_text" keeps the raw notes.
df["clean_text"] = df["merged_text"].map(clean_text)

df.head()

In [None]:
# --------------------------
# Vectorize & cluster
# --------------------------
# TF-IDF over single words and 2-word phrases, with English stop words removed.
# min_df=3 / max_df=0.9 bound how rare / how common a term may be to be kept.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, stop_words="english")
X = vectorizer.fit_transform(df["clean_text"])

# Choose cluster count (tweak as needed)
n_clusters = 8
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit gives different clusterings (and a FutureWarning on
# some versions) depending on the installed release.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Cluster sizes, ordered by cluster id
df['cluster'].value_counts().sort_index()

In [None]:
# --------------------------
# Top terms per cluster
# --------------------------
import numpy as np

def top_terms_for_cluster(cluster_id, top_n=15):
    """Return the terms with the highest mean TF-IDF weight in one cluster.

    Reads the notebook-level ``df`` (its ``cluster`` column), the TF-IDF
    matrix ``X`` and the fitted ``vectorizer``.

    Args:
        cluster_id: Cluster label to summarise.
        top_n: Maximum number of terms to return.

    Returns:
        A list of ``(term, mean_tfidf)`` tuples in descending score order, or
        an empty list when no row carries ``cluster_id``.
    """
    # Select rows by *position*: X has no notion of the DataFrame's index
    # labels, so indexing it with df.index picks the wrong rows whenever df
    # does not have a default 0..n-1 index.
    row_positions = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_positions.size == 0:
        return []
    # np.asarray(...).ravel() flattens the column means whether the sparse
    # container returns an np.matrix or a plain ndarray.
    mean_tfidf = np.asarray(X[row_positions].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    top_idx = mean_tfidf.argsort()[::-1][:top_n]
    return [(terms[i], mean_tfidf[i]) for i in top_idx]

# One header line per cluster, followed by its top terms and their mean scores.
for c in range(n_clusters):
    cluster_size = int((df['cluster'] == c).sum())
    print(f"\nCluster {c} (size {cluster_size}):")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:20s} {score:.4f}")

In [None]:
# --------------------------
# Add evidence column: matched top terms from each note's cluster
# --------------------------
def find_matches_in_text(text, terms):
    """Return the entries of ``terms`` that occur in ``text`` as whole words.

    Each term is matched literally (regex metacharacters are escaped) and must
    be delimited by word boundaries on both sides.

    Args:
        text: String to search.
        terms: Iterable of candidate terms.

    Returns:
        The matching terms as a list, in the order they appear in ``terms``.
    """
    return [
        candidate
        for candidate in terms
        if re.search(r'\b' + re.escape(candidate) + r'\b', text) is not None
    ]

# Look-up table: cluster id -> that cluster's top terms (scores dropped)
cluster_top_terms = {
    cluster_id: [term for term, _score in top_terms_for_cluster(cluster_id)]
    for cluster_id in range(n_clusters)
}


def _matched_terms_for_row(row):
    """Join the top terms of the row's cluster that occur in its cleaned text."""
    candidates = cluster_top_terms[row["cluster"]]
    return "; ".join(find_matches_in_text(row["clean_text"], candidates))


# Evidence column: which of the cluster's top terms each note actually contains
df["matched_terms"] = df.apply(_matched_terms_for_row, axis=1)

df[["merged_text", "cluster", "matched_terms"]].head()

In [None]:
# --------------------------
# Print a few example notes from every cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    # First SAMPLES_PER_CLUSTER notes of the cluster, in DataFrame order
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].iloc[:SAMPLES_PER_CLUSTER]
    for note in sample_notes:
        print("-", note)

In [None]:
# --------------------------
# Interactive cluster tagging
# --------------------------
# The only labels the prompt offers; anything else is rejected and re-asked so
# that a typo or a blank entry cannot end up as a label in the saved file.
VALID_LABELS = ("finding", "nofinding", "cancelled", "unknown")

cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence for this cluster: its top terms and a few notes.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df[df['cluster'] == c]['merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    # Keep asking until the answer is one of VALID_LABELS.
    while True:
        label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ").strip().lower()
        if label in VALID_LABELS:
            break
        print(f"Unrecognised label {label!r}; expected one of: {', '.join(VALID_LABELS)}")
    cluster_labels[c] = label

# Assign labels back to df
df['pattern_label'] = df['cluster'].map(cluster_labels)

# Save labeled dataset
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")

In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Normalise one claim note down to lowercase words for TF-IDF.

    Steps, in order: lowercase; drop ID-like tokens; drop short standalone
    numbers; replace every remaining non-letter with a space; drop the custom
    stopword codes; collapse whitespace.

    Args:
        text: Raw note. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty note.

    Returns:
        A string containing only ``a-z`` words separated by single spaces.
    """
    # Missing values would otherwise be turned into the words "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove ID-like tokens: up to three letters followed by three or more
    # digits (e.g. n19878, abc123, or a bare 2021)
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)
    # Remove standalone 2-4 digit numbers (years, short counts)
    text = re.sub(r'\b\d{2,4}\b', ' ', text)
    # Remove special characters (and any digits still left)
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Remove known meaningless codes (expand this list as you see them).
    # This must run AFTER punctuation is stripped: otherwise "etc." or "(hrp)"
    # are not recognised as stopwords and leak through as "etc" / "hrp".
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}
    # split()/join also collapses whitespace runs and trims both ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# NOTE: this overwrites 'merged_text' with the *cleaned* text, so anything
# reading 'merged_text' after this point sees the normalised form.
df['merged_text'] = (
    df['communication_notes'].fillna('') + ' ' + df['free_flow_opt_note'].fillna('')
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,
    min_df=5,
    ngram_range=(1, 2)  # single words + 2-word phrases
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit gives different clusterings (and a FutureWarning on
# some versions) depending on the installed release.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return the terms with the highest mean TF-IDF weight in one cluster.

    Reads the notebook-level ``df`` (its ``cluster`` column), the TF-IDF
    matrix ``X`` and the fitted ``vectorizer``.

    Args:
        cluster_id: Cluster label to summarise.
        top_n: Maximum number of terms to return.
        min_score: Terms whose mean TF-IDF is below this value are skipped.

    Returns:
        A list of ``(term, mean_tfidf)`` tuples in descending score order, or
        an empty list when no row carries ``cluster_id``.
    """
    # Select rows by *position*: X has no notion of the DataFrame's index
    # labels, so indexing it with df.index picks the wrong rows whenever df
    # does not have a default 0..n-1 index.
    row_positions = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_positions.size == 0:
        return []
    # np.asarray(...).ravel() flattens the column means whether the sparse
    # container returns an np.matrix or a plain ndarray.
    mean_tfidf = np.asarray(X[row_positions].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    ranked = mean_tfidf.argsort()[::-1]
    # Scores are in descending order, so the first top_n candidates are the
    # only ones that can qualify; slicing also makes top_n=0 return nothing.
    return [
        (terms[i], mean_tfidf[i])
        for i in ranked[:top_n]
        if mean_tfidf[i] >= min_score  # skip very low-weight terms
    ]

# --------------------------
# Print each cluster's size and top terms
# --------------------------
for c in range(n_clusters):
    cluster_size = int((df['cluster'] == c).sum())
    # Blank line, a 50-character rule, then the cluster header
    print()
    print('=' * 50)
    print(f"Cluster {c} (size {cluster_size})")
    for term, score in top_terms_for_cluster(c):
        print(f"{term:25s} {score:.4f}")


In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# --------------------------
# Cleaning function
# --------------------------
def clean_text(text):
    """Normalise one claim note down to lowercase words for TF-IDF.

    Steps, in order: lowercase; drop ID-like tokens; drop short standalone
    numbers; replace every remaining non-letter with a space; drop the custom
    stopword codes; collapse whitespace.

    Args:
        text: Raw note. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty note.

    Returns:
        A string containing only ``a-z`` words separated by single spaces.
    """
    # Missing values would otherwise be turned into the words "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\b[a-z]{0,3}\d{3,}\b', ' ', text)   # IDs: <=3 letters + >=3 digits
    text = re.sub(r'\b\d{2,4}\b', ' ', text)            # years/numbers
    text = re.sub(r'[^a-z\s]', ' ', text)               # non-letters
    # The stopword filter must run AFTER punctuation is stripped: otherwise
    # "etc." or "(hrp)" are not recognised and leak through as "etc" / "hrp".
    custom_stopwords = {'drg', 'hrp', 'etc', 'na'}      # expand this
    # split()/join also collapses whitespace runs and trims both ends.
    return ' '.join(w for w in text.split() if w not in custom_stopwords)

# --------------------------
# Prepare data
# --------------------------
# NOTE: this overwrites 'merged_text' with the *cleaned* text, so anything
# reading 'merged_text' after this point sees the normalised form.
df['merged_text'] = (
    df['communication_notes'].fillna('') + ' ' + df['free_flow_opt_note'].fillna('')
).apply(clean_text)

# --------------------------
# TF-IDF with bigrams
# --------------------------
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,
    min_df=5,
    ngram_range=(1, 2)  # single words + 2-word phrases
)
X = vectorizer.fit_transform(df['merged_text'])

# --------------------------
# Cluster
# --------------------------
n_clusters = 8
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit gives different clusterings (and a FutureWarning on
# some versions) depending on the installed release.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# --------------------------
# Top terms per cluster
# --------------------------
def top_terms_for_cluster(cluster_id, top_n=15, min_score=0.01):
    """Return the terms with the highest mean TF-IDF weight in one cluster.

    Reads the notebook-level ``df`` (its ``cluster`` column), the TF-IDF
    matrix ``X`` and the fitted ``vectorizer``.

    Args:
        cluster_id: Cluster label to summarise.
        top_n: Maximum number of terms to return.
        min_score: Terms whose mean TF-IDF is below this value are skipped.

    Returns:
        A list of ``(term, mean_tfidf)`` tuples in descending score order, or
        an empty list when no row carries ``cluster_id``.
    """
    # Select rows by *position*: X has no notion of the DataFrame's index
    # labels, so indexing it with df.index picks the wrong rows whenever df
    # does not have a default 0..n-1 index.
    row_positions = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    if row_positions.size == 0:
        return []
    # np.asarray(...).ravel() flattens the column means whether the sparse
    # container returns an np.matrix or a plain ndarray.
    mean_tfidf = np.asarray(X[row_positions].mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()
    ranked = mean_tfidf.argsort()[::-1]
    # Scores are in descending order, so the first top_n candidates are the
    # only ones that can qualify; slicing also makes top_n=0 return nothing.
    return [
        (terms[i], mean_tfidf[i])
        for i in ranked[:top_n]
        if mean_tfidf[i] >= min_score
    ]

# --------------------------
# Add evidence column
# --------------------------
def find_matches_in_text(text, terms):
    """Filter ``terms`` down to those present in ``text`` as whole words.

    Args:
        text: String to search.
        terms: Iterable of candidate terms; each is matched literally.

    Returns:
        A list of the matching terms, preserving their order in ``terms``.
    """
    def _occurs(term):
        # Escape the term so it is matched verbatim, bounded by \b on each side.
        return bool(re.search(r'\b' + re.escape(term) + r'\b', text))

    return list(filter(_occurs, terms))

# Look-up table: cluster id -> that cluster's top terms (scores dropped)
cluster_top_terms = {
    cluster_id: [term for term, _score in top_terms_for_cluster(cluster_id)]
    for cluster_id in range(n_clusters)
}


def _matched_terms_for_row(row):
    """Join the top terms of the row's cluster that occur in its 'merged_text'."""
    candidates = cluster_top_terms[row["cluster"]]
    return "; ".join(find_matches_in_text(row["merged_text"], candidates))


df["matched_terms"] = df.apply(_matched_terms_for_row, axis=1)

# --------------------------
# Print a few example notes from every cluster
# --------------------------
SAMPLES_PER_CLUSTER = 5
for c in range(n_clusters):
    print(f"\n=== Cluster {c} ===")
    # First SAMPLES_PER_CLUSTER notes of the cluster, in DataFrame order
    sample_notes = df.loc[df['cluster'] == c, 'merged_text'].iloc[:SAMPLES_PER_CLUSTER]
    for note in sample_notes:
        print("-", note)

# --------------------------
# Interactive cluster tagging
# --------------------------
# The only labels the prompt offers; anything else is rejected and re-asked so
# that a typo or a blank entry cannot end up as a label in the saved file.
VALID_LABELS = ("finding", "nofinding", "cancelled", "unknown")

cluster_labels = {}
for c in range(n_clusters):
    # Show the evidence for this cluster: its top terms and a few notes.
    print(f"\nCluster {c} top terms:")
    for term, score in top_terms_for_cluster(c):
        print(f"  {term:20s} {score:.4f}")
    sample_notes = df[df['cluster'] == c]['merged_text'].head(3)
    print("\nSample notes:")
    for note in sample_notes:
        print("-", note)
    # Keep asking until the answer is one of VALID_LABELS.
    while True:
        label = input("Enter label for this cluster (finding/nofinding/cancelled/unknown): ").strip().lower()
        if label in VALID_LABELS:
            break
        print(f"Unrecognised label {label!r}; expected one of: {', '.join(VALID_LABELS)}")
    cluster_labels[c] = label

# Map every row's cluster id to the label chosen for that cluster, then save.
df['pattern_label'] = df['cluster'].map(cluster_labels)
df.to_csv('claims_cluster_tagged.csv', index=False)
print("\nSaved tagged data to claims_cluster_tagged.csv")
