In [1]:
import pandas as pd
import numpy as np
# Let long clinical notes show up to 200 characters per cell when displayed.
pd.set_option(
    'display.max_colwidth',
    200,
)

# Read the single train shard of the argilla/medical-domain dataset.
df = pd.read_parquet(
    "hf://datasets/argilla/medical-domain/data/train-00000-of-00001-67e4e7207342a623.parquet"
)

def extract_label(pred):
    """Return the ``"label"`` value of the first prediction entry.

    Args:
        pred: The raw value of a ``prediction`` cell. Only a non-empty
            ``list`` or ``numpy.ndarray`` whose first element is a ``dict``
            yields a label.

    Returns:
        The value stored under ``"label"`` in the first entry, or ``None``
        when ``pred`` has any other shape or the key is absent.
    """
    # Anything that is not a list/array carries no usable prediction.
    if not isinstance(pred, (list, np.ndarray)):
        return None
    if len(pred) == 0:
        return None

    first_entry = pred[0]
    if not isinstance(first_entry, dict):
        return None
    return first_entry.get("label")

# Derive the two flat feature columns from the nested source columns, then
# discard the source columns together with the ones the author marked as
# empty. `errors='ignore'` tolerates names that are absent from the frame.
df = df.assign(
    label=df['prediction'].apply(extract_label),
    text_length=df['metrics'].apply(
        lambda metrics: metrics.get('text_length') if isinstance(metrics, dict) else None
    ),
).drop(
    columns=[
        'inputs',
        'prediction',
        'prediction_agent',
        'annotation',
        'annotation_agent',
        'multi_label',
        'explanation',
        'metadata',
        'status',
        'event_timestamp',
        'metrics',
    ],
    errors='ignore',
)

In [2]:
#print(df.shape)
#df.head()

In [3]:
# Show the Python type stored in each column, using the first row as a sample.
# Slice to one row *before* the element-wise map so the lambda runs once per
# column instead of once per cell of the whole frame; the output is identical.
df.head(1).map(lambda value: type(value).__name__)

Unnamed: 0,text,id,label,text_length
0,str,str,str,int


In [4]:
#df['label'].value_counts()

In [5]:
import sys
sys.path.insert(0, '..')
from src.exploration import DatasetExplorer

# Wrap the cleaned frame in the project's explorer; `explorer` is reused by
# the later exploration cells.
explorer = DatasetExplorer(
    df,
    text_column='text',
    label_column='label',
)

# Print every statistic returned by the explorer, one per line.
basic_stats = explorer.compute_basic_stats()
print("Basic Statistics:")
for stat_name, stat_value in basic_stats.items():
    print(f"  {stat_name}: {stat_value}")

Basic Statistics:
  total_samples: 4966
  missing_values: 0
  unique_samples: 2357
  class_distribution: {' Surgery': 1088, ' Consult - History and Phy.': 516, ' Cardiovascular / Pulmonary': 371, ' Orthopedic': 355, ' Radiology': 273, ' General Medicine': 259, ' Gastroenterology': 224, ' Neurology': 223, ' SOAP / Chart / Progress Notes': 166, ' Urology': 156, ' Obstetrics / Gynecology': 155, ' Discharge Summary': 108, ' ENT - Otolaryngology': 96, ' Neurosurgery': 94, ' Hematology - Oncology': 90, ' Ophthalmology': 83, ' Nephrology': 81, ' Emergency Room Reports': 75, ' Pediatrics - Neonatal': 70, ' Pain Management': 61, ' Psychiatry / Psychology': 53, ' Office Notes': 50, ' Podiatry': 47, ' Dermatology': 29, ' Cosmetic / Plastic Surgery': 27, ' Dentistry': 27, ' Letters': 23, ' Physical Medicine - Rehab': 21, ' Sleep Medicine': 20, ' Endocrinology': 19, ' Bariatrics': 18, ' IME-QME-Work Comp etc.': 16, ' Chiropractic': 14, ' Rheumatology': 10, ' Diets and Nutritions': 10, ' Speech - La

**Note:**
- Some clinical notes might be repeated (e.g., same “standard template” used multiple times).
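- Highly imbalanced dataset: a few classes dominate (e.g., Surgery with 1088 samples), while many classes have fewer than 30 samples.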

In [6]:
# Summarise note lengths, first in characters and then in tokens.
length_stats = explorer.measure_text_length()
print("Text Length Statistics:")
for heading, stats_key in (
    ("\nCharacter Length:", 'char_length'),
    ("\nToken Length:", 'token_length'),
):
    print(heading)
    # Left-align the metric name, right-align the value with two decimals.
    for metric, value in length_stats[stats_key].items():
        print(f"  {metric:<10}: {value:>10.2f}")

Text Length Statistics:

Character Length:
  min       :      11.00
  max       :   18425.00
  avg       :    3052.31
  median    :    2667.00
  std       :    1993.88

Token Length:
  min       :       3.00
  max       :    3489.00
  avg       :     553.02
  median    :     485.00
  std       :     361.98


**Note:**
- most notes are around 500 words, but some are very long.
- std shows very uneven text lengths -> truncate or pad to a max sequence token length.

In [7]:
# Rank the 20 most common terms, both raw and after the explorer's filtering.
term_freq, filtered_term_freq = explorer.compute_term_frequency(top_n=20)
for heading, ranked_terms in (
    ("Top 20 Most Frequent Terms:", term_freq),
    ("\nTop 20 Most Frequent Filtered Terms:", filtered_term_freq),
):
    print(heading)
    # Each entry is a (term, count) pair.
    for term, count in ranked_terms:
        print(f"  {term:<10}: {count:>10}")

Top 20 Most Frequent Terms:
  ,         :     177985
  the       :     149782
  .         :     140132
  and       :      82612
  was       :      71766
  of        :      59010
  to        :      50164
  :         :      48649
  a         :      42608
  with      :      35803
  in        :      32758
  is        :      26378
  patient   :      24108
  no        :      17829
  she       :      17593
  for       :      17047
  he        :      15544
  were      :      15535
  on        :      14654
  this      :      13857

Top 20 Most Frequent Filtered Terms:
  patient   :      24108
  right     :      11162
  left      :      10853
  history   :       9376
  procedure :       7179
  placed    :       6962
  normal    :       6917
  well      :       5851
  pain      :       5360
  also      :       4325
  using     :       4121
  blood     :       3917
  time      :       3896
  mg        :       3883
  noted     :       3877
  performed :       3792
  skin      :       3752
  without

**Note:**
- Punctuation and common English stopwords are among the most frequent terms.
- ":" being so common suggests that the notes have many colon separated phrases.

In [8]:
# Report the detected language of every note and preview the non-English ones.
lang_dist, non_en = explorer.detect_languages()

print("Language Distribution:")
for lang, stats in lang_dist.items():
    count = stats['count']
    percentage = stats['percentage']
    print(f"  {lang}: {count:>5} ({percentage:>5}%)")

print("Non-English Samples:")
print(non_en.head(10))

Language Distribution:
  en:  4955 (99.78%)
  pt:     3 ( 0.06%)
  so:     2 ( 0.04%)
  de:     2 ( 0.04%)
  tl:     2 ( 0.04%)
  ro:     2 ( 0.04%)
Non-English Samples:
14                                                                   SUBJECTIVE:,
34                                  MANNER OF DEATH: , Homicide.,CAUSE OF DEATH:,
613                                                       REASON FOR EVALUATION:,
1243            XYZ, D.C.,60 Evergreen Place,Suite 902,East Orange, NJ  07018,Re:
1381                                                      REASON FOR EVALUATION:,
1638                                                                  OPERATION:,
1715                                                               INDICATION:  ,
2804    REASON FOR CONSULTATION: , Loculated left effusion, multilobar pneumonia.
3205                                                               INDICATION:  ,
3886                                                                 SUBJECTIVE:,
Name: text

**Note:**
- Most notes are in English.
- The non-English examples are either very short, have a template-like structure, or are medical headings (language-detection noise).

In [9]:
import sys

# Make the parent directory importable. Guard the insert so that running this
# cell after an earlier cell already added '..' (or re-running it) does not
# pile up duplicate entries in sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')
from src.exploration import TextClusterer

# Raw note texts, in frame order.
texts = df['text'].tolist()
# Integer category codes of the labels; passed to the clusterer as
# `true_labels` in the cells below.
labels_true = df['label'].astype('category').cat.codes

In [10]:
# Baseline run: five clusters, no stemming or lemmatization requested.
cl = TextClusterer(n_clusters=5)
labels, metrics = cl.fit(texts, true_labels=labels_true)

print("Clustering metrics:")
for metric_name, score in metrics.items():
    print(f"{metric_name:12}: {score:.4f}")

print("\nTop terms per cluster:")
cl.top_terms_per_cluster(n=8)

Clustering metrics:
silhouette  : 0.0064
v_measure   : 0.2252

Top terms per cluster:
Cluster 0: chief complaint, chief, complaint, history, mg, patient, reveals, normal
Cluster 1: artery, coronary, left, right, aortic, coronary artery, stenosis, valve
Cluster 2: patient, placed, procedure, right, incision, left, using, anesthesia
Cluster 3: patient, right, normal, left, pain, exam, unremarkable, evidence
Cluster 4: history, patient, mg, pain, normal, denies, past, daily


In [11]:
# Same five-cluster run, this time with stemming switched on.
cl_stem = TextClusterer(
    n_clusters=5,
    stemming=True,
)
labels_stem, metrics_stem = cl_stem.fit(texts, true_labels=labels_true)

# Raw metrics dict first, then the aligned four-decimal listing.
print(metrics_stem)
print("Clustering metrics with stemming:")
for metric_name, score in metrics_stem.items():
    print(f"{metric_name:12}: {score:.4f}")

print("\nTop terms per cluster:")
cl_stem.top_terms_per_cluster(n=8)



{'silhouette': 0.011199906926716552, 'v_measure': 0.23569455086644714}
Clustering metrics with stemming:
silhouette  : 0.0112
v_measure   : 0.2357

Top terms per cluster:
Cluster 0: histori, patient, wa, ha, mg, medic, pain, daili
Cluster 1: wa, left, right, arteri, normal, patient, coronari, procedur
Cluster 2: wa, patient, place, procedur, use, incis, oper, right
Cluster 3: normal, neg, mass, ear, intact, clear, bilater, reveal
Cluster 4: hi, ha, wa, patient, pain, histori, thi, ani


In [12]:
# Same five-cluster run, this time with lemmatization switched on.
cl_lemma = TextClusterer(
    n_clusters=5,
    lemmatization=True,
)
labels_lemma, metrics_lemma = cl_lemma.fit(texts, true_labels=labels_true)

# Raw metrics dict first, then the aligned four-decimal listing.
print(metrics_lemma)
print("Clustering metrics with lemmatization:")
for metric_name, score in metrics_lemma.items():
    print(f"{metric_name:12}: {score:.4f}")

print("\nTop terms per cluster:")
cl_lemma.top_terms_per_cluster(n=8)




{'silhouette': 0.005425755779072793, 'v_measure': 0.24227657560716775}
Clustering metrics with lemmatization:
silhouette  : 0.0054
v_measure   : 0.2423

Top terms per cluster:
Cluster 0: patient, history, mg, pain, day, deny, daily, time
Cluster 1: artery, coronary, coronary artery, right, aortic, catheter, valve, stenosis
Cluster 2: patient, place, right, procedure, suture, incision, remove, diagnosis
Cluster 3: normal, negative, history, patient, clear, bilaterally, pain, masse
Cluster 4: patient, place, right, tendon, screw, medial, lateral, bone


**Note:**
- low silhouette -> not well separated clusters in TF-IDF space (expected in medical text where documents overlap in content)
- low v-measure -> weak agreement between the clusters and the true class labels (expected, as clustering != classification)
- clustering still works: it groups texts by surface similarity, even though the overall numeric scores are low
- cluster keywords make semantic sense given the medical dataset
- stemming slightly increases cluster compactness (higher silhouette) but reduces readability due to word truncation
- lemmatization produces cleaner, semantically more consistent clusters (higher v-measure) and is best for interpretability