In [None]:
# This notebook serves as an EDA notebook and it documents our thought process for model selection

In [None]:
import pandas as pd
from collections import Counter

In [None]:


# file_path = '../data/full_data.csv'
# chunk_size = 50000
# abbrev_counter = Counter()
# total_processed = 0

# for chunk in pd.read_csv(file_path, chunksize=chunk_size):
#     for _, row in chunk.iterrows():
#         text_words = row['TEXT'].split()
        
#         # Handle multiple locations/labels separated by |
#         locations = [int(x) for x in str(row['LOCATION']).split('|')]
#         labels = str(row['LABEL']).split('|')
        
#         for loc, label in zip(locations, labels):
#             if loc < len(text_words):
#                 abbrev = text_words[loc].upper()
#                 abbrev_counter[(abbrev, label.lower())] += 1
        
#         total_processed += 1
#         if total_processed % 10000 == 0:
#             print(f"Processed {total_processed}")

# print(f"\nTotal processed: {total_processed}")

# target_abbrevs = ['PT', 'CA', 'RA']

# for abbrev in target_abbrevs:
#     abbrev_pairs = [(a, l) for (a, l) in abbrev_counter.keys() if a == abbrev]
#     print(f"\n{abbrev}:")
#     for pair in sorted(abbrev_pairs, key=lambda x: abbrev_counter[x], reverse=True)[:10]:
#         print(f"  {pair[1]}: {abbrev_counter[pair]}")
    
#     total = sum(abbrev_counter[p] for p in abbrev_pairs)
#     unique_senses = len(abbrev_pairs)
#     print(f"  Total: {total}, Unique senses: {unique_senses}")

In [None]:
# Filter to multi-word labels where first letters match abbreviation
valid_counter = Counter()

for (abbrev, label), count in abbrev_counter.items():
    if ' ' in label:
        words = label.split()
        if len(words) >= 2:
            # Check if first letters match abbreviation
            initials = ''.join([w[0].upper() for w in words[:len(abbrev)]])
            if initials == abbrev:
                valid_counter[(abbrev, label)] = count

# Get abbreviations with at least 3 valid meanings
abbrev_meanings = {}
for abbrev in set(a for (a, l) in valid_counter.keys()):
    meanings = [(a, l) for (a, l) in valid_counter.keys() if a == abbrev]
    if len(meanings) >= 3:
        total = sum(valid_counter[p] for p in meanings)
        abbrev_meanings[abbrev] = total

# Top 10 abbreviations
top_abbrevs = sorted(abbrev_meanings.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top abbreviations (filtered, >=3 meanings):\n")
for abbrev, total in top_abbrevs:
    pairs = [(a, l) for (a, l) in valid_counter.keys() if a == abbrev]
    top_3 = sorted(pairs, key=lambda x: valid_counter[x], reverse=True)[:3]
    
    print(f"{abbrev} (total: {total:,}, meanings: {len(pairs)}):")
    for pair in top_3:
        print(f"  {pair[1]}: {valid_counter[pair]:,}")
    print()

In [None]:
#SA meant South AFrica, lets look for 4th meaning 

In [None]:
sa_pairs = [(a, l) for (a, l) in valid_counter.keys() if a == 'SA']
top_4_sa = sorted(sa_pairs, key=lambda x: valid_counter[x], reverse=True)[:4]
for pair in top_4_sa:
    print(f"{pair[1]}: {valid_counter[pair]:,}")

Based on this we pick CC, CP, and SA based on three criteria:
- 1. Sufficient data volume (50K+ examples) for robust model training
- 2. Class balance - top sense represents <30% to avoid severe imbalance
- 3. Distinct medical meanings where first letters match abbreviation (true acronyms)

Each abbreviation will use only the top 3 most frequent meanings for a focused 3-class classification task

In [5]:
selected = {
    'CC': ['colorectal cancer', 'cell culture', 'cervical cancer'],
    'CP': ['chronic pain', 'chest pain', 'cerebral palsy'],
    'SA': ['surface area', 'sleep apnea', 'substance abuse']
}

# Filter and create new dataset
filtered_rows = []
stats = {abbrev: {meaning: 0 for meaning in meanings} for abbrev, meanings in selected.items()}

for chunk in pd.read_csv(file_path, chunksize=50000):
    for _, row in chunk.iterrows():
        text_words = row['TEXT'].split()
        locations = [int(x) for x in str(row['LOCATION']).split('|')]
        labels = str(row['LABEL']).split('|')
        
        for loc, label in zip(locations, labels):
            if loc < len(text_words):
                abbrev = text_words[loc].upper()
                label_clean = label.lower()
                
                if abbrev in selected and label_clean in selected[abbrev]:
                    filtered_rows.append({
                        'abbreviation': abbrev,
                        'text': row['TEXT'],
                        'location': loc,
                        'label': label_clean
                    })
                    stats[abbrev][label_clean] += 1
    
    if len(filtered_rows) % 50000 == 0:
        print(f"Collected {len(filtered_rows)} examples")

filtered_df = pd.DataFrame(filtered_rows)
filtered_df.to_csv('../data/filtered_dataset.csv', index=False)

print(f"\nTotal examples collected: {len(filtered_rows)}")
print("\nClass distribution:")
for abbrev, meanings in stats.items():
    total = sum(meanings.values())
    print(f"\n{abbrev} (total: {total}):")
    for meaning, count in sorted(meanings.items(), key=lambda x: x[1], reverse=True):
        print(f"  {meaning}: {count} ({100*count/total:.1f}%)")

NameError: name 'file_path' is not defined

In [None]:
# Synthetic Data Generation

In [None]:
import random
import pandas as pd

keywords = {
    'CC': {
        'colorectal cancer': ['colon', 'rectal', 'tumor', 'polyp', 'screening', 'bowel', 'adenocarcinoma'],
        'cell culture': ['medium', 'serum', 'flask', 'incubation', 'confluent', 'passage', 'cells'],
        'cervical cancer': ['HPV', 'screening', 'women', 'pap', 'uterine', 'cervix', 'gynecologic']
    },
    'CP': {
        'chronic pain': ['persistent', 'management', 'opioid', 'fibromyalgia', 'neuropathic', 'syndrome'],
        'chest pain': ['cardiac', 'angina', 'myocardial', 'thoracic', 'ECG', 'infarction'],
        'cerebral palsy': ['motor', 'developmental', 'spastic', 'children', 'disability', 'pediatric']
    },
    'SA': {
        'surface area': ['volume', 'ratio', 'measurement', 'calculated', 'cm2', 'size'],
        'sleep apnea': ['obstructive', 'CPAP', 'snoring', 'breathing', 'apneic', 'episodes'],
        'substance abuse': ['addiction', 'drugs', 'alcohol', 'treatment', 'dependence', 'rehabilitation']
    }
}

templates = [
    "Patient with {abbrev} showing {kw1} and {kw2} findings",
    "The {abbrev} diagnosis revealed {kw1} with {kw2} present",
    "Treatment for {abbrev} included {kw1} and {kw2} interventions",
    "Study examined {abbrev} patients with {kw1} and {kw2}",
    "{abbrev} assessment showed {kw1} and {kw2} indicators",
    "Research on {abbrev} identified {kw1} and {kw2} patterns",
    "Clinical presentation of {abbrev} included {kw1} and {kw2}",
    "Analysis of {abbrev} demonstrated {kw1} with {kw2}",
    "The {abbrev} case exhibited {kw1} and {kw2} characteristics",
    "Evaluation of {abbrev} detected {kw1} and {kw2} markers",
    "Investigation into {abbrev} found {kw1} with {kw2} evidence",
    "Medical report documented {abbrev} with {kw1} and {kw2}",
    "Screening for {abbrev} revealed {kw1} and {kw2} signs",
    "Diagnosis of {abbrev} confirmed {kw1} and {kw2} features",
    "Monitoring {abbrev} showed {kw1} and {kw2} progression"
]

synthetic_data = []
examples_per_meaning = 300
seen_texts = set()

for abbrev, meanings in keywords.items():
    for meaning, kw_list in meanings.items():
        generated = 0
        attempts = 0
        max_attempts = examples_per_meaning * 10
        
        while generated < examples_per_meaning and attempts < max_attempts:
            template = random.choice(templates)
            kw1 = random.choice(kw_list)
            kw2 = random.choice(kw_list)
            
            text = template.format(abbrev=abbrev, kw1=kw1, kw2=kw2)
            
            if text not in seen_texts:
                seen_texts.add(text)
                words = text.split()
                location = words.index(abbrev)
                
                synthetic_data.append({
                    'abbreviation': abbrev,
                    'text': text,
                    'location': location,
                    'label': meaning
                })
                generated += 1
            
            attempts += 1

synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df.to_csv('../data/synthetic_dataset.csv', index=False)

print(f"Generated {len(synthetic_data)} unique synthetic examples")
print("\nDistribution:")
for abbrev in keywords.keys():
    abbrev_data = synthetic_df[synthetic_df['abbreviation'] == abbrev]
    print(f"\n{abbrev}: {len(abbrev_data)} total")
    for meaning in keywords[abbrev].keys():
        count = len(abbrev_data[abbrev_data['label'] == meaning])
        print(f"  {meaning}: {count}")

In [None]:
# Approach 1: Naive Bayes

In [None]:
# Feature Engineering

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load synthetic data
synthetic_df = pd.read_csv('../data/synthetic_dataset.csv')

# Extract context
def extract_context(row, window_size=5):
    text_words = row['text'].lower().split()
    loc = int(row['location'])
    
    start = max(0, loc - window_size)
    end = min(len(text_words), loc + window_size + 1)
    
    context = text_words[start:loc] + text_words[loc+1:end]
    return context

# Generate n-grams
def get_ngrams(words, n):
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = ' '.join(words[i:i+n])
        ngrams.append(ngram)
    return ngrams

# Extract all contexts with n-grams
contexts = []
labels = []

for _, row in synthetic_df.iterrows():
    context_words = extract_context(row)
    
    # Generate unigrams, bigrams, trigrams
    ngrams = context_words.copy()
    ngrams.extend(get_ngrams(context_words, 2))
    ngrams.extend(get_ngrams(context_words, 3))
    
    contexts.append(ngrams)
    labels.append(row['label'])

print(f"Extracted {len(contexts)} contexts")

Extracted 2700 contexts


In [2]:
# Build vocabulary
vocabulary = {}
vocab_idx = 0

for context_ngrams in contexts:
    for ngram in context_ngrams:
        if ngram not in vocabulary:
            vocabulary[ngram] = vocab_idx
            vocab_idx += 1

print(f"Vocabulary size: {len(vocabulary)}")

# Create feature matrix
def vectorize(context_ngrams, vocab):
    vector = np.zeros(len(vocab))
    for ngram in context_ngrams:
        if ngram in vocab:
            vector[vocab[ngram]] += 1
    return vector

X = np.array([vectorize(context, vocabulary) for context in contexts])
y = np.array(labels)

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Train/test split WITH index tracking
indices = np.arange(len(X))

X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, test_size=0.3, random_state=42, stratify=y
)

print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

# Retrain models
baseline = MostFrequentBaseline()
baseline.fit(y_train)
baseline_pred = baseline.predict(X_test)

nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

print("Baseline accuracy:", (baseline_pred == y_test).mean())
print("Naive Bayes accuracy:", (nb_pred == y_test).mean())

Vocabulary size: 3978
Feature matrix shape: (2700, 3978)
Labels shape: (2700,)
Train: 1890, Test: 810


NameError: name 'MostFrequentBaseline' is not defined

In [None]:
# Baseline: Most-frequent-sense classifier
from collections import Counter

class MostFrequentBaseline:
    def fit(self, y_train):
        self.most_frequent = Counter(y_train).most_common(1)[0][0]
    
    def predict(self, X_test):
        return np.array([self.most_frequent] * len(X_test))

# Multinomial Naive Bayes
class MultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace smoothing
        
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        self.class_counts = {}
        self.feature_counts = {}
        self.class_probs = {}
        
        n_samples = len(y_train)
        
        for c in self.classes:
            # Get samples for this class
            X_c = X_train[y_train == c]
            
            # P(class)
            self.class_counts[c] = len(X_c)
            self.class_probs[c] = self.class_counts[c] / n_samples
            
            # Sum of feature counts for this class
            feature_sum = X_c.sum(axis=0) + self.alpha
            total_count = feature_sum.sum()
            
            # P(feature|class)
            self.feature_counts[c] = np.log(feature_sum / total_count)
    
    def predict(self, X_test):
        predictions = []
        
        for x in X_test:
            class_scores = {}
            
            for c in self.classes:
                # Log probability: log P(class) + sum(log P(feature|class))
                score = np.log(self.class_probs[c])
                score += np.sum(x * self.feature_counts[c])
                class_scores[c] = score
            
            predictions.append(max(class_scores, key=class_scores.get))
        
        return np.array(predictions)

# Train baseline
baseline = MostFrequentBaseline()
baseline.fit(y_train)
baseline_pred = baseline.predict(X_test)

# Train Naive Bayes
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

print("Baseline accuracy:", (baseline_pred == y_test).mean())
print("Naive Bayes accuracy:", (nb_pred == y_test).mean())

In [None]:
# Per-class metrics
def calculate_metrics(y_true, y_pred, classes):
    metrics = {}
    
    for c in classes:
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))
        fn = np.sum((y_pred != c) & (y_true == c))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        metrics[c] = {'precision': precision, 'recall': recall, 'f1': f1}
    
    return metrics

# Confusion matrix
def confusion_matrix(y_true, y_pred, classes):
    n = len(classes)
    cm = np.zeros((n, n), dtype=int)
    class_to_idx = {c: i for i, c in enumerate(classes)}
    
    for true, pred in zip(y_true, y_pred):
        cm[class_to_idx[true]][class_to_idx[pred]] += 1
    
    return cm, classes

classes = np.unique(y_train)
metrics = calculate_metrics(y_test, nb_pred, classes)

print("Per-class metrics (Naive Bayes):\n")
for c in classes:
    m = metrics[c]
    print(f"{c}:")
    print(f"  Precision: {m['precision']:.4f}")
    print(f"  Recall: {m['recall']:.4f}")
    print(f"  F1: {m['f1']:.4f}\n")

cm, class_labels = confusion_matrix(y_test, nb_pred, classes)
print("Confusion Matrix:")
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
cm_df

In [None]:
# Qualitative analysis with correct indices
success_mask = nb_pred == y_test
failure_mask = nb_pred != y_test

success_idx = idx_test[success_mask]
failure_idx = idx_test[failure_mask]

print(f"Successes: {len(success_idx)}, Failures: {len(failure_idx)}")

# Sample 15 successes
sample_success = np.random.choice(success_idx, size=min(15, len(success_idx)), replace=False)

print("\n=== SUCCESS EXAMPLES (15) ===\n")
for i, idx in enumerate(sample_success, 1):
    row = synthetic_df.loc[idx]
    pred = nb_pred[np.where(idx_test == idx)[0][0]]
    print(f"{i}. [{row['abbreviation']}] Label: {row['label']}")
    print(f"   Text: {row['text']}\n")

# All failures
print(f"\n=== FAILURE EXAMPLES ({len(failure_idx)}) ===\n")
for i, idx in enumerate(failure_idx, 1):
    row = synthetic_df.loc[idx]
    pred = nb_pred[np.where(idx_test == idx)[0][0]]
    print(f"{i}. [{row['abbreviation']}] True: {row['label']} | Pred: {pred}")
    print(f"   Text: {row['text']}\n")

Why it failed:
Text: "Treatment for CC included screening and screening interventions"

Both keywords are "screening"
"screening" appears in BOTH colorectal cancer AND cervical cancer keyword lists
No discriminative power when same ambiguous keyword used twice

Explanation:
Naive Bayes relies on discriminative keywords. When the synthetic generator randomly picked "screening" twice (a keyword shared between classes), the model couldn't distinguish. It predicted cervical cancer likely due to slightly higher prior probability or other feature weights.
This demonstrates:
The model works when keywords are distinctive but fails when ambiguous/shared keywords dominate the context - exactly validating Naive Bayes assumptions about word independence and clear class separation.

In [3]:
import pandas as pd
#Step 3b: Real MeDAL Data
# Load filtered dataset
real_df = pd.read_csv('../data/filtered_dataset.csv')

print(f"Total examples: {len(real_df)}")
print("\nClass distribution:")
real_df['label'].value_counts()

# Extract contexts (same pipeline as synthetic)
real_contexts = []
real_labels = []

for _, row in real_df.iterrows():
    context_words = extract_context(row)
    
    # Generate n-grams
    ngrams = context_words.copy()
    ngrams.extend(get_ngrams(context_words, 2))
    ngrams.extend(get_ngrams(context_words, 3))
    
    real_contexts.append(ngrams)
    real_labels.append(row['label'])

print(f"\nExtracted {len(real_contexts)} contexts")

Total examples: 113371

Class distribution:

Extracted 113371 contexts


In [4]:
# Build vocabulary with frequency filtering
from collections import Counter

ngram_counts = Counter()
for context_ngrams in real_contexts:
    for ngram in context_ngrams:
        ngram_counts[ngram] += 1

# Keep only n-grams that appear at least 3 times
min_frequency = 3
real_vocabulary = {}
vocab_idx = 0

for ngram, count in ngram_counts.items():
    if count >= min_frequency:
        real_vocabulary[ngram] = vocab_idx
        vocab_idx += 1

print(f"Filtered vocabulary size: {len(real_vocabulary)} (from {len(ngram_counts)})")

# Now create feature matrix
X_real = np.array([vectorize(context, real_vocabulary) for context in real_contexts])
y_real = np.array(real_labels)

print(f"Feature matrix shape: {X_real.shape}")

Filtered vocabulary size: 91205 (from 943627)
Feature matrix shape: (113371, 91205)


In [5]:
# Train/test split
X_train_real, X_test_real, y_train_real, y_test_real, idx_train_real, idx_test_real = train_test_split(
    X_real, y_real, np.arange(len(X_real)), test_size=0.3, random_state=42, stratify=y_real
)

print(f"Train: {X_train_real.shape[0]}, Test: {X_test_real.shape[0]}")
print(f"\nTrain label distribution:")
pd.Series(y_train_real).value_counts()

# Train models
baseline_real = MostFrequentBaseline()
baseline_real.fit(y_train_real)
baseline_pred_real = baseline_real.predict(X_test_real)

nb_real = MultinomialNB(alpha=1.0)
nb_real.fit(X_train_real, y_train_real)
nb_pred_real = nb_real.predict(X_test_real)

print("\n=== REAL DATA RESULTS ===")
print("Baseline accuracy:", (baseline_pred_real == y_test_real).mean())
print("Naive Bayes accuracy:", (nb_pred_real == y_test_real).mean())

Train: 79359, Test: 34012

Train label distribution:


NameError: name 'MostFrequentBaseline' is not defined

In [None]:
# Per-class metrics
classes_real = np.unique(y_train_real)
metrics_real = calculate_metrics(y_test_real, nb_pred_real, classes_real)

print("Per-class metrics (Real Data):\n")
for c in classes_real:
    m = metrics_real[c]
    print(f"{c}:")
    print(f"  Precision: {m['precision']:.4f}")
    print(f"  Recall: {m['recall']:.4f}")
    print(f"  F1: {m['f1']:.4f}\n")

# Confusion matrix
cm_real, class_labels_real = confusion_matrix(y_test_real, nb_pred_real, classes_real)
print("Confusion Matrix:")
cm_df_real = pd.DataFrame(cm_real, index=class_labels_real, columns=class_labels_real)
cm_df_real

In [14]:
# Successes and failures
success_mask_real = nb_pred_real == y_test_real
failure_mask_real = nb_pred_real != y_test_real

success_idx_real = idx_test_real[success_mask_real]
failure_idx_real = idx_test_real[failure_mask_real]

print(f"Successes: {len(success_idx_real)}, Failures: {len(failure_idx_real)}")

# Sample 15 successes and 15 failures
sample_success_real = np.random.choice(success_idx_real, size=15, replace=False)
sample_failure_real = np.random.choice(failure_idx_real, size=15, replace=False)

print("\n=== SUCCESS EXAMPLES (15) ===\n")
for i, idx in enumerate(sample_success_real, 1):
    row = real_df.loc[idx]
    context = extract_context(row)
    context_str = ' '.join(context[:20])
    print(f"{i}. [{row['abbreviation']}] Label: {row['label']}")
    print(f"   Context: ...{context_str}...")
    print(f"   Why: Clear discriminative keywords present\n")

print(f"\n=== FAILURE EXAMPLES (15) ===\n")
for i, idx in enumerate(sample_failure_real, 1):
    row = real_df.loc[idx]
    pred = nb_pred_real[np.where(idx_test_real == idx)[0][0]]
    context = extract_context(row)
    context_str = ' '.join(context[:20])
    print(f"{i}. [{row['abbreviation']}] True: {row['label']} | Pred: {pred}")
    print(f"   Context: ...{context_str}...")
    print(f"   Why: Ambiguous/overlapping terminology\n")

Successes: 26802, Failures: 7210

=== SUCCESS EXAMPLES (15) ===

1. [CC] Label: colorectal cancer
   Context: ...xenografted with the human sw crc cell l1 and from...
   Why: Clear discriminative keywords present

2. [CC] Label: cell culture
   Context: ...building on earlier research insect began with the successful establishment...
   Why: Clear discriminative keywords present

3. [CC] Label: cervical cancer
   Context: ...apoptosis in hela and siha cell lines expressing elevated c2...
   Why: Clear discriminative keywords present

4. [CC] Label: cell culture
   Context: ...and cell apoptosis in a mm of neuronal stretch injury...
   Why: Clear discriminative keywords present

5. [SA] Label: sleep apnea
   Context: ...continuous positive airway pressurecompliant obstructive called residual excessive sleepiness although...
   Why: Clear discriminative keywords present

6. [CC] Label: colorectal cancer
   Context: ...which includes the deleted in dcc gene has been linked...
   Why: Clea

In [21]:
# Result eval

In [None]:
# 1. Per-abbreviation performance
print("=== PER-ABBREVIATION ANALYSIS ===\n")
for abbrev in ['CC', 'CP', 'SA']:
    mask = real_df.loc[idx_test_real, 'abbreviation'] == abbrev
    abbrev_true = y_test_real[mask]
    abbrev_pred = nb_pred_real[mask]
    
    accuracy = (abbrev_true == abbrev_pred).mean()
    print(f"{abbrev}: {accuracy:.4f} accuracy")
    print(f"  Classes: {np.unique(abbrev_true)}")
    print()

# 2. Most confused class pairs
print("\n=== MOST CONFUSED PAIRS ===\n")
confusion_pairs = []
for i, true_class in enumerate(class_labels_real):
    for j, pred_class in enumerate(class_labels_real):
        if i != j and cm_real[i][j] > 0:
            confusion_pairs.append((true_class, pred_class, cm_real[i][j]))

confusion_pairs.sort(key=lambda x: x[2], reverse=True)
for true_c, pred_c, count in confusion_pairs[:10]:
    print(f"{true_c} â†’ {pred_c}: {count} times")

# 3. Feature importance - top words per class
print("\n=== TOP DISCRIMINATIVE FEATURES ===\n")
for c in classes_real:
    # Get log probabilities for this class
    feature_probs = nb_real.feature_counts[c]
    top_indices = np.argsort(feature_probs)[-10:][::-1]
    
    inv_vocab = {v: k for k, v in real_vocabulary.items()}
    top_features = [inv_vocab[idx] for idx in top_indices if idx in inv_vocab]
    
    print(f"{c}:")
    print(f"  {top_features[:10]}\n")

In [None]:
# TF-IDF

In [None]:
# def compute_tfidf(X_counts):
#     """
#     X_counts: raw count matrix (n_samples, n_features)
#     Returns: TF-IDF matrix
#     """
#     n_samples, n_features = X_counts.shape
    
#     # Term Frequency: normalize by document length
#     doc_lengths = X_counts.sum(axis=1, keepdims=True)
#     doc_lengths[doc_lengths == 0] = 1  # Avoid division by zero
#     tf = X_counts / doc_lengths
    
#     # Inverse Document Frequency
#     df = (X_counts > 0).sum(axis=0)  # Document frequency per term
#     idf = np.log((n_samples + 1) / (df + 1)) + 1  # Smooth IDF
    
#     # TF-IDF
#     tfidf = tf * idf
    
#     return tfidf

# # Apply TF-IDF to training data
# X_train_tfidf = compute_tfidf(X_train_real)
# X_test_tfidf = compute_tfidf(X_test_real)

# print(f"TF-IDF train shape: {X_train_tfidf.shape}")
# print(f"TF-IDF test shape: {X_test_tfidf.shape}")

# # Train Naive Bayes on TF-IDF features
# nb_tfidf = MultinomialNB(alpha=1.0)
# nb_tfidf.fit(X_train_tfidf, y_train_real)
# nb_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

# print("\nTF-IDF RESULTS")
# print("Raw counts accuracy:", (nb_pred_real == y_test_real).mean())
# print("TF-IDF accuracy:", (nb_pred_tfidf == y_test_real).mean())

In [None]:
# Compute TF-IDF in batches to avoid memory issues
def compute_tfidf_batched(X_counts, batch_size=5000):
    n_samples, n_features = X_counts.shape
    
    # Compute IDF on full dataset first (small operation)
    df = (X_counts > 0).sum(axis=0)
    idf = np.log((n_samples + 1) / (df + 1)) + 1
    
    # Process TF in batches
    tfidf = np.zeros_like(X_counts, dtype=np.float32)
    
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = X_counts[start_idx:end_idx]
        
        # TF for this batch
        doc_lengths = batch.sum(axis=1, keepdims=True)
        doc_lengths[doc_lengths == 0] = 1
        tf_batch = batch / doc_lengths
        
        # TF-IDF for this batch
        tfidf[start_idx:end_idx] = tf_batch * idf
        
        if start_idx % 20000 == 0:
            print(f"Processed {start_idx}/{n_samples}")
    
    return tfidf

# Apply
print("Computing TF-IDF for training data...")
X_train_tfidf = compute_tfidf_batched(X_train_real, batch_size=5000)

print("Computing TF-IDF for test data...")
X_test_tfidf = compute_tfidf_batched(X_test_real, batch_size=5000)

# Train
nb_tfidf = MultinomialNB(alpha=1.0)
nb_tfidf.fit(X_train_tfidf, y_train_real)
nb_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

print("\nRaw counts accuracy:", (nb_pred_real == y_test_real).mean())
print("TF-IDF accuracy:", (nb_pred_tfidf == y_test_real).mean())