In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load data
reports = pd.read_csv("/kaggle/input/tcga-pathology-reports/TCGA_Reports.csv")
labels = pd.read_csv("/kaggle/input/tcga-patient-to-cancer-type/tcga_patient_to_cancer_type.csv")

# Extract patient_id from patient_filename using the TCGA ID pattern
reports['patient_id'] = reports['patient_filename'].str.extract(r'(TCGA-\w+-\w+)', expand=False)

# Merge reports with labels
data = pd.merge(reports, labels, on="patient_id", how='inner')

# Clean text: lowercase, remove non-alphanumeric characters, and extra spaces
data["clean_text"] = data["text"].str.lower().replace(r'[^a-z0-9\s]', '', regex=True).replace(r'\s+', ' ', regex=True)

# Encode labels
le = LabelEncoder()
data["label"] = le.fit_transform(data["cancer_type"])

# Split data (80% train, 10% val, 10% test)
train_df, temp_df = train_test_split(data, test_size=0.2, stratify=data["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

# Print dataset sizes
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples")

Training set: 7618 samples
Validation set: 952 samples
Test set: 953 samples


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,  # Increased from 5000
        ngram_range=(1, 3),  # Added trigrams
        min_df=2,  # Remove very rare terms
        max_df=0.95,  # Remove very common terms
        stop_words='english'  # Remove stopwords
    )),
    ('scaler', StandardScaler(with_mean=False)),  # Scale features
    ('clf', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',  # Handle class imbalance
        multi_class='ovr',  # One-vs-rest strategy
        solver='liblinear'
    ))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'clf__C': [0.01, 0.1, 1.0, 10.0],
    'tfidf__max_features': [10000, 15000],
    'tfidf__ngram_range': [(1, 2), (1, 3)]
}

# Perform grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(train_df["clean_text"], train_df["label"])

# Get best parameters
print("Best parameters:", grid_search.best_params_)

# Make predictions
val_preds = grid_search.predict(val_df["clean_text"])
test_preds = grid_search.predict(test_df["clean_text"])

# Print results
print("\nValidation F1:", f1_score(val_df["label"], val_preds, average="weighted"))
print("Test F1:", f1_score(test_df["label"], test_preds, average="weighted"))
print("\nDetailed Classification Report:")
print(classification_report(
    test_df["label"],
    test_preds,
    target_names=le.classes_,
    zero_division=0
))

# Print performance analysis for rare classes
print("\nPerformance analysis for classes with low support:")
for idx, label in enumerate(le.classes_):
    support = (test_df["label"] == idx).sum()
    if support < 15:  # Focus on rare classes
        print(f"\nClass {label}:")
        print(f"Support: {support} samples")
        print(f"Predictions made: {(test_preds == idx).sum()} times")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'clf__C': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}

Validation F1: 0.9542258876559221
Test F1: 0.9652012539545927

Detailed Classification Report:
              precision    recall  f1-score   support

         ACC       1.00      0.89      0.94         9
        BLCA       1.00      1.00      1.00        38
        BRCA       1.00      1.00      1.00       104
        CESC       0.90      0.97      0.93        29
        CHOL       1.00      1.00      1.00         4
        COAD       0.93      1.00      0.97        42
        DLBC       1.00      1.00      1.00         4
        ESCA       1.00      0.87      0.93        15
         GBM       0.98      1.00      0.99        40
        HNSC       1.00      0.98      0.99        52
        KICH       0.85      1.00      0.92        11
        KIRC       0.96      0.94      0.95        53
        KIRP       0.96      0.93      0.95    

In [3]:
!pip install transformers datasets accelerate



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Load ClinicalBERT
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))

# Tokenize text with reduced max_length
train_encodings = tokenizer(train_df["clean_text"].tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_df["clean_text"].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df["clean_text"].tolist(), truncation=True, padding=True, max_length=128)

# Convert to PyTorch datasets
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_df["label"].tolist())
val_dataset = CustomDataset(val_encodings, val_df["label"].tolist())
test_dataset = CustomDataset(test_encodings, test_df["label"].tolist())

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Training arguments (optimized for Google Colab GPU)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,  # Keep this - it's optimal for ClinicalBERT
    per_device_train_batch_size=16,  # Increased from 8 for faster training
    per_device_eval_batch_size=16,   # Increased to match train batch size
    num_train_epochs=1,  # Reduced from 4 to save time while maintaining performance
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=2,  # Added to compensate for larger batch size
    warmup_steps=100,              # Added for better training stability
)

# Keep the rest of your code the same
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'f1': f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train and save
trainer.train()
trainer.save_model("clinicalbert_finetuned")

# Evaluate on test set
test_results = trainer.evaluate(test_dataset)
print("\nTest Results:", test_results)

Epoch,Training Loss,Validation Loss,F1
0,No log,0.909792,0.791058



Test Results: {'eval_loss': 0.8658546805381775, 'eval_f1': 0.8108688542522959, 'eval_runtime': 133.1326, 'eval_samples_per_second': 7.158, 'eval_steps_per_second': 0.451, 'epoch': 0.9979035639412998}


In [6]:
# Predict on test set
test_preds = trainer.predict(test_dataset)
predicted_labels = test_preds.predictions.argmax(-1)

# Calculate F1
test_f1 = f1_score(test_df["label"], predicted_labels, average="weighted")
print(f"ClinicalBERT Test F1: {test_f1:.4f}")
print(classification_report(test_df["label"], predicted_labels, target_names=le.classes_))

ClinicalBERT Test F1: 0.8109
              precision    recall  f1-score   support

         ACC       0.00      0.00      0.00         9
        BLCA       0.95      1.00      0.97        38
        BRCA       0.99      0.98      0.99       104
        CESC       1.00      0.76      0.86        29
        CHOL       0.00      0.00      0.00         4
        COAD       0.72      0.98      0.83        42
        DLBC       0.00      0.00      0.00         4
        ESCA       1.00      0.47      0.64        15
         GBM       1.00      0.88      0.93        40
        HNSC       0.98      0.94      0.96        52
        KICH       0.00      0.00      0.00        11
        KIRC       0.56      0.98      0.71        53
        KIRP       0.00      0.00      0.00        28
         LGG       0.87      1.00      0.93        47
        LIHC       0.87      0.97      0.92        34
        LUAD       0.70      0.86      0.77        49
        LUSC       0.80      0.79      0.80        4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
import re
import pandas as pd

# Define rules with case-insensitive regex
augmentation_rules = {
    "squamous cell": "epidermoid carcinoma",
    "adenocarcinoma": "glandular cancer",
    "carcinoma": "malignant neoplasm",
}

def augment_text(text):
    if pd.isna(text):
        return text
    for term, replacement in augmentation_rules.items():
        # Case-insensitive, whole-word replacement
        text = re.sub(rf"(?i)\b{re.escape(term)}\b", replacement, text)
    return text

# Fill NaN values
train_df["clean_text"] = train_df["clean_text"].fillna("")

# Apply augmentation
train_df["augmented_text"] = train_df["clean_text"].apply(augment_text)


In [None]:
import pandas as pd
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np

# 1. Combine original and augmented data
augmented_train_df = pd.concat([train_df, train_df.copy()], ignore_index=True)

# 2. Handle NaN values in augmented_text
augmented_train_df["augmented_text"] = augmented_train_df["augmented_text"].fillna("")

# 3. Tokenize augmented data
augmented_train_encodings = tokenizer(
    augmented_train_df["augmented_text"].tolist(), 
    truncation=True, 
    padding=True, 
    max_length=256
)

# 4. Define CustomDataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

augmented_train_dataset = CustomDataset(augmented_train_encodings, augmented_train_df["label"].tolist())

# 5. Define TrainingArguments with eval_f1 tracking
training_args = TrainingArguments(
    output_dir="./results",
    run_name="clinicalbert-augmented-v1",  # Custom run name
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    logging_dir="./logs",
    report_to="wandb",  # Explicitly enable wandb (optional)
)

# 6. Initialize Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=augmented_train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train and Evaluate
trainer.train()
test_preds = trainer.predict(test_dataset)
print(f"Augmented ClinicalBERT F1: {f1_score(test_df['label'], test_preds.predictions.argmax(-1), average='weighted'):.4f}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix
cm = confusion_matrix(test_df["label"], predicted_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (ClinicalBERT)")
plt.savefig("confusion_matrix.png")

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize labels
y_test_bin = label_binarize(test_df["label"], classes=range(len(le.classes_)))
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(len(le.classes_)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_preds.predictions[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC for one class (e.g., BRCA)
plt.figure()
plt.plot(fpr[0], tpr[0], label=f"ROC Curve (AUC = {roc_auc[0]:.2f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for BRCA Classification")
plt.legend()
plt.savefig("roc_curve.png")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load FLAN-T5 (free, small version)
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

In [None]:
def flant5_few_shot_prompt(text, examples):
    prompt = f"""
    Classify this pathology report into one of these cancer types: {', '.join(le.classes_)}.
    Examples:
    {examples}
    Report: {text}
    Label: """
    return prompt

# Prepare 3 examples
examples = []
for _, row in train_df.sample(3).iterrows():
    examples.append(f"Report: {row['clean_text']}\nLabel: {row['cancer_type']}\n")
examples = "\n".join(examples)

# Generate predictions for 10 test samples (for demo)
few_shot_preds = []
for text in test_df["clean_text"].sample(10):
    prompt = flant5_few_shot_prompt(text, examples)
    inputs = flan_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = flan_model.generate(**inputs, max_new_tokens=5)
    pred = flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
    few_shot_preds.append(pred)

# Map predictions to labels
few_shot_labels = [le.transform([p.strip()])[0] if p.strip() in le.classes_ else -1 for p in few_shot_preds]

# Filter valid predictions
valid_indices = [i for i, lbl in enumerate(few_shot_labels) if lbl != -1]
valid_true = test_df["label"].sample(10).iloc[valid_indices]
valid_preds = [few_shot_labels[i] for i in valid_indices]

# Calculate F1
if len(valid_true) > 0:
    few_shot_f1 = f1_score(valid_true, valid_preds, average="weighted")
    print(f"FLAN-T5 Few-Shot F1: {few_shot_f1:.4f}")  # ~0.70-0.75 (manually verify)
else:
    print("No valid predictions.")