# 🏥 Document Classifier Training - French Medical Documents

Train **CamemBERT** to classify 7 types of medical documents

## ⚡ Quick Start
1. Enable GPU: Runtime → Change runtime type → T4 GPU
2. Upload training_data.csv (Files tab)
3. Run all cells
4. Download model


In [None]:
import torch

# Pick the compute device once; later cells move the model and every batch to it.
if torch.cuda.is_available():
    print(f"✅ GPU Available: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    print("⚠️ GPU NOT available. Enable: Runtime → Change runtime type → T4 GPU")
    device = "cpu"

print(f"Using device: {device}")

In [None]:
!pip install transformers datasets scikit-learn pandas numpy torch -q

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import json
import zipfile
import os

print("✅ All libraries imported!")

In [None]:
# Load training data
DATA_PATH = 'training_data.csv'
REQUIRED_COLUMNS = ('text', 'label')

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("Upload training_data.csv using Files tab")

df = pd.read_csv(DATA_PATH)

# Fail early with a readable message instead of a bare KeyError in a later cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"training_data.csv is missing required column(s): {missing_columns}. "
        f"Found columns: {list(df.columns)}"
    )

print(f"✅ Loaded {len(df)} samples")
print(f"\nClass distribution:\n{df['label'].value_counts()}")

In [None]:
# Define document types
DOCUMENT_TYPES = ['blood_test', 'xray', 'mri', 'prescription', 'medical_report', 'lab_result', 'consultation_note']

# Create label mappings (id2label is the exact inverse of label2id)
label2id = {label: idx for idx, label in enumerate(DOCUMENT_TYPES)}
id2label = {idx: label for label, idx in label2id.items()}

# Rows without text would otherwise be tokenized as the literal string "nan"
# (the dataset coerces every document with str()), so drop them visibly.
missing_text = df['text'].isna()
if missing_text.any():
    print(f"⚠️ Dropping {int(missing_text.sum())} row(s) with missing text")
    df = df.loc[~missing_text].reset_index(drop=True)

# Convert labels to IDs. Series.map() yields NaN for any label that is not a
# key of label2id, which would silently turn the ids into floats; reject such
# rows up front with a message that names the offending values.
label_ids = df['label'].map(label2id)
unmapped = label_ids.isna()
if unmapped.any():
    unknown_labels = sorted({str(value) for value in df.loc[unmapped, 'label'].unique()})
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a label outside DOCUMENT_TYPES: "
        f"{unknown_labels}. Expected one of: {DOCUMENT_TYPES}"
    )

df['label_id'] = label_ids.astype(int)
texts = df['text'].tolist()
labels = df['label_id'].tolist()

print(f"✅ Labels: {label2id}")

In [None]:
# Split data (80% train, 20% validation). The split is stratified on the label
# ids and uses a fixed random_state so reruns produce the same partition.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

print(f"✅ Train: {len(train_texts)}, Val: {len(val_texts)}")

In [None]:
# Tokenizer setup: MODEL_NAME is the pretrained checkpoint identifier that the
# tokenizer here, and the classifier in a later cell, are loaded from.
MODEL_NAME = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME
)

# Create dataset class
class DocumentDataset(Dataset):
    """Map-style dataset that tokenizes one document per lookup.

    Args:
        texts: Indexable collection of documents; each item is coerced with
            ``str`` before tokenization.
        labels: Indexable collection of integer class ids aligned with
            ``texts``.
        tokenizer: Callable invoked with the document and the keyword
            arguments shown in ``__getitem__``; must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (used together with ``padding='max_length'`` and truncation).
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return document ``idx`` as a dict of flattened tensors plus its label."""
        document = str(self.texts[idx])
        encoded = self.tokenizer(
            document,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack samples itself.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create datasets (both rely on DocumentDataset's default max_length)
train_dataset = DocumentDataset(train_texts, train_labels, tokenizer)
val_dataset = DocumentDataset(val_texts, val_labels, tokenizer)

print("✅ Datasets created")

In [None]:
# Configuration
BATCH_SIZE = 8
NUM_EPOCHS = 10

# Create dataloaders: the training set is reshuffled each epoch, the
# validation set keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"✅ DataLoaders ready: {len(train_loader)} train batches, {len(val_loader)} val batches")

In [None]:
# Load model: the pretrained checkpoint with a classification head sized to
# DOCUMENT_TYPES. Both dropout probabilities are overridden to 0.3.
model = CamembertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(DOCUMENT_TYPES),
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)
model = model.to(device)

print(f"✅ Model initialized on {device}")

In [None]:
# Setup optimizer and scheduler
LEARNING_RATE = 2e-5
MAX_WARMUP_STEPS = 100

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

total_steps = len(train_loader) * NUM_EPOCHS
# Cap warmup at 10% of the run: with a small dataset a fixed 100 warmup steps
# could cover most (or all) of training, so the peak learning rate would
# barely be used.
warmup_steps = min(MAX_WARMUP_STEPS, total_steps // 10)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

print(f"✅ Optimizer ready (LR: {LEARNING_RATE})")

In [None]:
# Training and validation functions
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimization pass over ``data_loader``.

    Each batch must be a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors. Gradients are clipped to a norm of 1.0 and the
    scheduler is stepped once per batch.

    Returns:
        A ``(mean_loss, accuracy)`` pair: the loss averaged over batches as a
        float, and the number of correct predictions divided by
        ``len(data_loader.dataset)`` as a double tensor.
    """
    model.train()
    running_loss = 0
    num_correct = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        batch_preds = outputs.logits.argmax(dim=1)
        num_correct += (batch_preds == labels).sum()
        running_loss += outputs.loss.item()

    mean_loss = running_loss / len(data_loader)
    accuracy = num_correct.double() / len(data_loader.dataset)
    return mean_loss, accuracy

def eval_epoch(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with gradient tracking disabled.

    Each batch must be a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors.

    Returns:
        A ``(mean_loss, accuracy, all_preds, all_labels)`` tuple: the loss
        averaged over batches as a float, the number of correct predictions
        divided by ``len(data_loader.dataset)`` as a double tensor, and two
        flat lists with the predicted and true class id of every sample in
        iteration order.
    """
    model.eval()
    running_loss = 0
    num_correct = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_preds = outputs.logits.argmax(dim=1)

            num_correct += (batch_preds == labels).sum()
            running_loss += outputs.loss.item()

            # Collect on the CPU so the results can be handed to scikit-learn.
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    accuracy = num_correct.double() / len(data_loader.dataset)
    return mean_loss, accuracy, all_preds, all_labels

# Main training loop
print("🚀 Starting training...\n")
# Kept as a tensor (never a plain int) so later `.item()` calls always work.
best_val_acc = torch.tensor(0.0, dtype=torch.float64)
# CPU snapshot of the weights from the best epoch so far; None until the
# first epoch has been evaluated.
best_state = None
training_history = []

for epoch in range(NUM_EPOCHS):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"{'='*60}")

    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")

    val_loss, val_acc, epoch_preds, epoch_labels = eval_epoch(model, val_loader, device)
    print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if best_state is None or val_acc > best_val_acc:
        best_val_acc = val_acc
        # Snapshot the weights now: later epochs keep updating the model, so
        # without a copy the best-scoring parameters would be lost.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        # Keep the predictions that belong to the best epoch so the report
        # cell describes the same weights that get saved.
        val_preds, val_labels = epoch_preds, epoch_labels
        print(f"🏆 New best: {best_val_acc:.4f}")

    training_history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': train_acc.item(),
        'val_loss': val_loss,
        'val_acc': val_acc.item()
    })

# Restore the best epoch's weights so everything downstream (report, saving,
# predictions) matches the reported best accuracy rather than the last epoch.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\n✅ Training complete! Best accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")

In [None]:
# Passing the full list of class ids keeps the report and the matrix aligned
# with DOCUMENT_TYPES even when a class is missing from the validation labels
# and predictions; without it target_names no longer matches and
# classification_report raises.
all_label_ids = list(range(len(DOCUMENT_TYPES)))

# Classification report
print("\n📊 Classification Report:")
print("="*60)
report = classification_report(
    val_labels,
    val_preds,
    labels=all_label_ids,
    target_names=DOCUMENT_TYPES,
    digits=4,
    zero_division=0,  # report 0 for classes that were never predicted
)
print(report)

# Confusion matrix (rows: true class, columns: predicted class)
print("\n🔢 Confusion Matrix:")
cm = confusion_matrix(val_labels, val_preds, labels=all_label_ids)
print(cm)

In [None]:
# Save model
OUTPUT_DIR = "document_classifier_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# float() accepts both a 0-dim tensor and a plain number, so this works even
# if best_val_acc was never replaced by a tensor during training.
best_val_acc_value = float(best_val_acc)

# Save PyTorch model with wrapped state dict. The tensors are moved to the CPU
# first so the checkpoint can be loaded on a machine without a GPU without
# having to pass map_location.
model_path = os.path.join(OUTPUT_DIR, "model.pth")
cpu_state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in model.state_dict().items()
}
torch.save({
    'model_state_dict': cpu_state_dict,
    'label_map': label2id,
    'best_val_acc': best_val_acc_value,
    'training_history': training_history
}, model_path)

# Save tokenizer
tokenizer.save_pretrained(OUTPUT_DIR)

# Save config. Note that json.dump writes the integer keys of id2label as
# strings, so consumers must convert them back when loading.
config_data = {
    'base_model': MODEL_NAME,
    'num_labels': len(DOCUMENT_TYPES),
    'document_types': DOCUMENT_TYPES,
    'label_map': label2id,
    'id_to_label': id2label,
    'best_val_acc': best_val_acc_value,
    'training_samples': len(train_texts),
    'validation_samples': len(val_texts),
    'num_epochs': NUM_EPOCHS,
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    # Read from the dataset so the recorded value cannot drift from the one
    # actually used for tokenization.
    'max_length': train_dataset.max_length
}

with open(os.path.join(OUTPUT_DIR, "config.json"), 'w', encoding='utf-8') as f:
    json.dump(config_data, f, indent=2, ensure_ascii=False)

# Create zip file; archive names are relative to OUTPUT_DIR's parent so the
# archive unpacks into a single document_classifier_model/ folder.
zip_filename = "document_classifier_model.zip"
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, files in os.walk(OUTPUT_DIR):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, os.path.dirname(OUTPUT_DIR))
            zipf.write(file_path, arcname)

print(f"✅ Model saved! Download: {zip_filename}")
print(f"Size: {os.path.getsize(zip_filename) / (1024*1024):.2f} MB")

In [None]:
# Test with examples
def predict(text):
    """Classify one document with the notebook-level model and tokenizer.

    Args:
        text: Document to classify; it is tokenized with ``max_length=512``,
            padding to that length and truncation enabled.

    Returns:
        A ``(label, confidence)`` pair: the ``id2label`` entry of the
        highest-probability class and that class's softmax probability as a
        float.
    """
    model.eval()
    encoded = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    model_inputs = {
        name: encoded[name].to(device)
        for name in ('input_ids', 'attention_mask')
    }

    with torch.no_grad():
        logits = model(**model_inputs).logits
        probs = torch.softmax(logits, dim=1)
        pred_id = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_id].item()

    return id2label[pred_id], confidence

# Test examples (French; the accented characters are part of the model input,
# so they must be stored as real UTF-8 text rather than mis-decoded bytes).
test_texts = [
    "Résultats analyse sanguine: Hémoglobine 14.5 g/dL, Leucocytes 7200/mm³",
    "Radiographie thoracique: Poumons clairs sans opacité",
    "IRM cérébrale: Pas de processus expansif intracrânien",
    "ORDONNANCE: AMOXICILLINE 1g, 3 fois par jour"
]

print("🧪 Testing predictions:\n")
for text in test_texts:
    label, conf = predict(text)
    print(f"Text: {text[:60]}...")
    print(f"Predicted: {label} (confidence: {conf:.4f})\n")