## 🔍 Step 1: Check GPU Availability

In [None]:
import torch

# Select the compute device once; later cells move the model and batches to it.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Report which GPU PyTorch sees and how much memory it exposes.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ GPU Available: {gpu_name}")
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   VRAM: {vram_gb:.2f} GB")
else:
    # No CUDA device is visible: warn and point at the runtime setting.
    print("‚ö†Ô∏è GPU NOT available. Training will be VERY slow (2-3 hours).")
    print("   Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU")

print(f"\nUsing device: {device}")

## 📦 Step 2: Install Required Libraries

In [None]:
# Notebook shell command ("!" prefix): install the packages this notebook needs.
# The -q flag keeps pip's output quiet.
!pip install transformers datasets scikit-learn pandas numpy torch -q

## 📚 Step 3: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    CamembertTokenizer, 
    CamembertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import json
import zipfile
import os

print("‚úÖ All libraries imported successfully!")

## 📂 Step 4: Upload and Load Training Data

**IMPORTANT**: Upload your `training_data.csv` file using the Files tab (📁 on the left)

In [None]:
# Stop early, with upload instructions, when the training CSV is missing.
# (The instructions were previously a garbled, unterminated triple-quoted
# string that made this whole cell a SyntaxError.)
if not os.path.exists('training_data.csv'):
    print("‚ö†Ô∏è ERROR: training_data.csv not found!")
    print("\nüì§ Please upload your training_data.csv file:")
    print("    1. Click on Files tab (üìÅ on the left)")
    print("    2. Click Upload button")
    print("    3. Select your training_data.csv")
    print("    4. Wait for upload to complete")
    print("    5. Re-run this cell")
    raise FileNotFoundError("training_data.csv not found")

# Load the CSV; the code below reads its 'text' and 'label' columns.
df = pd.read_csv('training_data.csv')

print(f"‚úÖ Data loaded successfully!")
print(f"\nüìä Dataset Statistics:")
print(f"   Total samples: {len(df)}")
print(f"   Columns: {df.columns.tolist()}")
print(f"\nüìà Class Distribution:")
print(df['label'].value_counts())
print(f"\nüìù Sample data:")
print(df.head(3))

# Validate data: every label must be one of the known document types.
valid_labels = ['blood_test', 'xray', 'mri', 'prescription',
                'medical_report', 'lab_result', 'consultation_note']
invalid = df[~df['label'].isin(valid_labels)]
if len(invalid) > 0:
    # Only a warning: the offending rows are shown but not removed here.
    print(f"\n‚ö†Ô∏è WARNING: {len(invalid)} invalid labels found!")
    print(invalid[['text', 'label']].head())
else:
    print("\n‚úÖ All labels are valid!")

# Check for minimum samples per class (thresholds: 20 and 50).
counts = df['label'].value_counts()
min_samples = counts.min()
if min_samples < 20:
    print(f"\n‚ö†Ô∏è WARNING: Some classes have very few samples (min: {min_samples})")
    print("   Recommendation: Add more samples for better accuracy")
elif min_samples < 50:
    print(f"\n‚ö†Ô∏è Note: Min samples per class: {min_samples}")
    print("   Recommendation: 50-100 samples per class for best results")
else:
    print(f"\n‚úÖ Good! Min samples per class: {min_samples}")

## 🏗️ Step 5: Prepare Data and Labels

In [None]:
# Define document types and label mapping.
# The position of each entry in this list is the numeric class ID.
DOCUMENT_TYPES = [
    'blood_test',        # Blood test
    'xray',              # X-ray
    'mri',               # MRI
    'prescription',      # Prescription
    'medical_report',    # Medical report
    'lab_result',        # Laboratory result
    'consultation_note'  # Consultation note
]

# Create label <-> ID mappings
label2id = {label: idx for idx, label in enumerate(DOCUMENT_TYPES)}
id2label = {idx: label for idx, label in enumerate(DOCUMENT_TYPES)}

# Convert labels to numeric IDs. Labels that are not in DOCUMENT_TYPES map to
# NaN, which would turn the column into floats and yield invalid class targets.
df['label_id'] = df['label'].map(label2id)

# Drop rows that cannot be used for training (unknown label or missing text)
# instead of letting NaN values reach the split and the model.
unusable = df['label_id'].isna() | df['text'].isna()
if unusable.any():
    print(f"WARNING: dropping {int(unusable.sum())} row(s) with an unknown label or missing text")
    df = df.loc[~unusable].reset_index(drop=True)
if df.empty:
    raise ValueError("No usable training rows: every row has an unknown label or missing text")
df['label_id'] = df['label_id'].astype(int)

# Extract texts and labels as plain Python lists (texts forced to str so the
# preview slice below works even if pandas parsed a cell as a number).
texts = df['text'].astype(str).tolist()
labels = df['label_id'].tolist()

print(f"‚úÖ Data prepared!")
print(f"   Number of classes: {len(DOCUMENT_TYPES)}")
print(f"   Label mapping: {label2id}")
print(f"\nüìù Example:")
print(f"   Text: {texts[0][:100]}...")
print(f"   Label: {df['label'].iloc[0]} ‚Üí ID: {labels[0]}")

## ✂️ Step 6: Split Data (Train/Validation)

In [None]:
# Hold out 20% of the samples for validation. Stratifying on the labels keeps
# the class proportions the same in both parts; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
train_texts, val_texts, train_labels, val_labels = split_parts

print(f"‚úÖ Data split completed!")
print(f"   Training samples: {len(train_texts)}")
print(f"   Validation samples: {len(val_texts)}")

# Per-class counts of each part, ordered by class ID.
train_df = pd.DataFrame({'label_id': train_labels})
val_df = pd.DataFrame({'label_id': val_labels})
for heading, frame in (
    (f"\nüìä Training set distribution:", train_df),
    (f"\nüìä Validation set distribution:", val_df),
):
    print(heading)
    print(frame['label_id'].value_counts().sort_index())

## 🔤 Step 7: Load Tokenizer and Create Dataset

In [None]:
# Load the CamemBERT tokenizer (a French BERT-style model).
# MODEL_NAME is reused later in this notebook to load the classification model
# and is recorded in the saved config.
MODEL_NAME = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)

print(f"‚úÖ Tokenizer loaded: {MODEL_NAME}")

# Define custom dataset class
class DocumentDataset(Dataset):
    """Dataset that tokenizes one document each time an item is requested.

    Args:
        texts: Indexable collection of documents; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of class IDs aligned with ``texts``.
        tokenizer: Callable invoked with the text and Hugging Face style
            keyword arguments; its result must be indexable by
            ``'input_ids'`` and ``'attention_mask'`` and hold tensors.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for document ``idx``.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        document = str(self.texts[idx])
        class_id = self.labels[idx]

        encoded = self.tokenizer(
            document,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so the DataLoader can stack
        # samples into a batch.
        item = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(class_id, dtype=torch.long)
        return item

# Wrap both splits in DocumentDataset, sharing the same tokenizer and the
# default max_length.
train_dataset, val_dataset = (
    DocumentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

print(f"\n‚úÖ Datasets created!")
print(f"   Train dataset size: {len(train_dataset)}")
print(f"   Val dataset size: {len(val_dataset)}")

# Smoke-test tokenization on the first training sample.
sample = train_dataset[0]
print(f"\nüìù Sample tokenized data:")
print(f"   Input IDs shape: {sample['input_ids'].shape}")
print(f"   Attention mask shape: {sample['attention_mask'].shape}")
sample_label_id = sample['labels'].item()
print(f"   Label: {sample_label_id} ‚Üí {id2label[sample_label_id]}")

## 🎯 Step 8: Create DataLoaders

In [None]:
# Training configuration
BATCH_SIZE = 8  # Reduce to 4 if out of memory
NUM_EPOCHS = 10  # More epochs = better accuracy (but longer training)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

for summary_line in (
    f"‚úÖ DataLoaders created!",
    f"   Batch size: {BATCH_SIZE}",
    f"   Training batches: {len(train_loader)}",
    f"   Validation batches: {len(val_loader)}",
    f"   Total epochs: {NUM_EPOCHS}",
):
    print(summary_line)

## 🤖 Step 9: Initialize Model

In [None]:
# Load pre-trained CamemBERT with a classification head sized to the number of
# document types; both dropout rates are raised to 0.3 for regularization.
model_kwargs = {
    'num_labels': len(DOCUMENT_TYPES),
    'hidden_dropout_prob': 0.3,
    'attention_probs_dropout_prob': 0.3,
}
model = CamembertForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)

# Move model to the selected device (GPU if available)
model = model.to(device)

# Count parameters in a single pass over the model
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

print(f"‚úÖ Model initialized!")
print(f"   Base model: {MODEL_NAME}")
print(f"   Number of classes: {len(DOCUMENT_TYPES)}")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
print(f"   Device: {device}")

## ⚙️ Step 10: Setup Optimizer and Scheduler

In [None]:
# Training hyperparameters
LEARNING_RATE = 2e-5
WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

# Optimizer: use PyTorch's own AdamW. The AdamW class exported by
# `transformers` is deprecated in favour of torch.optim.AdamW.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Learning rate scheduler: linear warmup for WARMUP_STEPS optimizer steps,
# then linear decay over the remaining steps. One scheduler step is taken per
# training batch, hence batches-per-epoch * epochs.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
)

print(f"‚úÖ Optimizer and scheduler configured!")
print(f"   Learning rate: {LEARNING_RATE}")
print(f"   Warmup steps: {WARMUP_STEPS}")
print(f"   Total training steps: {total_steps}")
print(f"   Weight decay: {WEIGHT_DECAY} (L2 regularization)")

## 🚀 Step 11: Training Loop

**This will take 10-15 minutes with GPU** (or 2-3 hours without GPU)

In [None]:
# Training function
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(average loss per batch, accuracy)`` as Python floats.
        Returns ``(0.0, 0.0)`` when the loader yields no batches instead of
        failing on a division by zero.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    seen_samples = 0
    num_batches = 0

    progress_bar = tqdm(data_loader, desc="Training")

    for batch in progress_bar:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass (passing labels makes the model compute the loss)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass; gradients are clipped to a norm of 1.0 before the step
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Accumulate plain Python numbers so the running totals do not depend
        # on a tensor having been created (the empty-loader case).
        batch_loss = loss.item()
        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        seen_samples += labels.size(0)
        total_loss += batch_loss
        num_batches += 1

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{batch_loss:.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })

    if num_batches == 0 or seen_samples == 0:
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    # Divide by the samples actually processed so the figure stays correct
    # even if the loader does not cover the whole dataset.
    accuracy = correct_predictions / seen_samples

    return avg_loss, accuracy

# Validation function
def eval_epoch(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(average loss per batch, accuracy, predictions, labels)``.
        The last two are flat lists with one entry per evaluated sample.
        Loss and accuracy are ``0.0`` when the loader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    seen_samples = 0
    num_batches = 0
    all_preds = []
    all_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            seen_samples += labels.size(0)
            total_loss += outputs.loss.item()
            num_batches += 1

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if num_batches == 0 or seen_samples == 0:
        return 0.0, 0.0, all_preds, all_labels

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / seen_samples

    return avg_loss, accuracy, all_preds, all_labels

# Main training loop
print("üöÄ Starting training...\n")
best_val_acc = 0
training_history = []

for epoch in range(NUM_EPOCHS):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"{'='*60}")
    
    # Train
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"\nüìä Training - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    
    # Validate
    val_loss, val_acc, val_preds, val_labels = eval_epoch(model, val_loader, device)
    print(f"üìä Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        print(f"\nüèÜ New best validation accuracy: {best_val_acc:.4f}")
    
    # Save history
    training_history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc
    })

print(f"\n\n{'='*60}")
print(f"‚úÖ Training Complete!")
print(f"{'='*60}")
print(f"üèÜ Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")

## 📊 Step 12: Evaluation and Metrics

In [None]:
# Get final predictions on the validation set
_, final_acc, final_preds, final_labels = eval_epoch(model, val_loader, device)

# Pin the class IDs explicitly. Without `labels=`, scikit-learn only considers
# the classes present in the data, so a class missing from the validation set
# would make `target_names` mismatch and shrink/misalign the confusion matrix.
class_ids = list(range(len(DOCUMENT_TYPES)))

# Classification report
print("\nüìä Classification Report:")
print("="*60)
report = classification_report(
    final_labels,
    final_preds,
    labels=class_ids,
    target_names=DOCUMENT_TYPES,
    digits=4,
    zero_division=0  # classes with no predictions/samples score 0 without warnings
)
print(report)

# Confusion matrix
print("\nüî¢ Confusion Matrix:")
print("="*60)
cm = confusion_matrix(final_labels, final_preds, labels=class_ids)
print("\nRows = True labels, Columns = Predicted labels")
print(f"\n{' '*20}", end="")
for column_name in DOCUMENT_TYPES:
    # Column headers are cut to 8 characters to fit the 10-wide cells
    print(f"{column_name[:8]:>10}", end="")
print()
for row_idx, row_name in enumerate(DOCUMENT_TYPES):
    print(f"{row_name:>20}", end="")
    for cell in cm[row_idx]:
        print(f"{cell:>10}", end="")
    print()

# Per-class accuracy (classes with no validation samples are skipped)
print("\n\nüìà Per-Class Accuracy:")
print("="*60)
labels_arr = np.array(final_labels)
preds_arr = np.array(final_preds)
for class_id, class_name in enumerate(DOCUMENT_TYPES):
    class_mask = labels_arr == class_id
    if class_mask.sum() > 0:
        class_acc = (preds_arr[class_mask] == class_id).sum() / class_mask.sum()
        print(f"{class_name:>20}: {class_acc:.4f} ({class_acc*100:.2f}%)")

# Training history
print("\n\nüìâ Training History:")
print("="*60)
print(f"{'Epoch':<10} {'Train Loss':<15} {'Train Acc':<15} {'Val Loss':<15} {'Val Acc':<15}")
print("-"*70)
for h in training_history:
    print(f"{h['epoch']:<10} {h['train_loss']:<15.4f} {h['train_acc']:<15.4f} {h['val_loss']:<15.4f} {h['val_acc']:<15.4f}")

## 💾 Step 13: Save Model

In [None]:
# Output folder for everything that gets exported
OUTPUT_DIR = "document_classifier_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Checkpoint (PyTorch format): weights plus the label map and training record
model_path = os.path.join(OUTPUT_DIR, "model.pth")
checkpoint = dict(
    model_state_dict=model.state_dict(),
    label_map=label2id,
    best_val_acc=best_val_acc,
    training_history=training_history,
)
torch.save(checkpoint, model_path)

# Tokenizer files go into the same folder
tokenizer.save_pretrained(OUTPUT_DIR)

# Metadata describing how the model was trained
config_data = dict(
    base_model=MODEL_NAME,
    num_labels=len(DOCUMENT_TYPES),
    document_types=DOCUMENT_TYPES,
    label_map=label2id,
    id_to_label=id2label,
    best_val_acc=best_val_acc,
    training_samples=len(train_texts),
    validation_samples=len(val_texts),
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    max_length=512,
)

config_path = os.path.join(OUTPUT_DIR, "config.json")
with open(config_path, 'w', encoding='utf-8') as config_file:
    config_file.write(json.dumps(config_data, indent=2, ensure_ascii=False))

for status_line in (
    f"‚úÖ Model saved to: {OUTPUT_DIR}/",
    f"   Files created:",
    f"   - model.pth (model weights)",
    f"   - config.json (metadata)",
    f"   - tokenizer files",
):
    print(status_line)

# Bundle the output folder into one zip for easy download. Archive names are
# made relative to the folder's parent so the zip contains the folder itself.
zip_filename = "document_classifier_model.zip"
archive_root = os.path.dirname(OUTPUT_DIR)
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for folder, _subdirs, filenames in os.walk(OUTPUT_DIR):
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            zipf.write(file_path, os.path.relpath(file_path, archive_root))

zip_size_mb = os.path.getsize(zip_filename) / (1024*1024)
print(f"\nüì¶ Zip file created: {zip_filename}")
print(f"   Size: {zip_size_mb:.2f} MB")
print(f"\nüì• Download instructions:")
print(f"   1. Click on Files tab (üìÅ)")
print(f"   2. Find {zip_filename}")
print(f"   3. Right-click ‚Üí Download")
print(f"   4. Extract on your computer")
print(f"   5. Move to: backend/ml_service/saved_models/")

## 🧪 Step 14: Test Model with Examples

In [None]:
def predict_document(text, model, tokenizer, device, label2id, id2label):
    """Classify a single text and report the class probabilities.

    Args:
        text: Document text to classify.
        model: Classifier; called with ``input_ids`` and ``attention_mask``,
            its output must expose ``.logits``.
        tokenizer: Callable that encodes ``text`` into tensors.
        device: Device the encoded tensors are moved to.
        label2id: Not used by this function; kept in the signature for callers.
        id2label: Mapping from predicted class ID to label name.

    Returns:
        tuple: ``(predicted label, confidence, probabilities)`` where the
        probabilities are a NumPy array for the single input.
    """
    model.eval()

    # Encode with the same settings the training dataset uses.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    # Inference only: no gradients required.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        # Row 0 is the only row: exactly one text was encoded.
        probabilities = torch.softmax(logits, dim=1)[0]
        pred_id = int(torch.argmax(probabilities))
        confidence = probabilities[pred_id].item()

    return id2label[pred_id], confidence, probabilities.cpu().numpy()

# Test examples
# One French sample sentence per document type. The accented characters had
# been stored as mis-decoded UTF-8 (e.g. "R√©sultats"), which would feed
# garbage tokens to the model; they are written as proper Unicode here.
test_examples = [
    "Résultats de l'analyse sanguine: Hémoglobine 14.5 g/dL, Leucocytes 7200/mm³, Plaquettes 250000/mm³",
    "Radiographie thoracique de face: Poumons clairs sans opacité parenchymateuse. Cœur de taille normale.",
    "IRM cérébrale avec injection de gadolinium: Pas de processus expansif intracrânien. Examen normal.",
    "ORDONNANCE: AMOXICILLINE 1g, 1 comprimé 3 fois par jour pendant 7 jours. PARACETAMOL 1g si douleur.",
    "Compte-rendu d'hospitalisation: Patient admis le 12/03/2024 pour dyspnée aiguë et douleur thoracique.",
    "Résultats laboratoire: HbA1c 7.2%, Cholestérol total 2.10 g/L, TSH 2.8 mUI/L, Créatininémie 92 µmol/L.",
    "Note de consultation: Patient âgé de 45 ans consultant pour lombalgies chroniques évoluant depuis 6 mois."
]

print("üß™ Testing model with examples:\n")
print("="*80)

for i, text in enumerate(test_examples, 1):
    pred_label, confidence, all_probs = predict_document(
        text, model, tokenizer, device, label2id, id2label
    )

    print(f"\nExample {i}:")
    print(f"Text: {text[:80]}...")
    print(f"Predicted: {pred_label} (confidence: {confidence:.4f})")

    # Show top 3 predictions: argsort is ascending, so take the last three
    # indices and reverse them to list the most likely class first.
    top3_indices = np.argsort(all_probs)[-3:][::-1]
    print(f"Top 3 predictions:")
    for idx in top3_indices:
        print(f"  {id2label[int(idx)]:>20}: {all_probs[idx]:.4f}")
    print("-" * 80)

print("\n‚úÖ Testing complete!")

## 🎉 Summary and Next Steps

### What You've Accomplished:
- ✅ Loaded and validated your training data
- ✅ Split data into train/validation sets
- ✅ Trained a CamemBERT model on French medical documents
- ✅ Achieved validation accuracy (check above)
- ✅ Saved model as downloadable ZIP file

### Next Steps:
1. **Download model**: Files tab → document_classifier_model.zip → Download
2. **Extract on your computer**
3. **Move to project**: `C:\docqa-ms\backend\ml_service\saved_models\`
4. **Restart ML service**: `docker-compose restart ml-service`
5. **Test API**: Use Postman or curl to test classification endpoint

### If Accuracy is Low (<80%):
- **Add more training data** (50-100 samples per class)
- **Increase NUM_EPOCHS** to 15-20
- **Check data quality** (correct labels, clean text)
- **Balance classes** (equal samples per class)
- **Try data augmentation** (back-translation, synonym replacement)

### Commands for Deployment:
```powershell
# PowerShell (Windows)
cd C:\docqa-ms
Expand-Archive -Path "$env:USERPROFILE\Downloads\document_classifier_model.zip" -DestinationPath ".\backend\ml_service\saved_models\"
docker-compose restart ml-service
docker logs ml-service --tail 30
```

**Good luck with your project!** 🚀

## 🔍 Step 1: Check GPU Availability

In [None]:
import torch

# Select the compute device once; later cells move the model and batches to it.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Report which GPU PyTorch sees and how much memory it exposes.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ GPU Available: {gpu_name}")
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   VRAM: {vram_gb:.2f} GB")
else:
    # No CUDA device is visible: warn and point at the runtime setting.
    print("‚ö†Ô∏è GPU NOT available. Training will be VERY slow (2-3 hours).")
    print("   Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU")

print(f"\nUsing device: {device}")

## 📦 Step 2: Install Required Libraries

In [None]:
# Notebook shell command ("!" prefix): install the packages this notebook needs.
# The -q flag keeps pip's output quiet.
!pip install transformers datasets scikit-learn pandas numpy torch -q

## 📚 Step 3: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import json
import zipfile
import os

print("‚úÖ All libraries imported successfully!")

## 📂 Step 4: Upload and Load Training Data

**IMPORTANT**: Upload your `training_data.csv` file using the Files tab (📁 on the left)

In [None]:
# Stop early, with upload instructions, when the training CSV is missing.
if not os.path.exists('training_data.csv'):
    upload_help = (
        "‚ö†Ô∏è ERROR: training_data.csv not found!",
        "\nüì§ Please upload your training_data.csv file:",
        "    1. Click on Files tab (üìÅ on the left)",
        "    2. Click Upload button",
        "    3. Select your training_data.csv",
        "    4. Wait for upload to complete",
        "    5. Re-run this cell",
    )
    for help_line in upload_help:
        print(help_line)
    raise FileNotFoundError("training_data.csv not found")

# Load the CSV; the code below reads its 'text' and 'label' columns.
df = pd.read_csv('training_data.csv')

print(f"‚úÖ Data loaded successfully!")
print(f"\nüìä Dataset Statistics:")
print(f"   Total samples: {len(df)}")
print(f"   Columns: {df.columns.tolist()}")
print(f"\nüìà Class Distribution:")
print(df['label'].value_counts())
print(f"\nüìù Sample data:")
print(df.head(3))

# Every label must be one of the known document types; offenders are only
# reported here, not removed.
valid_labels = [
    'blood_test',
    'xray',
    'mri',
    'prescription',
    'medical_report',
    'lab_result',
    'consultation_note',
]
has_valid_label = df['label'].isin(valid_labels)
invalid = df[~has_valid_label]
if len(invalid) == 0:
    print("\n‚úÖ All labels are valid!")
else:
    print(f"\n‚ö†Ô∏è WARNING: {len(invalid)} invalid labels found!")
    print(invalid[['text', 'label']].head())

# Size of the smallest class decides which advice is printed (cut-offs 20, 50).
counts = df['label'].value_counts()
min_samples = counts.min()
if min_samples < 20:
    print(f"\n‚ö†Ô∏è WARNING: Some classes have very few samples (min: {min_samples})")
    print("   Recommendation: Add more samples for better accuracy")
elif min_samples < 50:
    print(f"\n‚ö†Ô∏è Note: Min samples per class: {min_samples}")
    print("   Recommendation: 50-100 samples per class for best results")
else:
    print(f"\n‚úÖ Good! Min samples per class: {min_samples}")