# 🎓 RinominatorAI - Sistema di Training

## 📋 Come funziona:
1. **Carica PDF già rinominati** (esempi di training)
2. **Il sistema impara** come hai rinominato i file
3. **Applica** quello che ha imparato su nuovi PDF

---

## 🚀 Istruzioni:
1. ▶️ Esegui le celle in ordine
2. 📂 Carica i tuoi PDF quando richiesto
3. ⏳ Attendi il training
4. 🎉 Usa il modello per rinominare nuovi PDF!


---
## 1️⃣ SETUP - Installazione Dipendenze

In [None]:
# Tell the user that the install step is starting and roughly how long it takes.
print("🔧 Installazione dipendenze...")
print("⏱️ Questo richiederà 2-3 minuti...\n")

# IPython shell escape (not plain Python): runs pip in quiet mode (-q) to install
# the packages listed on the command line.
!pip install -q PyMuPDF Pillow easyocr pandas python-dateutil scikit-learn numpy

print("\n✅ Dipendenze installate!")

---
## 2️⃣ INIZIALIZZAZIONE - Import e Setup

In [None]:
import fitz
import easyocr
import re
import pickle
from pathlib import Path
from datetime import datetime
from collections import Counter
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from google.colab import files
import zipfile
import os
import shutil

print("✅ Import completati!")

print("\n🔍 Inizializzazione OCR (EasyOCR)...")
# Module-level EasyOCR reader, read as a global by extract_text_with_ocr.
# It is created once here, for Italian and English, with gpu=True.
reader = easyocr.Reader(['it', 'en'], gpu=True)
print("✅ OCR pronto!")

---
## 3️⃣ FUNZIONI DI UTILITÀ

In [None]:
def pdf_to_image(pdf_path):
    """Render the first page of a PDF to a PNG file and return the PNG path.

    The image is written next to the PDF as ``<pdf name>_page1.png`` at
    200 dpi.

    Args:
        pdf_path: Path of the PDF to render (``str`` or ``os.PathLike``).

    Returns:
        The path of the PNG file that was written, as a string.
    """
    pdf_path = os.fspath(pdf_path)
    # Only the real extension is swapped, whatever its case.  A plain
    # str.replace('.pdf', ...) would also rewrite ".pdf" inside directory
    # names and would leave "X.PDF" untouched, making the image path equal
    # to the PDF path itself.
    img_path = os.path.splitext(pdf_path)[0] + '_page1.png'

    doc = fitz.open(pdf_path)
    try:
        pix = doc[0].get_pixmap(dpi=200)
        pix.save(img_path)
    finally:
        # Close the document even when rendering or saving fails.
        doc.close()
    return img_path

def extract_text_with_ocr(image_path, norm_width=2000, norm_height=3000):
    """Run OCR on an image and return the detected text items with positions.

    Uses the module-level ``reader`` object and its ``readtext`` method.

    Args:
        image_path: Path of the image to analyse.
        norm_width: Divisor applied to the horizontal centre of each box.
        norm_height: Divisor applied to the vertical centre of each box.
            The defaults are fixed constants, not the real image size, so
            ``x``/``y`` are only roughly normalised (presumably tuned for
            the 200-dpi page render produced by ``pdf_to_image``; confirm
            before relying on the values being inside [0, 1]).

    Returns:
        A list of dicts with keys ``text``, ``confidence``, ``x``, ``y``
        (scaled box centre) and ``bbox`` (the box as returned by the reader).

    Raises:
        ValueError: If ``norm_width`` or ``norm_height`` is not positive.
    """
    if norm_width <= 0 or norm_height <= 0:
        raise ValueError("norm_width and norm_height must be positive")

    extracted = []
    for bbox, text, confidence in reader.readtext(image_path):
        # The centre of the box is the mean of its corner coordinates.
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]
        center_x = sum(x_coords) / len(x_coords)
        center_y = sum(y_coords) / len(y_coords)
        extracted.append({
            'text': text,
            'confidence': confidence,
            'x': center_x / norm_width,
            'y': center_y / norm_height,
            'bbox': bbox,
        })
    return extracted

def parse_filename(filename):
    """Split an already-renamed PDF file name into its three fields.

    Three strategies are tried in order:

    1. ``<denominazione> <numero> del <data>`` (case-insensitive).
    2. ``<denominazione> <numero> <data...>`` separated by spaces, ``_`` or
       ``-``, where the last part must start with a date.
    3. Fallback: any date plus any number of at least three digits found
       outside the date; the text before the number is the denominazione.

    Args:
        filename: File name, with or without a ``.pdf`` extension.

    Returns:
        A dict with keys ``denominazione``, ``numero_documento`` and
        ``data_documento``, or ``None`` (after printing a warning) when the
        name cannot be parsed.
    """
    date_pattern = r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}'

    # Strip only a trailing extension, in any letter case.
    name = re.sub(r'\.pdf$', '', filename, flags=re.IGNORECASE)

    pattern1 = r'^(.+?)\s+([A-Z0-9\-/]+)\s+del\s+(.+)$'
    match = re.match(pattern1, name, re.IGNORECASE)
    if match:
        return {
            'denominazione': match.group(1).strip(),
            'numero_documento': match.group(2).strip(),
            'data_documento': match.group(3).strip()
        }

    # The third group must START with a date.  Without that anchor the lazy
    # first group stops at the first upper-case word, so a name such as
    # "Fornitore ABC 12345 15-01-2024" was split into numero "ABC" and
    # data "12345 15-01-2024".
    pattern2 = r'^(.+?)[\s_\-]+([A-Z0-9\-/]+)[\s_\-]+(' + date_pattern + r'.*)$'
    match = re.match(pattern2, name)
    if match:
        return {
            'denominazione': match.group(1).strip(),
            'numero_documento': match.group(2).strip(),
            'data_documento': match.group(3).strip()
        }

    date_match = re.search(date_pattern, name)
    if date_match:
        # Blank out the date (keeping indices aligned) so that its year is
        # never mistaken for the document number and never ends up in the
        # denominazione.
        masked = (
            name[:date_match.start()]
            + ' ' * (date_match.end() - date_match.start())
            + name[date_match.end():]
        )
        number_match = re.search(r'\b[A-Z]{0,3}\d{3,}\b|\b\d{3,}\b', masked)
        if number_match:
            return {
                # Slice at the match position rather than str.find(), which
                # could hit an earlier occurrence of the same digits.
                'denominazione': masked[:number_match.start()].strip(' -_'),
                'numero_documento': number_match.group(0),
                'data_documento': date_match.group(0)
            }

    print(f"⚠️ Impossibile parsificare: {filename}")
    return None

def fuzzy_find_text(search_text, ocr_data, threshold=0.6):
    """Find the OCR item whose text best matches ``search_text``.

    Matching is case-insensitive and ignores surrounding whitespace.
    Containment matches (one text is a substring of the other) always win
    over word-overlap matches.  Among containment matches the one closest in
    length to ``search_text`` is chosen, so an exact match beats a stray
    one-character fragment that merely appears earlier in the page.

    Args:
        search_text: Text to look for.
        ocr_data: Iterable of dicts that each have a ``'text'`` key.
        threshold: Minimum Jaccard word overlap for a non-containment match.
            It is not applied to containment matches.

    Returns:
        The best matching item from ``ocr_data``, or ``None`` when nothing
        matches or ``search_text`` is empty.
    """
    search_text = search_text.lower().strip()
    if not search_text:
        # An empty string is a substring of everything; treat as no match.
        return None
    search_words = set(search_text.split())

    best_contained = None
    best_contained_score = 0.0
    best_overlap = None
    best_overlap_score = 0.0

    for item in ocr_data:
        item_text = item['text'].lower().strip()
        if not item_text:
            # Empty OCR fragments would otherwise "match" any search text.
            continue

        if search_text in item_text or item_text in search_text:
            shorter, longer = sorted((len(search_text), len(item_text)))
            score = shorter / longer
            if score == 1.0:
                return item  # exact match, cannot be improved
            if score > best_contained_score:
                best_contained_score = score
                best_contained = item
            continue

        item_words = set(item_text.split())
        score = len(search_words & item_words) / len(search_words | item_words)
        if score > best_overlap_score and score >= threshold:
            best_overlap_score = score
            best_overlap = item

    return best_contained if best_contained is not None else best_overlap

def clean_filename(text, max_length=150):
    """Remove characters that are invalid in file names and limit the length.

    The characters ``< > : " / \\ | ? *`` are deleted.  When the result is
    longer than ``max_length`` it is shortened, keeping a short trailing
    extension (such as ``.pdf``) intact instead of cutting it off.

    Args:
        text: Proposed file name.
        max_length: Maximum length of the returned name (default 150).

    Returns:
        The cleaned name, stripped of surrounding whitespace.
    """
    text = re.sub(r'[<>:"/\\|?*]', '', text).strip()
    if len(text) <= max_length:
        return text

    root, ext = os.path.splitext(text)
    # Only treat a short suffix as a real extension; a dot early in a long
    # name (for example inside a date) must not be preserved as one.
    if 0 < len(ext) <= 10 and len(ext) < max_length:
        return root[:max_length - len(ext)].rstrip() + ext
    return text[:max_length].strip()

print("✅ Funzioni caricate!")

---
## 4️⃣ CARICA PDF DI TRAINING

⚠️ **IMPORTANTE**: Carica almeno 10-15 PDF **già rinominati correttamente**!

Esempi di nomi corretti:
- `Fornitore ABC 12345 del 15-01-2024.pdf`
- `Azienda XYZ FT-2024-001 del 20-03-2024.pdf`

In [None]:
print("📂 CARICA I TUOI PDF DI TRAINING")
print("(Seleziona più file tenendo premuto Ctrl/Cmd)\n")

# Create the destination folder from Python rather than through a shell escape.
os.makedirs('pdf_training', exist_ok=True)

# files.upload() is indexed below as a mapping of file name -> file content.
uploaded = files.upload()

# Paths of the stored training PDFs, consumed by the training cell.
training_files = []
for filename, content in uploaded.items():
    # basename() keeps the file inside pdf_training even if the uploaded
    # name contains path separators.
    target_path = os.path.join('pdf_training', os.path.basename(filename))
    with open(target_path, 'wb') as f:
        f.write(content)
    training_files.append(target_path)

print(f"\n✅ Caricati {len(training_files)} PDF!")

# Soft warning only: training still runs with fewer files.
if len(training_files) < 5:
    print("\n⚠️ ATTENZIONE: Hai caricato pochi PDF!")
    print("   Per un buon training servono almeno 10-15 PDF.")

---
## 5️⃣ TRAINING - Impara dai PDF

In [None]:
print(f"\n{'=' * 60}")
print("🎓 INIZIO TRAINING")
print(f"{'=' * 60}\n")

# One entry per successfully analysed PDF: parsed file-name fields, the OCR
# items of page 1 and the OCR item associated with each field (when found).
training_data = []
total_files = len(training_files)

for i, pdf_path in enumerate(training_files, 1):
    filename = os.path.basename(pdf_path)
    print(f"📄 [{i}/{total_files}] Analisi: {filename}")
    try:
        # Ground truth comes from the (already correct) file name.
        fields = parse_filename(filename)
        if not fields:
            print("  ❌ Nome non parsificabile, saltato\n")
            continue

        for label, key in (
            ('Denominazione', 'denominazione'),
            ('Numero', 'numero_documento'),
            ('Data', 'data_documento'),
        ):
            print(f"  ✓ {label}: {fields[key]}")

        image_path = pdf_to_image(pdf_path)
        ocr_data = extract_text_with_ocr(image_path)
        print(f"  ✓ OCR: {len(ocr_data)} elementi trovati")

        # Locate each field on the page; every field has its own threshold.
        associations = {}
        for key, min_score in (
            ('denominazione', 0.5),
            ('numero_documento', 0.8),
            ('data_documento', 0.7),
        ):
            found = fuzzy_find_text(fields[key], ocr_data, min_score)
            if found:
                associations[key] = found

        training_data.append({
            'filename': filename,
            'fields': fields,
            'ocr_data': ocr_data,
            'associations': associations
        })
        print(f"  ✓ Associazioni: {len(associations)}/3\n")
    except Exception as e:
        # Best effort: a failing PDF is reported and skipped.
        print(f"  ❌ Errore: {e}\n")

print(f"\n✅ Raccolti {len(training_data)} esempi validi!")

if len(training_data) >= 3:
    print("\n✅ Dati pronti per il training ML!")
else:
    print("\n❌ ERRORE: Troppi pochi esempi validi!")

---
## 6️⃣ MACHINE LEARNING - Training Modelli

In [None]:
print("\n🤖 TRAINING MODELLI MACHINE LEARNING...\n")

print("  ├─ Analisi zone posizionali...")
# For each field: bounding range and mean of the positions where the field
# was found in the training documents.
zone_patterns = {}

for field in ['denominazione', 'numero_documento', 'data_documento']:
    matched_items = [
        ex['associations'][field]
        for ex in training_data
        if field in ex['associations']
    ]
    if not matched_items:
        continue  # field never located on any page: no zone for it

    positions = np.array([[m['x'], m['y']] for m in matched_items])
    xs = positions[:, 0]
    ys = positions[:, 1]
    zone_patterns[field] = {
        'x_range': (float(xs.min()), float(xs.max())),
        'y_range': (float(ys.min()), float(ys.max())),
        'x_mean': float(xs.mean()),
        'y_mean': float(ys.mean())
    }

print(f"  ✓ Zone identificate: {len(zone_patterns)}")

print("\n  ├─ Training classificatori di testo...")
# Filled by the classifier training loop, keyed by field name.
text_classifiers = {}
vectorizers = {}
accuracies = {}

def is_fuzzy_match(text1, text2, threshold=0.6):
    """Return True when two texts are equal, nested, or share enough words.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Two texts match when they are equal, when one contains the other, or
    when the Jaccard overlap of their whitespace-separated words is at least
    ``threshold``.

    Args:
        text1: First text.
        text2: Second text.
        threshold: Minimum word overlap (intersection / union).

    Returns:
        ``True`` on a match, ``False`` otherwise.  An empty text never
        matches.
    """
    text1 = text1.lower().strip()
    text2 = text2.lower().strip()
    # '' is a substring of every string: without this guard empty OCR
    # fragments would be labelled as positive examples.
    if not text1 or not text2:
        return False
    if text1 == text2 or text1 in text2 or text2 in text1:
        return True
    words1 = set(text1.split())
    words2 = set(text2.split())
    return (len(words1 & words2) / len(words1 | words2)) >= threshold

# Train one binary text classifier per field: every OCR item of every
# training document is a sample, labelled 1 when it matches the value taken
# from the file name and 0 otherwise.
for field in ['denominazione', 'numero_documento', 'data_documento']:
    texts = []
    labels = []
    for ex in training_data:
        ground_truth = ex['fields'][field]
        for item in ex['ocr_data']:
            texts.append(item['text'])
            labels.append(1 if is_fuzzy_match(item['text'], ground_truth) else 0)

    if len(set(labels)) < 2:
        # A classifier needs both positive and negative samples.
        print(f"    ⚠️ {field}: esempi insufficienti, classificatore non addestrato")
        continue

    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    X = vectorizer.fit_transform(texts)
    clf = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=10)

    # The fold count is limited by the rarest class, not by the number of
    # documents: cv=1 (a single training document) makes cross_val_score
    # raise, and more folds than minority samples leaves folds without them.
    n_splits = min(3, min(Counter(labels).values()))
    if n_splits >= 2:
        acc = cross_val_score(clf, X, labels, cv=n_splits).mean()
        clf.fit(X, labels)
    else:
        clf.fit(X, labels)
        # Not enough minority samples to cross-validate: fall back to the
        # (optimistic) accuracy on the training data itself.
        acc = clf.score(X, labels)
        print(f"    ⚠️ {field}: cross-validation non possibile, accuratezza sul training")

    text_classifiers[field] = clf
    vectorizers[field] = vectorizer
    accuracies[field] = acc
    print(f"    ✓ {field}: {acc:.1%} accuracy")

# NOTE(review): samples are presumably dominated by negatives (most OCR items
# are not the searched field), so plain accuracy may look high; confirm
# whether a metric such as recall would be more informative.
avg_accuracy = sum(accuracies.values()) / len(accuracies) if accuracies else 0
print(f"\n  ✓ Accuratezza media: {avg_accuracy:.1%}")

print("\n  ├─ Salvataggio modello...")
model_data = {
    'zone_patterns': zone_patterns,
    'text_classifiers': text_classifiers,
    'vectorizers': vectorizers,
    'stats': {
        'n_samples': len(training_data),
        'accuracy': avg_accuracy,
        'training_date': datetime.now().isoformat()
    }
}

# Everything the prediction cell needs is stored in a single pickle file.
with open('trained_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("  ✓ Modello salvato: trained_model.pkl")
print("\n" + "="*60)
print("✅ TRAINING COMPLETATO!")
print("="*60)
print("📊 STATISTICHE:")
print(f"  - Documenti: {len(training_data)}")
print(f"  - Accuratezza: {avg_accuracy:.1%}")
print(f"  - Zone: {len(zone_patterns)}")
print(f"  - Modelli: {len(text_classifiers)}")
print("="*60 + "\n")

---
## 7️⃣ CARICA NUOVI PDF DA RINOMINARE

In [None]:
print("📂 CARICA I PDF DA RINOMINARE\n")

# Create the destination folder from Python rather than through a shell escape.
os.makedirs('pdf_input', exist_ok=True)

# files.upload() is indexed below as a mapping of file name -> file content.
uploaded_new = files.upload()

# Paths of the stored PDFs, consumed by the prediction cell.
input_files = []
for filename, content in uploaded_new.items():
    # basename() keeps the file inside pdf_input even if the uploaded name
    # contains path separators.
    target_path = os.path.join('pdf_input', os.path.basename(filename))
    with open(target_path, 'wb') as f:
        f.write(content)
    input_files.append(target_path)

print(f"\n✅ Caricati {len(input_files)} PDF da processare!")

---
## 8️⃣ PREDIZIONE - Rinomina Automaticamente!

In [None]:
print("\n" + "="*60)
print("🤖 MODALITÀ PREDIZIONE")
print("="*60 + "\n")

# SECURITY NOTE: pickle.load can execute arbitrary code. Only load a
# trained_model.pkl written by the training cell of this notebook, never a
# file received from an untrusted source.
with open('trained_model.pkl', 'rb') as f:
    model = pickle.load(f)

zone_patterns = model['zone_patterns']
text_classifiers = model['text_classifiers']
vectorizers = model['vectorizers']
stats = model['stats']

print(f"✓ Modello caricato (acc: {stats['accuracy']:.1%})\n")

os.makedirs('pdf_output', exist_ok=True)

# Names already written during this run (compared case-insensitively), so two
# PDFs with the same predicted name do not overwrite each other.
used_names = set()

for i, pdf_path in enumerate(input_files, 1):
    filename = os.path.basename(pdf_path)
    print(f"📄 [{i}/{len(input_files)}] {filename}")
    try:
        print("  🔍 OCR...")
        image_path = pdf_to_image(pdf_path)
        ocr_data = extract_text_with_ocr(image_path)
        print("  🤖 Predizione...")
        predicted_fields = {}
        confidences = {}
        for field in ['denominazione', 'numero_documento', 'data_documento']:
            if field not in text_classifiers:
                predicted_fields[field] = "N/A"
                confidences[field] = 0.0
                continue
            clf = text_classifiers[field]
            vec = vectorizers[field]
            zone = zone_patterns.get(field)

            # Restrict the candidates to the zone learnt in training (plus a
            # margin) when one exists for this field.
            if zone:
                margin = 0.2
                x_min, x_max = zone['x_range']
                y_min, y_max = zone['y_range']
                candidates = [
                    item for item in ocr_data
                    if x_min - margin <= item['x'] <= x_max + margin
                    and y_min - margin <= item['y'] <= y_max + margin
                ]
            else:
                candidates = list(ocr_data)
            if not candidates:
                # Nothing inside the zone: fall back to the whole page.
                candidates = ocr_data
            if not candidates:
                # The OCR found no text at all.  Without this guard argmax
                # on an empty array raises and the PDF is never copied.
                predicted_fields[field] = "N/A"
                confidences[field] = 0.0
                continue

            texts = [c['text'] for c in candidates]
            X = vec.transform(texts)
            # Column 1 is the probability of the positive class (label 1).
            probs = clf.predict_proba(X)[:, 1]
            best_idx = int(np.argmax(probs))
            best_prob = float(probs[best_idx])
            confidences[field] = best_prob
            if best_prob >= 0.6:
                predicted_fields[field] = candidates[best_idx]['text']
            else:
                predicted_fields[field] = "N/A"

        print(f"    ✓ {predicted_fields['denominazione']} ({confidences['denominazione']:.0%})")
        print(f"    ✓ {predicted_fields['numero_documento']} ({confidences['numero_documento']:.0%})")
        print(f"    ✓ {predicted_fields['data_documento']} ({confidences['data_documento']:.0%})")

        # Clean and truncate the stem only, then append the extension, so
        # truncation of a long name can never remove ".pdf".
        stem = clean_filename(
            f"{predicted_fields['denominazione']} {predicted_fields['numero_documento']} del {predicted_fields['data_documento']}"
        )
        new_name = f"{stem}.pdf"
        duplicate_index = 2
        while new_name.lower() in used_names:
            new_name = f"{stem} ({duplicate_index}).pdf"
            duplicate_index += 1
        used_names.add(new_name.lower())

        new_path = os.path.join('pdf_output', new_name)
        shutil.copy2(pdf_path, new_path)
        print(f"  ✅ {new_name}\n")
    except Exception as e:
        # Best effort: report the failing PDF and continue with the next one.
        print(f"  ❌ Errore: {e}\n")

print("\n✅ COMPLETATO!\n")

---
## 9️⃣ SCARICA RISULTATI

In [None]:
print("📦 Creazione ZIP...\n")

# IPython shell escape (not plain Python): recursively adds the pdf_output/
# folder to the archive pdf_rinominati.zip.
!zip -r pdf_rinominati.zip pdf_output/

print("\n✅ Download...\n")
# `files` is the google.colab module imported at the top of the notebook; it
# is asked to download the archive created above.
files.download('pdf_rinominati.zip')

print("\n🎉 FATTO!")