In [1]:
import os
import shutil
from pathlib import Path
from difflib import SequenceMatcher
import re

# Configuration: the list of wanted names, the tree to search, and the output folder
TMP_FILE = 'tmp2.txt'
SOURCE_DIR = 'assets/Norme'
DEST_DIR = 'assets/Utili'

# Make sure the destination folder (and any missing parents) exists up front
Path(DEST_DIR).mkdir(parents=True, exist_ok=True)

# Echo the configuration so the cell output records what this run used
print(f"Source directory: {SOURCE_DIR}")
print(f"Destination directory: {DEST_DIR}")
print(f"Target file list: {TMP_FILE}")

Source directory: assets/Norme
Destination directory: assets/Utili
Target file list: tmp2.txt


In [2]:
# Read target file names from TMP_FILE, one per line.
# Iterating the file object yields the lines directly; blank lines are dropped.
with open(TMP_FILE, 'r', encoding='utf-8') as f:
    target_names = [line.strip() for line in f if line.strip()]

# Remove duplicates while keeping first-seen order.
# list(set(...)) would also de-duplicate, but string hashing is randomized per
# interpreter process, so the resulting order (and with it the sample below and
# every later per-target listing) would change from one kernel restart to the next.
target_names = list(dict.fromkeys(target_names))
print(f"Found {len(target_names)} unique target file names/patterns")
print("\nSample target names:")
for name in target_names[:10]:
    print(f"  - {name}")

Found 30 unique target file names/patterns

Sample target names:
  - D.Lgs. 116/2020"
  - Norme\Giorgia\ADR 2025 (1).pdf
  - Norme\Giorgia\D.M_protocollo 194 del 07-08-2023.pdf
  - UNI EN ISO 14001:2025
  - Reg. 1013/06"
  - -
  - DLgs 03.04.2006 n.152 - 10.09.2021
  - "Richiesta Regione Lombardia ripresa
  - dgr-4107_2020-AIA-Applicativo-regionale
  - Circolare ANGA PROT.0000009.01-08-2019"


In [3]:
# Walk the source tree and collect the path of every non-hidden file
all_files = [
    os.path.join(root, file)
    for root, _, files in os.walk(SOURCE_DIR)
    for file in files
    if not file.startswith('.')  # names starting with a dot are treated as hidden
]

print(f"Found {len(all_files)} files in {SOURCE_DIR}")
print("\nSample files:")
for file in all_files[:5]:
    print(f"  - {file}")

Found 407 files in assets/Norme

Sample files:
  - assets/Norme/Sara/Riassunto p3.docx
  - assets/Norme/Sara/Sentenza-del-10042024-n.-9655-Corte-di-Cassazione-Sezione-5.pdf
  - assets/Norme/Sara/allegato-5-dd-n-254_2024.pdf
  - assets/Norme/Sara/Dichiarazione PRTR 2025 (dati 2024) — Italiano.pdf
  - assets/Norme/Sara/iso-140012015---environmental-management-systems-ita.pdf


In [4]:
def normalize_string(s):
    """Normalize a target name or file path for fuzzy comparison.

    Steps, in order:
      1. trim surrounding whitespace,
      2. drop a trailing .pdf/.doc/.docx/.xls/.xlsx/.txt extension (any case),
      3. turn backslashes into forward slashes,
      4. strip a leading "Norme/" or "assets/Norme/" prefix (any case),
      5. lowercase,
      6. delete every character that is not a word character, whitespace,
         or one of ``- _ . / ( )``,
      7. collapse whitespace runs to single spaces and trim the ends.

    Args:
        s: Raw target name or file path.

    Returns:
        The normalized string; may be empty if nothing survives the cleanup.
    """
    # Trim first: the extension pattern is anchored with `$`, so trailing
    # blanks would otherwise keep the extension from being recognized.
    s = s.strip()
    # Remove file extensions
    s = re.sub(r'\.(pdf|docx?|xlsx?|txt)$', '', s, flags=re.IGNORECASE)
    # Replace backslashes with forward slashes
    s = s.replace('\\', '/')
    # Remove common prefixes like "Norme/"
    s = re.sub(r'^(assets/)?Norme/', '', s, flags=re.IGNORECASE)
    # Convert to lowercase
    s = s.lower()
    # Remove special characters except alphanumeric, spaces, and common separators
    s = re.sub(r'[^\w\s\-_./()]', '', s)
    # Collapse whitespace LAST: deleting a character such as ':' or '"' can
    # leave two adjacent spaces (or a leading/trailing one) behind, which
    # would otherwise leak into the similarity and substring comparisons.
    s = ' '.join(s.split())
    return s

def similarity_score(s1, s2):
    """Return the difflib similarity ratio of ``s1`` and ``s2`` (a float in [0, 1])."""
    matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.ratio()

def find_matches(target_name, all_files, threshold=0.4,
                 substring_boost=0.7, min_substring_len=5):
    """Find the files whose name or relative path resembles ``target_name``.

    Each candidate is scored with ``similarity_score`` against both its
    normalized base name and its normalized relative path; the higher of the
    two is kept. If the normalized target occurs verbatim inside either one
    and is longer than ``min_substring_len`` characters, the score is raised
    to at least ``substring_boost``.

    Args:
        target_name: Raw name or pattern to look for.
        all_files: Iterable of candidate file paths.
        threshold: Minimum score for a file to be reported. A lower value
            gives more recall (more matches, possibly false positives).
        substring_boost: Floor applied to the score on a substring hit.
        min_substring_len: A substring hit only counts when the normalized
            target is strictly longer than this many characters.

    Returns:
        A list of dicts with keys ``'file'``, ``'score'`` and ``'target'``,
        sorted by score, highest first. Empty when the target normalizes to
        an empty string.
    """
    normalized_target = normalize_string(target_name)
    # A target with no usable characters cannot identify a file; bail out
    # instead of scoring an empty pattern against every candidate.
    if not normalized_target:
        return []

    # Hoisted: the length test does not depend on the candidate file
    allow_substring = len(normalized_target) > min_substring_len
    matches = []

    for file_path in all_files:
        # Extract just the filename
        filename = os.path.basename(file_path)
        # Also get the relative path from Norme
        rel_path = file_path.replace('assets/Norme/', '')

        # Normalize both filename and relative path for comparison
        norm_filename = normalize_string(filename)
        norm_relpath = normalize_string(rel_path)

        # Use the best of the two fuzzy scores
        best_score = max(
            similarity_score(normalized_target, norm_filename),
            similarity_score(normalized_target, norm_relpath),
        )

        # Boost score if the target is contained verbatim in either form
        if allow_substring and (
            normalized_target in norm_filename or normalized_target in norm_relpath
        ):
            best_score = max(best_score, substring_boost)

        if best_score >= threshold:
            matches.append({
                'file': file_path,
                'score': best_score,
                'target': target_name
            })

    # Sort by score (highest first)
    matches.sort(key=lambda x: x['score'], reverse=True)
    return matches

# Confirmation line so that running this definitions-only cell produces visible output
print("Matching functions defined successfully")

Matching functions defined successfully


In [5]:
# Run the matcher for every target and accumulate the results
all_matches = []
files_to_copy = set()  # a set, so a file hit by several targets is kept once

print("Finding matches for each target name...\n")

for target_name in target_names:
    # Empty entries and lone dashes are placeholders, not real targets
    if target_name in ('', '-'):
        continue

    matches = find_matches(target_name, all_files, threshold=0.4)

    # Nothing reached the threshold: report it and move on to the next target
    if not matches:
        print(f"Target: '{target_name}' - NO MATCHES FOUND")
        print()
        continue

    # Record every hit, both for the summary and for the copy step
    all_matches.extend(matches)
    files_to_copy.update(hit['file'] for hit in matches)

    # Preview only the three best-scoring hits for this target
    print(f"Target: '{target_name}'")
    print(f"  Found {len(matches)} match(es):")
    for match in matches[:3]:
        print(f"    - {os.path.basename(match['file'])} (score: {match['score']:.2f})")
    if len(matches) > 3:
        print(f"    ... and {len(matches) - 3} more")
    print()

print(f"\n{'='*60}")
print(f"Total unique files to copy: {len(files_to_copy)}")
print(f"Total match instances: {len(all_matches)}")
print(f"{'='*60}")

Finding matches for each target name...

Target: 'D.Lgs. 116/2020"'
  Found 18 match(es):
    - D.Lgs 116_2020.pdf (score: 0.90)
    - D.Lgs 116_2020 (1).pdf (score: 0.79)
    - D.lgs.151_20.pdf (score: 0.74)
    ... and 15 more

Target: 'Norme\Giorgia\ADR 2025 (1).pdf'
  Found 112 match(es):
    - ADR 2025 (1).pdf (score: 1.00)
    - ADR 2021.pdf (score: 0.89)
    - ADR 2023 PROTETTO.pdf (score: 0.71)
    ... and 109 more

Target: 'Norme\Giorgia\D.M_protocollo 194 del 07-08-2023.pdf'
  Found 77 match(es):
    - D.M_protocollo 194 del 07-08-2023.pdf (score: 1.00)
    - Circolare_protocollo_13921 del 14-05-2024.pdf (score: 0.71)
    - 124-Del3_26.07.2023.pdf (score: 0.56)
    ... and 74 more

Target: 'UNI EN ISO 14001:2025'
  Found 9 match(es):
    - ISO 14001_2015.pdf (score: 0.71)
    - 1_ISO14001.pdf (score: 0.53)
    - Requisiti e Struttura della ISO 14001.pdf (score: 0.49)
    ... and 6 more

Target: 'Reg. 1013/06"'
  Found 6 match(es):
    - Reg. 1013_2006 aggiornato.pdf (score: 0

In [6]:
# Mirror every selected file into DEST_DIR, keeping its sub-folder layout
print("Copying files to", DEST_DIR)
print(f"{'='*60}\n")

copied_count = error_count = 0

for file_path in sorted(files_to_copy):
    try:
        # Path below SOURCE_DIR, re-rooted under DEST_DIR
        rel_path = os.path.relpath(file_path, SOURCE_DIR)
        dest_path = os.path.join(DEST_DIR, rel_path)

        # Ensure the parent folder of the destination exists
        Path(dest_path).parent.mkdir(parents=True, exist_ok=True)

        # Copy the file
        shutil.copy2(file_path, dest_path)
        copied_count += 1
        print(f"✓ Copied: {rel_path}")

    except Exception as e:
        # Report the failure and keep going with the remaining files
        error_count += 1
        print(f"✗ Error copying {file_path}: {e}")

print(f"\n{'='*60}")
print(f"Successfully copied: {copied_count} files")
if error_count:
    print(f"Errors: {error_count}")
print(f"{'='*60}")

Copying files to assets/Utili

✓ Copied: Debora/04 Decreto di autorizzazione prot0002856-2025.pdf
✓ Copied: Debora/ALL 9 - Registrazione Controlli - PORTATILE uscita.pdf
✓ Copied: Debora/ALLEGATO_VIII_alla_parte_seconda.pdf
✓ Copied: Debora/ARPA Lombardia Indicazioni modelli dispersione contam.pdf
✓ Copied: Debora/AT_AIA_SOLID_CASTEGNATO.pdf
✓ Copied: Debora/Atto Amministrativo.pdf
✓ Copied: Debora/Copia con segnatura Prot.N.0002859-2025.pdf
✓ Copied: Debora/D.Lgs 116_2020.pdf
✓ Copied: Debora/D.lgs.151_20.pdf
✓ Copied: Debora/DGR 196 del 22_06_2005.pdf
✓ Copied: Debora/DLgs 03.04.2006 n.152 - 10.09.2021.pdf
✓ Copied: Debora/DLgs-155-del-13-08-2010.pdf
✓ Copied: Debora/Dgr X-4792-2016_Salute pubblica.pdf
✓ Copied: Debora/ISPRA MLG_109_2014.pdf
✓ Copied: Debora/ITALIA_DLGS_n199__08_11_2021.pdf
✓ Copied: Debora/Indicazioni per relazione tecnica.pdf
✓ Copied: Debora/Ordinanza_520_01042020.pdf
✓ Copied: Debora/PRESCRIZIONI GENERALI  _Emissioni in atmosfera.pdf
✓ Copied: Debora/Prescrizioni

In [7]:
# Summary: Show detailed matching results
print("\nDETAILED MATCHING SUMMARY")
print(f"{'='*80}\n")

# Group matches by target
from collections import defaultdict
matches_by_target = defaultdict(list)

for match in all_matches:
    matches_by_target[match['target']].append(match)

# Sort targets alphabetically
for target in sorted(matches_by_target):
    matches = matches_by_target[target]
    print(f"Target: '{target}'")
    print(f"  Matched {len(matches)} file(s):")
    for match in sorted(matches, key=lambda x: x['score'], reverse=True):
        # Derive the displayed path from SOURCE_DIR, the same way the copy
        # step does, instead of stripping a hard-coded 'assets/Norme/' string
        # that silently stops matching if SOURCE_DIR or the separator changes.
        print(f"    - {os.path.relpath(match['file'], SOURCE_DIR)} (score: {match['score']:.2f})")
    print()

# Show files that were selected for copying.
# NOTE(review): this lists files_to_copy, i.e. everything selected; a file
# whose copy raised an error in the copy step would still appear here.
print(f"\n{'='*80}")
print(f"FILES COPIED TO {DEST_DIR}:")
print(f"{'='*80}")
for file_path in sorted(files_to_copy):
    rel_path = os.path.relpath(file_path, SOURCE_DIR)
    print(f"  - {rel_path}")


DETAILED MATCHING SUMMARY

Target: '"Consenso Regione Lombardia'
  Matched 7 file(s):
    - Sara/Convenzione Basilea.pdf (score: 0.62)
    - Giorgia/TAR Lombardia.pdf (score: 0.56)
    - Debora/Regolamento Regionale 24 marzo 2006.pdf (score: 0.52)
    - Debora/Indicazioni per relazione tecnica.pdf (score: 0.44)
    - Giorgia/Studio definizione valori di fondo Valcamonica.pdf (score: 0.42)
    - Sara/RENTRI_con sanzioni.pdf (score: 0.40)
    - Giorgia/Presentazione novità ADR 2025.pdf (score: 0.40)

Target: '"Legge n. 147 del 2013'
  Matched 16 file(s):
    - Sara/Legge n. 147_2013.pdf (score: 0.84)
    - Sara/legge 70 del 25 gennaio 1994.pdf (score: 0.57)
    - Debora/DGR 196 del 22_06_2005.pdf (score: 0.47)
    - Sara/Legge 25 gennaio 1994 n. 70.pdf (score: 0.46)
    - Giorgia/D.M_protocollo 194 del 07-08-2023.pdf (score: 0.44)
    - Sara/Regolamento TARI dal 2023.aspx.pdf (score: 0.43)
    - Giorgia/DGR n.8-2838 del 27 giugno 2006.pdf (score: 0.42)
    - Giorgia/127-Del6_16.10.2023.