In [1]:
# ============================================================================
# COMPLETE PROJECT IMPORTS
# ============================================================================

# Standard library
import os
import sys
import json
import random
import warnings
import re
import time
import csv
import math
from io import StringIO
from pathlib import Path
from itertools import combinations
from collections import Counter
from typing import List, Dict, Set, Tuple, Any
from dataclasses import dataclass

# Third party - general
import requests
import urllib3
import numpy as np
import pandas as pd
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.utils.class_weight import compute_class_weight

# Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer

# Graph/Network
import networkx as nx
from networkx.algorithms.community import louvain_communities

# Document processing
from unstructured.partition.pdf import partition_pdf
from unstructured.cleaners.core import clean_extra_whitespace

# ============================================================================
# CONFIGURATION
# ============================================================================

# Silence Python warnings and noisy library logging so cell output stays readable.
# NOTE(review): filterwarnings('ignore') hides every warning category, including
# deprecation notices that may matter later; consider narrowing it.
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'
# NOTE(review): these variables are assigned after transformers and
# sentence_transformers were imported earlier in this cell, so they may be
# too late to influence those imports; confirm they have the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TRANSFORMERS_NO_TF'] = '1'
# The downloader in this notebook issues HTTPS requests with verify=False,
# which would otherwise emit an InsecureRequestWarning per request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
transformers_logging.set_verbosity_error()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
''' 
1: This cell downloads an arXiv paper dataset by fetching 100 papers per subject.
It retrieves papers from multiple academic disciplines (mathematics, physics, computer science, statistics, economics, biology, and finance).
For each paper, the code downloads the PDF and saves metadata (title, authors, abstract, publication date) as a text file.'''

class ArxivDownloader:
    """Download recent arXiv papers (PDF plus a metadata text file) grouped by subject.

    Papers are discovered through the arXiv export API, one category at a
    time, newest first. For every accepted paper two files are written into
    ``<base_dir>/<subject>/``: ``<arxiv_id>.pdf`` and
    ``<arxiv_id>_<sanitized title>.txt``.
    """

    # arXiv category code -> subject folder name.
    CATEGORIES = {
        # Mathematics
        "math.AG": "mathematics", "math.AT": "mathematics", "math.AP": "mathematics",
        "math.CT": "mathematics", "math.CA": "mathematics", "math.CO": "mathematics",
        "math.DG": "mathematics", "math.DS": "mathematics", "math.FA": "mathematics",
        "math.GM": "mathematics",
        # Physics
        "physics.acc-ph": "physics", "physics.ao-ph": "physics", "physics.atom-ph": "physics",
        "physics.atm-clus": "physics", "physics.bio-ph": "physics", "physics.chem-ph": "physics",
        "physics.class-ph": "physics", "physics.comp-ph": "physics", "physics.data-an": "physics",
        "physics.flu-dyn": "physics",

        # Computer Science
        "cs.AI": "computer_science", "cs.AR": "computer_science", "cs.CC": "computer_science",
        "cs.CE": "computer_science", "cs.CG": "computer_science", "cs.CL": "computer_science",
        "cs.CR": "computer_science", "cs.CV": "computer_science", "cs.CY": "computer_science",
        "cs.DB": "computer_science",

        # Statistics
        "stat.AP": "statistics", "stat.CO": "statistics", "stat.ME": "statistics",
        "stat.ML": "statistics", "stat.OT": "statistics", "stat.TH": "statistics",

        # Economics
        "econ.EM": "economics", "econ.GN": "economics", "econ.TH": "economics",

        # Biology
        "q-bio.BM": "biology", "q-bio.CB": "biology", "q-bio.GN": "biology",
        "q-bio.MN": "biology", "q-bio.NC": "biology", "q-bio.OT": "biology",
        "q-bio.PE": "biology", "q-bio.QM": "biology", "q-bio.SC": "biology",
        "q-bio.TO": "biology",

        # Finance
        "q-fin.CP": "finance", "q-fin.EC": "finance", "q-fin.GN": "finance",
        "q-fin.MF": "finance", "q-fin.PM": "finance", "q-fin.PR": "finance",
        "q-fin.RM": "finance", "q-fin.ST": "finance", "q-fin.TR": "finance",
    }

    # The progress line is redrawn in place with '\r'. Padding it to a fixed
    # width overwrites leftovers from a previous, longer subject name.
    _PROGRESS_WIDTH = 60

    def __init__(self, base_dir="paper_dataset"):
        """Create the output directory (including parents) and an HTTP session."""
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        # NOTE(review): TLS certificate verification is disabled for every
        # request made by this class. Left unchanged because it may be a
        # deliberate workaround for the execution environment; re-enable it
        # if that is not the case.
        self.session.verify = False

    def download_papers(self, category: str, subject: str, limit: int = 100, progress_callback=None) -> int:
        """Download up to ``limit`` papers of one category into the subject folder.

        Args:
            category: arXiv category code used in the API query.
            subject: Folder name (under ``base_dir``) the files are stored in.
            limit: Maximum number of papers to save.
            progress_callback: Optional zero-argument callable invoked after
                each successfully saved paper.

        Returns:
            The number of papers actually saved.

        Raises:
            requests.HTTPError: If the listing API answers with an error status.
        """
        downloaded = 0
        start = 0
        batch_size = 50
        # Upper bound on inspected entries, so a category with many failing
        # PDFs cannot keep the loop running indefinitely.
        max_attempts = limit * 3
        attempts = 0
        subject_dir = self.base_dir / subject
        subject_dir.mkdir(parents=True, exist_ok=True)

        while downloaded < limit and attempts < max_attempts:
            response = self._request_batch(category, start, batch_size)
            entries = response.text.split('<entry>')

            # The first chunk is the feed header; nothing after it means no more results.
            if len(entries) <= 1:
                break

            for entry in entries[1:]:
                if downloaded >= limit:
                    break

                attempts += 1

                metadata = self._extract_metadata(entry, category)
                if not metadata:
                    continue

                if not self._download_pdf(metadata['arxiv_id'], subject_dir):
                    continue

                self._save_abstract(metadata, subject_dir)

                downloaded += 1
                if progress_callback:
                    progress_callback()

            start += batch_size
            # Pause between listing requests.
            time.sleep(3)

        return downloaded

    def _request_batch(self, category: str, start: int, max_results: int):
        """Fetch one page of the category listing, newest submissions first."""
        url = "https://export.arxiv.org/api/query"
        params = {
            "search_query": f"cat:{category}",
            "start": start,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending"
        }
        response = self.session.get(url, params=params, timeout=30, verify=False)
        response.raise_for_status()
        return response

    def _extract_metadata(self, entry: str, category: str) -> dict:
        """Parse one ``<entry>`` fragment of the feed.

        Returns:
            A dict with ``arxiv_id``, ``title``, ``abstract``, ``authors``,
            ``date`` and ``category``, or ``None`` when the id, title or
            summary element is missing.
        """
        id_match = re.search(r'<id>(.*?)</id>', entry)
        title_match = re.search(r'<title>(.*?)</title>', entry, re.DOTALL)
        summary_match = re.search(r'<summary>(.*?)</summary>', entry, re.DOTALL)
        published_match = re.search(r'<published>(.*?)</published>', entry)
        authors = re.findall(r'<name>(.*?)</name>', entry)

        if not (id_match and title_match and summary_match):
            return None

        # Last URL segment without the version suffix, e.g. "2401.01234v2" -> "2401.01234".
        arxiv_id = id_match.group(1).split('/')[-1].split('v')[0]
        title = self._clean_text(title_match.group(1))
        abstract = self._clean_text(summary_match.group(1))
        date = published_match.group(1)[:10] if published_match else "N/A"

        return {
            'arxiv_id': arxiv_id,
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'date': date,
            'category': category
        }

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) into one space."""
        return re.sub(r'\s+', ' ', text).strip()

    def _download_pdf(self, arxiv_id: str, subject_dir: Path) -> bool:
        """Download the PDF of ``arxiv_id`` into ``subject_dir``.

        Returns:
            ``True`` when a PDF was written. ``False`` on a network error, a
            non-200 status, or a body that does not look like a PDF, so the
            caller simply moves on to the next entry.
        """
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

        try:
            response = self.session.get(pdf_url, timeout=60, verify=False)
        except requests.RequestException:
            # A timeout or connection reset on one paper must not abort the whole run.
            return False
        if response.status_code != 200:
            return False
        # Skip error or challenge pages that are served with status 200.
        if b'%PDF-' not in response.content[:1024]:
            return False

        filepath = subject_dir / f"{arxiv_id}.pdf"
        with open(filepath, 'wb') as f:
            f.write(response.content)
        return True

    def _save_abstract(self, metadata: dict, subject_dir: Path):
        """Write the metadata of one paper to ``<arxiv_id>_<sanitized title>.txt``."""
        # Keep only word characters, whitespace and '-', cap the length, then
        # turn whitespace runs into underscores to get a safe file name.
        safe_title = re.sub(r'[^\w\s-]', '', metadata['title'])[:100]
        safe_title = re.sub(r'\s+', '_', safe_title)

        filename = f"{metadata['arxiv_id']}_{safe_title}.txt"
        filepath = subject_dir / filename

        content = f"""ArXiv ID: {metadata['arxiv_id']}
Title: {metadata['title']}
Authors: {', '.join(metadata['authors'])}
Date: {metadata['date']}
Category: {metadata['category']}
URL: https://arxiv.org/abs/{metadata['arxiv_id']}
ABSTRACT: {metadata['abstract']}
"""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

    def download_dataset(self, papers_per_subject: int = 100):
        """Download ``papers_per_subject`` papers for every subject in ``CATEGORIES``.

        The per-subject quota is split as evenly as possible across the
        subject's categories; the first categories absorb the remainder.
        """
        subject_categories = {}
        for cat, sub in self.CATEGORIES.items():
            subject_categories.setdefault(sub, []).append(cat)

        total_papers = papers_per_subject * len(subject_categories)
        downloaded_total = 0

        print(f"Starting download: {total_papers} papers across {len(subject_categories)} subjects\n")

        for subject, categories in subject_categories.items():
            papers_per_cat, extra = divmod(papers_per_subject, len(categories))

            def update_progress():
                nonlocal downloaded_total
                downloaded_total += 1
                percentage = (downloaded_total / total_papers) * 100
                line = f' {downloaded_total}/{total_papers} ({percentage:.1f}%) - {subject}'
                print(f'\r{line:<{self._PROGRESS_WIDTH}}', end='', flush=True)

            for i, cat in enumerate(categories):
                limit = papers_per_cat + (1 if i < extra else 0)
                self.download_papers(cat, subject, limit, progress_callback=update_progress)

        print(f'\n{downloaded_total}/{total_papers}')

# Fetch the raw corpus: 100 papers for each subject defined in
# ArxivDownloader.CATEGORIES, stored under ./arxiv_paper_dataset/<subject>/.
downloader = ArxivDownloader(base_dir="arxiv_paper_dataset")
downloader.download_dataset(papers_per_subject=100)

Starting download: 700 papers across 7 subjects

 673/700 (96.1%) - financecsscience
673/700


In [None]:
'''
2: This cell processes the downloaded arXiv papers to create a structured JSON dataset. 
It extracts metadata (title and abstract) from the text files and full paper content from PDFs.
The code applies text cleaning to remove LaTeX formatting and mathematical expressions, then validates that papers meet minimum length requirements. 
It processes up to 50 papers per subject category, filtering out any papers with incomplete metadata or insufficient content.'''


class SuppressOutput:
    """Context manager that discards everything written to stdout and stderr.

    On entry both streams are swapped for throwaway in-memory buffers; on
    exit the streams that were active at entry are put back. Exceptions
    raised inside the block are not suppressed.
    """

    def __enter__(self):
        # Remember the live streams so __exit__ can restore exactly these objects.
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = StringIO(), StringIO()
        return self

    def __exit__(self, *args):
        sys.stdout, sys.stderr = self.old_stdout, self.old_stderr



def extract_metadata_from_txt(txt_path):
    """Read the title and abstract from a metadata text file.

    The file is expected to contain a ``Title: ...`` line and an
    ``ABSTRACT: ...`` section, each starting at the beginning of a line.

    Args:
        txt_path: Path of the metadata file.

    Returns:
        A dict with the keys ``'title'`` and ``'abstract'``. A field that
        cannot be found is left as an empty string, and an unreadable file
        yields two empty strings.
    """
    metadata = {'title': '', 'abstract': ''}

    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError):
        # Best effort: callers skip papers whose metadata comes back empty.
        return metadata

    # Both labels are anchored to the start of a line. Without the anchor the
    # case-insensitive "ABSTRACT" pattern also matches the word "Abstract"
    # inside a paper title and captures the rest of the file from there.
    title_match = re.search(r'^Title[:\s]+([^\n]+)', content,
                            re.IGNORECASE | re.MULTILINE)
    if title_match:
        metadata['title'] = title_match.group(1).strip()

    abstract_match = re.search(r'^ABSTRACT[:\s]+([^\n]+.*?)(?:\n\n|\Z)', content,
                               re.DOTALL | re.IGNORECASE | re.MULTILINE)
    if abstract_match:
        metadata['abstract'] = abstract_match.group(1).strip()

    return metadata


def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, returning ``""`` when extraction fails.

    The element texts returned by ``partition_pdf`` are joined with blank
    lines. Console output produced during parsing is suppressed.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The extracted text, or an empty string when parsing raised an error
        or produced 100 characters or fewer.
    """
    try:
        with SuppressOutput():
            elements = partition_pdf(filename=pdf_path, strategy="auto", languages=["eng"])
            text = "\n\n".join([el.text for el in elements])
        return text if len(text) > 100 else ""
    except Exception:
        # Still best effort, but no longer a bare `except`: KeyboardInterrupt
        # and SystemExit now propagate, so a long batch run can be stopped.
        return ""


def clean_text(text):
    """Remove LaTeX commands and inline math, then normalize whitespace.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ""

    # Drop `\command{...}` constructs, then anything enclosed in a `$...$` pair.
    without_commands = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)
    without_math = re.sub(r'\$[^$]+\$', '', without_commands)

    return clean_extra_whitespace(without_math).strip()


def find_introduction(text):
    """Cut ``text`` so that it starts at its introduction heading.

    The patterns are tried in order, each against the whole text and
    ignoring case; the first pattern that matches anywhere decides the cut.

    Returns:
        ``(text_from_heading, True)`` when a heading was found, otherwise
        ``(text, False)`` with the text untouched.
    """
    heading_patterns = (r'\b1\.?\s+Introduction\b', r'\bIntroduction\b', r'\bINTRODUCTION\b')

    candidates = (re.search(pattern, text, re.IGNORECASE) for pattern in heading_patterns)
    first_hit = next((hit for hit in candidates if hit), None)

    if first_hit is None:
        return text, False
    return text[first_hit.start():].strip(), True


def process_papers(base_path, max_per_subject=50):
    """Build article records from the downloaded PDFs and metadata files.

    For every subject folder under ``base_path`` the PDFs are visited in
    sorted file-name order until ``max_per_subject`` papers were accepted.
    A paper is accepted when its metadata file provides a title and an
    abstract, the PDF yields at least 500 characters, and an introduction
    heading followed by at least 500 characters is found.

    Args:
        base_path: Directory that contains one sub-folder per subject.
        max_per_subject: Maximum number of accepted papers per subject.

    Returns:
        A list of article dicts with the keys ``id``, ``title``,
        ``description``, ``full_text``, ``subject``, ``version``, ``type``
        and ``folder``. Folder tags are drawn with ``random``.
    """
    subjects = ['biology', 'computer_science', 'economics', 'finance',
                'mathematics', 'physics', 'statistics']
    # The progress line is redrawn with '\r'; padding wipes the remains of a
    # previous, longer subject name.
    progress_width = 60

    articles = []
    total_needed = max_per_subject * len(subjects)

    for subject in subjects:
        subject_path = os.path.join(base_path, subject)

        if not os.path.isdir(subject_path):
            continue

        # os.listdir order is arbitrary; sorting makes the selection of the
        # first `max_per_subject` valid papers reproducible between runs.
        file_names = sorted(os.listdir(subject_path))
        pdf_files = [name for name in file_names if name.endswith('.pdf')]

        # Metadata files are named "<arxiv_id>_<title>.txt". Index them once
        # by that id prefix instead of re-listing the folder for every PDF
        # and matching the id as a substring anywhere in the name.
        txt_by_id = {}
        for name in file_names:
            if name.endswith('.txt'):
                txt_by_id.setdefault(name.split('_', 1)[0], name)

        subject_count = 0

        for pdf_file in pdf_files:
            if subject_count >= max_per_subject:
                break

            arxiv_id = os.path.splitext(pdf_file)[0]
            txt_file = txt_by_id.get(arxiv_id)
            if txt_file is None:
                continue

            metadata = extract_metadata_from_txt(os.path.join(subject_path, txt_file))
            if not metadata['title'] or not metadata['abstract']:
                continue

            full_text = extract_text_from_pdf(os.path.join(subject_path, pdf_file))
            if len(full_text) < 500:
                continue

            cleaned = clean_text(full_text)
            final_text, found_intro = find_introduction(cleaned)

            if not found_intro or len(final_text) < 500:
                continue

            articles.append({
                'id': f'id{len(articles) + 1}',
                'title': metadata['title'],
                'description': metadata['abstract'],
                'full_text': final_text,
                'subject': subject,
                'version': [],
                'type': 'paper',
                # One to five random folder tags out of c1..c100.
                'folder': [f"c{random.randint(1,100)}" for _ in range(random.randint(1,5))]
            })
            subject_count += 1

            processed_total = len(articles)
            percentage = (processed_total / total_needed) * 100
            line = f' {processed_total}/{total_needed} ({percentage:.1f}%) - {subject}'
            print(f'\r{line:<{progress_width}}', end='', flush=True)

    return articles

def create_json_dataset(output_file="dataset_papers.json", max_per_subject=50,
                        base_path="arxiv_paper_dataset"):
    """Process the downloaded papers and store them as a JSON list.

    Args:
        output_file: Path of the JSON file to write.
        max_per_subject: Maximum number of papers kept per subject.
        base_path: Directory holding the per-subject folders. The default is
            the location that used to be hard-coded in this function.
    """
    articles = process_papers(base_path, max_per_subject)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)

    print(f'\n File saved: {output_file}')
    print(f'Articles processed: {len(articles)}\n')


# Build dataset_papers.json with at most 50 validated papers per subject.
create_json_dataset(output_file="dataset_papers.json", max_per_subject=50)

 350/350 (100.0%) - statisticsience
 File saved: dataset_papers.json
Articles processed: 350



In [2]:
'''3: 
For each original paper, it creates 5 modified versions by randomly removing 10-50% of the text from different portions.
It then organizes all documents (originals + versions) by random folder tags and generates document pairs with three types of labels: 
"versions" (same paper with modifications), "similar" (different papers from the same subject), and "unrelated" (papers from different subjects).'''

class DocumentPair:
    """Two documents (ids and texts) together with their relation label.

    ``label`` is the numeric class and ``label_name`` its readable name.
    """

    def __init__(self, doc1_id, doc2_id, doc1_text, doc2_text, label, label_name):
        self.doc1_id, self.doc2_id = doc1_id, doc2_id
        self.doc1_text, self.doc2_text = doc1_text, doc2_text
        self.label, self.label_name = label, label_name

def remove_text_portion(text, percentage):
    """Return ``text`` with roughly ``percentage`` percent of its characters removed.

    The removed span is taken from a randomly chosen place: the start, the
    end, the middle, or split between start and end.

    Args:
        text: Source string.
        percentage: Share of the text to remove, in percent. The resulting
            character count is clamped to ``[0, len(text)]``.

    Returns:
        The shortened string; always ``len(text) - remove_len`` characters long.
    """
    text_len = len(text)
    remove_len = int(text_len * percentage / 100)
    remove_len = max(0, min(remove_len, text_len))
    mode = random.choice(['start', 'end', 'middle', 'start_end'])

    if mode == 'start':
        return text[remove_len:]
    if mode == 'end':
        # An explicit end index is used because text[:-0] is '' rather than the
        # full text, which wiped short documents whenever remove_len was 0.
        return text[:text_len - remove_len]
    if mode == 'middle':
        start = (text_len - remove_len) // 2
        return text[:start] + text[start + remove_len:]

    # 'start_end': split the removal between both ends. The back part takes
    # the odd character so that exactly remove_len characters are dropped.
    front = remove_len // 2
    back = remove_len - front
    return text[front:text_len - back]

def get_next_id(articles):
    """Return the first unused numeric suffix for ids of the form ``id<N>``.

    Articles whose ``'id'`` is missing or does not start with ``'id'`` are
    ignored. With no matching article the result is 1.
    """
    candidate_ids = (entry.get('id', '') for entry in articles)
    used_numbers = (int(doc_id[2:]) for doc_id in candidate_ids if doc_id.startswith('id'))
    return max(used_numbers, default=0) + 1

def determine_label(doc1, doc2):
    """Classify the relation between two documents.

    Returns:
        ``'versions'`` when either document lists the other's id in its
        ``'version'`` list, ``'similar'`` when both share the same non-None
        ``'subject'``, and ``'unrelated'`` otherwise.
    """
    linked = doc2['id'] in doc1.get('version', []) or doc1['id'] in doc2.get('version', [])
    if linked:
        return 'versions'

    subject_1, subject_2 = doc1.get('subject'), doc2.get('subject')
    same_subject = subject_1 == subject_2 and subject_1 is not None
    return 'similar' if same_subject else 'unrelated'

def process_pipeline(input_file="dataset_papers.json", versions_per_doc=5,
                     min_removal_pct=10, max_removal_pct=50,
                     docs_output="dataset_papers_versions.json",
                     pairs_output="document_pairs.json"):
    """Create truncated versions of every paper and labelled document pairs.

    Steps:
      1. Load the original articles from ``input_file``.
      2. For each article create ``versions_per_doc`` copies with a random
         share of the text removed, and cross-link the article and its
         copies through their ``'version'`` lists.
      3. Group all documents by folder tag and form every pair of documents
         that share a folder (each pair once).
      4. Write the documents and the pair labels to JSON files.

    Args:
        input_file: JSON file with the original articles.
        versions_per_doc: Number of truncated copies per article.
        min_removal_pct: Lower bound (inclusive) of the removed share, in percent.
        max_removal_pct: Upper bound (inclusive) of the removed share, in percent.
        docs_output: Path of the JSON file that receives all documents.
        pairs_output: Path of the JSON file that receives ids and labels of the pairs.

    Returns:
        ``(all_docs, pairs)``: the list of document dicts (originals followed
        by versions) and the shuffled list of ``DocumentPair`` objects.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    num_original = len(articles)
    new_versions = []
    next_id = get_next_id(articles)

    for article in articles:
        article.setdefault('version', [])
        article.setdefault('folder', [])

        # Reserve all sibling ids first so every copy can list its siblings
        # in one step, with no second pass to patch the lists.
        version_ids = [f'id{next_id + offset}' for offset in range(versions_per_doc)]
        next_id += versions_per_doc

        for v_id in version_ids:
            percentage = random.randint(min_removal_pct, max_removal_pct)
            new_versions.append({
                'id': v_id,
                'title': article['title'],
                'description': article.get('description', ''),
                'full_text': remove_text_portion(article['full_text'], percentage),
                'subject': article['subject'],
                'version': [article['id']] + [vid for vid in version_ids if vid != v_id],
                'type': article['type'],
                # Copies inherit the folder tags, so they always share a
                # folder with their original.
                'folder': article['folder'].copy()
            })

        article['version'].extend(version_ids)

    all_docs = articles + new_versions

    by_folder = {}
    for doc in all_docs:
        for folder in doc.get('folder', []):
            by_folder.setdefault(folder, []).append(doc)

    LABEL_MAP = {'unrelated': 0, 'similar': 1, 'versions': 2}
    pairs = []
    seen_pairs = set()

    for docs in by_folder.values():
        for d1, d2 in combinations(docs, 2):
            # Two documents can share several folders; keep each pair once.
            pair_key = tuple(sorted([d1['id'], d2['id']]))
            if pair_key not in seen_pairs:
                seen_pairs.add(pair_key)
                label_name = determine_label(d1, d2)
                pairs.append(DocumentPair(
                    d1['id'], d2['id'],
                    d1.get('full_text', ''), d2.get('full_text', ''),
                    LABEL_MAP[label_name], label_name
                ))

    random.shuffle(pairs)

    with open(docs_output, 'w', encoding='utf-8') as f:
        json.dump(all_docs, f, ensure_ascii=False, indent=2)

    # Only ids and labels are stored; the texts live in the documents file.
    with open(pairs_output, 'w', encoding='utf-8') as f:
        json.dump([{
            'doc1_id': p.doc1_id,
            'doc2_id': p.doc2_id,
            'label': p.label,
            'label_name': p.label_name
        } for p in pairs], f, indent=2, ensure_ascii=False)

    v_c = sum(1 for p in pairs if p.label_name == 'versions')
    s_c = sum(1 for p in pairs if p.label_name == 'similar')
    u_c = sum(1 for p in pairs if p.label_name == 'unrelated')

    print(f' Total Documents: {len(all_docs)} ({num_original} orig + {len(new_versions)} versions)')
    print(f' Total Pairs: {len(pairs)} [Version: {v_c}, Similar: {s_c}, Unrelated: {u_c}]\n')

    return all_docs, pairs

# Generate the truncated versions and the labelled pairs; both results are
# kept in memory for the following cells and also written to JSON files.
all_docs, pairs = process_pipeline("dataset_papers.json")

 Total Documents: 2100 (350 orig + 1750 versions)
 Total Pairs: 193008 [Version: 5250, Similar: 26082, Unrelated: 161676]



In [3]:
'''4:
The prepare_datasets function splits the document pairs into training (80%) and validation (20%) sets 
using stratified sampling to maintain label distribution, then creates DataLoader objects with batch size 8 for efficient model training '''

class DocumentPairDataset(Dataset):
    """Torch dataset over document pairs.

    Each item is the tuple ``(text_1, text_2, label)`` where ``label`` is an
    element of a ``torch.long`` tensor built from the pairs' labels.
    """

    def __init__(self, pairs):
        self.texts_1, self.texts_2 = [], []
        raw_labels = []
        for pair in pairs:
            self.texts_1.append(pair.doc1_text)
            self.texts_2.append(pair.doc2_text)
            raw_labels.append(pair.label)
        self.labels = torch.tensor(raw_labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts_1[idx], self.texts_2[idx], self.labels[idx]


def collate_fn(batch):
    """Turn a list of ``(text_1, text_2, label)`` items into one batch dict.

    Returns:
        A dict with ``'texts_1'`` and ``'texts_2'`` as lists of strings and
        ``'labels'`` as the stacked label tensor.
    """
    left_texts, right_texts, targets = map(list, zip(*batch))
    return dict(texts_1=left_texts, texts_2=right_texts, labels=torch.stack(targets))


def prepare_datasets(pairs, test_size=0.2, batch_size=8):
    """Split the pairs into train/validation sets and wrap them in loaders.

    The split is stratified on the pair labels and uses a fixed random
    state of 42. Only the training loader shuffles.

    Returns:
        ``(train_dataset, val_dataset, train_loader, val_loader)``.
    """
    strata = [pair.label for pair in pairs]
    train_pairs, val_pairs = train_test_split(
        pairs, test_size=test_size, random_state=42, stratify=strata
    )
    print(f"Train: {len(train_pairs)} samples | Val: {len(val_pairs)} samples")

    train_dataset = DocumentPairDataset(train_pairs)
    val_dataset = DocumentPairDataset(val_pairs)

    # Options shared by both loaders.
    loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
    print(f"Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

    return train_dataset, val_dataset, train_loader, val_loader

# 80/20 stratified split of the pairs, served in batches of 8.
train_dataset, val_dataset, train_loader, val_loader = prepare_datasets(pairs=pairs, test_size=0.2, batch_size=8)

Train: 154406 samples | Val: 38602 samples
Train batches: 19301 | Val batches: 4826


In [4]:
'''5:
This cell defines a Siamese neural network that classifies document pairs into three categories (unrelated, similar, or versions). 
It uses a pre-trained sentence transformer to encode both documents, then combines their embeddings with difference and product features before 
passing them through a classifier.'''

class SiameseDocumentClassifier(nn.Module):
    """Classify a pair of texts from the sentence embeddings of both texts.

    Both texts go through the same ``SentenceTransformer`` encoder. The
    classifier head receives the concatenation of the two embeddings, their
    absolute difference and their element-wise product.

    NOTE(review): the embeddings come from ``SentenceTransformer.encode``,
    which is an inference helper. It presumably runs without autograd, in
    which case only the classifier head is trained and the encoder weights
    stay at their pre-trained values. Confirm against the library docs.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", num_classes=3, 
                 dropout=0.1, class_weights=None):
        super().__init__()

        self.encoder = SentenceTransformer(model_name)
        self.hidden_size = self.encoder.get_sentence_embedding_dimension()

        # Four blocks of hidden_size features: emb1, emb2, |emb1 - emb2|, emb1 * emb2.
        pair_feature_dim = 4 * self.hidden_size
        head_layers = [
            nn.Linear(pair_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Registered as a buffer so the weights move with the module on .to(device).
        self.register_buffer('class_weights', class_weights)

    def forward(self, texts_1, texts_2, labels=None):
        """Return ``{"loss": ..., "logits": ...}`` for a batch of text pairs.

        ``loss`` is the (optionally class-weighted) cross entropy when
        ``labels`` is given and ``None`` otherwise.
        """
        emb1, emb2 = (
            self.encoder.encode(batch, convert_to_tensor=True, show_progress_bar=False)
            for batch in (texts_1, texts_2)
        )

        pair_features = torch.cat([emb1, emb2, (emb1 - emb2).abs(), emb1 * emb2], dim=1)
        logits = self.classifier(pair_features)

        if labels is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)

        return {"loss": loss, "logits": logits}


def compute_class_weights(train_loader, device):
    """Compute 'balanced' class weights from the labels served by ``train_loader``.

    The loader is iterated once to collect all labels. The returned float
    tensor lives on ``device`` and has one weight per class that occurs in
    the data, in ascending class order.
    """
    labels = np.array([
        label
        for batch in train_loader
        for label in batch['labels'].numpy()
    ])

    present_classes = np.unique(labels)
    balanced = compute_class_weight('balanced', classes=present_classes, y=labels)

    return torch.tensor(balanced, dtype=torch.float32).to(device)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Class weights derived from the training labels; used to weight the loss.
class_weights = compute_class_weights(train_loader, device)

# The explicit model_name overrides the class default ("all-MiniLM-L6-v2").
siamese_model = SiameseDocumentClassifier(
    model_name="sentence-transformers/all-mpnet-base-v2",
    num_classes=3,
    dropout=0.1,
    class_weights=class_weights
)
# Trailing ';' keeps the notebook from echoing the module repr.
siamese_model.to(device);

Device: cuda


In [None]:
'''6:
This cell trains the Siamese model over 7 epochs using the AdamW optimizer. 
It tracks training and validation loss/accuracy for each epoch, automatically reduces the learning rate when validation loss stops improving
and saves the best model based on validation accuracy. '''

def train_epoch(model, train_loader, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: Module called as ``model(texts_1, texts_2, labels)`` that
            returns a dict with ``'loss'`` and ``'logits'``.
        train_loader: Iterable of batches with the keys ``'texts_1'``,
            ``'texts_2'`` and ``'labels'``; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device the label tensors are moved to.

    Returns:
        ``(average batch loss, accuracy)``. Both are 0 for an empty loader.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        labels = batch['labels'].to(device)
        outputs = model(batch['texts_1'], batch['texts_2'], labels)

        loss = outputs['loss']
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predictions = torch.argmax(outputs['logits'], dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

    # Guard both divisions: the accuracy was already protected against an
    # empty loader, the average loss was not and raised ZeroDivisionError.
    num_batches = len(train_loader)
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct / total if total > 0 else 0
    return avg_loss, accuracy


def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: Module called as ``model(texts_1, texts_2, labels)`` that
            returns a dict with ``'loss'`` and ``'logits'``.
        val_loader: Iterable of batches with the keys ``'texts_1'``,
            ``'texts_2'`` and ``'labels'``; must support ``len()``.
        device: Device the label tensors are moved to.

    Returns:
        ``(average batch loss, accuracy, predictions, labels)`` where the
        last two are flat lists in loader order. An empty loader yields
        ``(0.0, 0.0, [], [])``.
    """
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            labels = batch['labels'].to(device)
            outputs = model(batch['texts_1'], batch['texts_2'], labels)

            total_loss += outputs['loss'].item()
            predictions = torch.argmax(outputs['logits'], dim=1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # The accuracy was already guarded against an empty loader; the average
    # loss was not and raised ZeroDivisionError.
    num_batches = len(val_loader)
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0

    hits = sum(1 for pred, label in zip(all_preds, all_labels) if pred == label)
    accuracy = hits / len(all_labels) if all_labels else 0.0
    return avg_loss, accuracy, all_preds, all_labels


def train_model(model, train_loader, val_loader, device, epochs=5, lr=2e-5,
                checkpoint_path='best_model.pth'):
    """Train ``model`` and keep the checkpoint with the best validation accuracy.

    The learning rate is halved when the validation loss has not improved
    for more than two epochs (``ReduceLROnPlateau``). Whenever the
    validation accuracy improves, the model's state dict is saved to
    ``checkpoint_path``.

    Args:
        model: Model to train.
        train_loader: Loader passed to ``train_epoch``.
        val_loader: Loader passed to ``validate``.
        device: Device forwarded to the epoch functions.
        epochs: Number of epochs.
        lr: Initial AdamW learning rate.
        checkpoint_path: File that receives the best state dict.

    Returns:
        ``(history, val_preds, val_labels)``. ``history`` holds the loss and
        accuracy of every epoch. The predictions and labels belong to the
        epoch that produced the saved checkpoint, so the report printed here
        and any downstream analysis describe the best model rather than
        whatever the last epoch happened to be. If no epoch improved on the
        initial accuracy of 0, the last epoch's values are returned.

    Note:
        The in-memory ``model`` keeps the weights of the last epoch; load
        ``checkpoint_path`` to obtain the best weights.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_val_acc = 0.0
    best_preds, best_labels = None, None
    last_preds, last_labels = [], []
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    print(f"Training: {epochs} epochs | LR: {lr}")

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
        val_loss, val_acc, val_preds, val_labels = validate(model, val_loader, device)
        last_preds, last_labels = val_preds, val_labels

        scheduler.step(val_loss)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Keep the predictions that belong to the checkpoint being saved.
            best_preds, best_labels = val_preds, val_labels
            torch.save(model.state_dict(), checkpoint_path)
            print("Best model saved!")

    print(f"\nBest Validation Accuracy: {best_val_acc:.4f}")

    if best_preds is None:
        best_preds, best_labels = last_preds, last_labels

    # With epochs=0 or an empty validation set there is nothing to report.
    if len(best_labels) > 0:
        label_names = ['unrelated', 'similar', 'versions']
        print("\nClassification Report:")
        print(classification_report(best_labels, best_preds, target_names=label_names, digits=4))

    return history, best_preds, best_labels


# Train the classifier for 7 epochs. In the recorded run below a single
# epoch (training plus validation) took roughly three to four hours.
history, val_preds, val_labels = train_model(
    model=siamese_model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=7,
    lr=2e-5
)

Training: 7 epochs | LR: 2e-05

Epoch 1/7


Training: 100%|██████████| 19301/19301 [2:17:39<00:00,  2.34it/s]  
Validation: 100%|██████████| 4826/4826 [30:57<00:00,  2.60it/s]


Train Loss: 0.5443 | Train Acc: 0.8416
Val Loss:   0.3832 | Val Acc:   0.8330
⭐ Best model saved!

Epoch 2/7


Training: 100%|██████████| 19301/19301 [3:07:46<00:00,  1.71it/s]  
Validation: 100%|██████████| 4826/4826 [53:23<00:00,  1.51it/s]  


Train Loss: 0.3207 | Train Acc: 0.8834
Val Loss:   0.2805 | Val Acc:   0.8949
⭐ Best model saved!

Epoch 3/7


Training:  47%|████▋     | 9128/19301 [57:31<1:00:32,  2.80it/s]

In [None]:
'''7: CONFUSION MATRIX & METRICS
Generates confusion matrix, detailed metrics per class, and error analysis
using the best trained model from the previous cell.'''

def plot_confusion_matrix(y_true, y_pred, class_names=['unrelated', 'similar', 'versions']):
    """Plot and save a confusion matrix annotated with counts and row percentages.

    Labels are expected to be the integer class indices
    ``0 .. len(class_names) - 1``, in the order of ``class_names``.

    Args:
        y_true: True class indices.
        y_pred: Predicted class indices.
        class_names: Display names, indexed by class.

    Returns:
        The confusion matrix of shape ``(len(class_names), len(class_names))``.

    Side effects:
        Writes ``confusion_matrix.png`` and shows the figure.
    """
    # Fixing the label set keeps the matrix square and aligned with
    # class_names even when a class is absent from y_true and y_pred;
    # without it the annotation loops below index past the matrix.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Row-normalized percentages; rows without samples stay at 0 rather than NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(cm * 100.0, row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=False, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)

    # White text on dark cells, black on light ones.
    dark_threshold = cm.max() / 2
    for i in range(len(class_names)):
        for j in range(len(class_names)):
            count = cm[i, j]
            percent = cm_percent[i, j]
            color = 'white' if count > dark_threshold else 'black'
            plt.text(j + 0.5, i + 0.5, f'{count}\n({percent:.1f}%)',
                    ha='center', va='center', color=color,
                    fontsize=12, fontweight='bold')

    plt.title('Confusion Matrix\n(Count and Row %)', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12, fontweight='bold')
    plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return cm


def analyze_results(val_labels, val_preds):
    """Print per-class and overall metrics, list the error types, and plot the confusion matrix.

    Args:
        val_labels: True class indices.
        val_preds: Predicted class indices.

    Returns:
        ``(report, cm)``: the classification report as a dict and the
        confusion matrix returned by ``plot_confusion_matrix``.
    """
    class_names = ['unrelated', 'similar', 'versions']

    report = classification_report(
        val_labels,
        val_preds,
        target_names=class_names,
        digits=4,
        output_dict=True,
    )

    # Per-class metrics
    for class_name in class_names:
        stats = report[class_name]
        precision, recall, f1 = stats['precision'], stats['recall'], stats['f1-score']
        print(f"\nClass: {class_name.upper()}")
        print(f"  Precision: {precision:.4f} ({precision*100:.2f}%)")
        print(f"  Recall:    {recall:.4f} ({recall*100:.2f}%)")
        print(f"  F1-Score:  {f1:.4f} ({f1*100:.2f}%)")
        print(f"  Support:   {int(stats['support'])} samples")

    # Overall metrics
    print("OVERALL METRICS")
    accuracy = report['accuracy']
    print(f"Accuracy:        {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Macro Avg F1:    {report['macro avg']['f1-score']:.4f}")
    print(f"Weighted Avg F1: {report['weighted avg']['f1-score']:.4f}")

    # Error analysis
    true_arr = np.array(val_labels)
    pred_arr = np.array(val_preds)
    mismatch = true_arr != pred_arr
    n_errors = mismatch.sum()
    n_total = len(val_labels)

    print("ERROR ANALYSIS")
    print(f"\nTotal errors: {n_errors}/{n_total} ({n_errors/n_total*100:.2f}%)")

    if n_errors > 0:
        # Count every "true → predicted" combination among the mistakes.
        error_types = Counter(
            f"{class_names[true_label]} → {class_names[pred_label]}"
            for true_label, pred_label in zip(true_arr[mismatch], pred_arr[mismatch])
        )

        print("\nError types (frequency):")
        ranked = sorted(error_types.items(), key=lambda item: item[1], reverse=True)
        for error_type, count in ranked:
            pct = count / n_errors * 100
            print(f"  {error_type:<30} {count:>4} ({pct:>5.1f}%)")

    cm = plot_confusion_matrix(val_labels, val_preds, class_names)
    return report, cm


# val_labels / val_preds are the values returned by train_model in the training cell.
report, cm = analyze_results(val_labels, val_preds)

In [6]:
'''8:
This cell generates embeddings for all documents in the dataset using the trained Siamese model's encoder. 
For each document, it creates separate embeddings for the title, the abstract (description), and the full text.'''

def get_single_embedding(text, model, device):
    """Embed one text with ``model.encoder`` and return it as a plain list.

    The model is moved to ``device`` and switched to eval mode on every call;
    the encoder is then run on a one-element batch with gradient tracking
    disabled.

    Args:
        text: The string to embed.
        model: Object exposing ``to``, ``eval`` and an ``encoder`` attribute
            whose ``encode`` method accepts a list of strings.
        device: Device handed to ``model.to``.

    Returns:
        The first row of the encoder output, converted through CPU/NumPy to a
        Python list.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        batch = model.encoder.encode(
            [text],
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        return batch[0].cpu().numpy().tolist()

# Load the raw paper records that will be enriched with embeddings.
with open("dataset_papers_versions.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)


# Embed every non-blank text field of every article; each vector is stored
# in place on the article under "embedding_<field>".
for article in tqdm(dataset, desc="Embeddings"):
    for field in ("title", "description", "full_text"):
        text = article.get(field)
        if text and text.strip():
            article["embedding_" + field] = get_single_embedding(text, siamese_model, device)

# Persist the enriched records for the downstream cells.
with open("dataset_papers_with_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)


Embeddings: 100%|██████████| 2100/2100 [04:18<00:00,  8.12it/s]


In [7]:
'''9: 
This cell implements document clustering within folders using graph-based community detection. 
It builds a weighted similarity graph for documents in each folder based on their embeddings, connecting documents with cosine similarity above 0.70 
(keeping top 8 connections per document). 
The Louvain algorithm then detects communities in this graph to group similar documents into clusters.'''  

def load_dataset_simple(embeddings_path):
    """Read the JSON document list and make sure every record has an "id" key.

    Args:
        embeddings_path: Path to a UTF-8 JSON file containing a list of dicts.

    Returns:
        The parsed list. Records that lacked an "id" key get ``"id": None``;
        existing ids are left untouched.
    """
    with open(embeddings_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    for record in records:
        # Only fills the key when it is absent.
        record.setdefault("id", None)

    return records


def build_folder_weighted_graph(documents, folder_name, embedding_key, 
                                min_similarity=0.70, top_k=8):
    """Build a weighted similarity graph for the documents of one folder.

    Documents are selected when ``folder_name`` is contained in their
    "folder" entry and they carry a non-empty list under ``embedding_key``.
    Each document becomes a node (keyed by its "id", with the whole record
    stored as node attributes). For every document, its ``top_k`` most
    cosine-similar neighbours are linked when the similarity reaches
    ``min_similarity``; the similarity is stored as the edge "weight".

    Args:
        documents: List of document dicts.
        folder_name: Folder to select.
        embedding_key: Key of the embedding list inside each document.
        min_similarity: Minimum cosine similarity for an edge.
        top_k: Maximum number of candidate neighbours examined per document.

    Returns:
        ``(G, valid_docs)``, or ``(None, None)`` when the folder contains no
        document with a usable embedding (the caller already checks for
        ``None``; previously this case crashed inside ``np.vstack``).
    """
    # .get(...) or []: tolerate documents without a "folder" entry, matching
    # how the pipeline enumerates folders.
    folder_docs = [d for d in documents if folder_name in (d.get("folder") or [])]

    valid_docs = []
    for doc in folder_docs:
        emb = doc.get(embedding_key)
        # An empty list cannot be stacked with real vectors, so treat it as missing.
        if isinstance(emb, list) and emb:
            valid_docs.append(doc)

    if not valid_docs:
        return None, None

    emb_matrix = np.vstack([np.array(doc[embedding_key]) for doc in valid_docs])
    sim_matrix = cosine_similarity(emb_matrix)

    G = nx.Graph()
    for doc in valid_docs:
        G.add_node(doc["id"], **doc)

    for i, doc in enumerate(valid_docs):
        sims = sim_matrix[i].copy()
        sims[i] = -1  # never pick the document itself as a neighbour

        top_indices = np.argsort(sims)[::-1][:top_k]

        for j in top_indices:
            if sims[j] < min_similarity:
                continue
            source, target = doc["id"], valid_docs[j]["id"]
            # The similarity matrix is symmetric, so an existing edge already
            # carries the same weight.
            if not G.has_edge(source, target):
                G.add_edge(source, target, weight=float(sims[j]))

    return G, valid_docs


def apply_louvain_clustering(G, resolution=0.8, seed=None):
    """Detect communities in ``G`` with the Louvain algorithm.

    Args:
        G: Graph whose edges carry a "weight" attribute.
        resolution: Louvain resolution parameter.
        seed: Optional random seed forwarded to ``louvain_communities``.
            Louvain is randomized, so pass a fixed value to make the
            clustering reproducible. ``None`` keeps the previous behaviour.

    Returns:
        ``(partition, clusters)`` where ``partition`` maps node -> community
        id and ``clusters`` maps community id -> list of nodes.
    """
    communities = louvain_communities(
        G, weight="weight", resolution=resolution, seed=seed
    )

    partition = {node: cid for cid, nodes in enumerate(communities) for node in nodes}
    # Built directly from the communities instead of re-grouping the partition.
    clusters = {cid: list(nodes) for cid, nodes in enumerate(communities)}

    return partition, clusters


def create_cluster_json(documents, folder_name, clusters):
    """Turn a folder's clusters into a JSON-serialisable summary dict.

    Args:
        documents: List of document dicts, each with an "id" key.
        folder_name: Name stored under "folder".
        clusters: Mapping of cluster id -> list of document ids.

    Returns:
        Dict with "folder", "num_clusters" and, for each cluster ranked by
        decreasing size (1-based), "cluster_<n>" (the id list) and
        "file_names_cluster_<n>" (the "id" of each member document).
    """
    by_id = {}
    for record in documents:
        by_id[record["id"]] = record

    # Largest cluster first; equally sized clusters keep their incoming order.
    ranked = sorted(clusters.values(), key=len, reverse=True)

    payload = {"folder": folder_name, "num_clusters": len(clusters)}
    for position, member_ids in enumerate(ranked, start=1):
        payload[f"cluster_{position}"] = member_ids
        payload[f"file_names_cluster_{position}"] = [
            by_id[member].get("id", member) for member in member_ids
        ]

    return payload

In [None]:
'''10:
This cell defines functions to evaluate clustering quality using multiple metrics.'''

def compute_embedding_metrics(embeddings_matrix, labels):
    """Score a clustering in embedding space.

    Args:
        embeddings_matrix: Array with one embedding row per sample.
        labels: Cluster label of each row.

    Returns:
        Dict with "silhouette_cosine" (silhouette score on precomputed cosine
        distances) and "davies_bouldin". Both are NaN when the scores are
        undefined, i.e. when there are fewer than 2 clusters or every sample
        is its own cluster. scikit-learn raises ``ValueError`` in those
        cases, which used to abort the whole pipeline.
    """
    n_samples = len(labels)
    n_clusters = int(np.unique(labels).size) if n_samples else 0

    # Both scores require 2 <= n_clusters <= n_samples - 1.
    if n_clusters < 2 or n_clusters >= n_samples:
        return {
            "silhouette_cosine": float("nan"),
            "davies_bouldin": float("nan"),
        }

    metrics = {}

    distances = cosine_distances(embeddings_matrix)
    sil_score = silhouette_score(distances, labels, metric='precomputed')
    metrics["silhouette_cosine"] = float(sil_score)

    db_score = davies_bouldin_score(embeddings_matrix, labels)
    metrics["davies_bouldin"] = float(db_score)

    return metrics


def compute_conductance(G, partition):
    """Summarise how strongly each community is connected to the outside.

    For every community with at least two nodes the function computes
    ``external / (internal + external)``, where ``internal`` is the number of
    edges with both endpoints in the community and ``external`` the number of
    edges leaving it. Edge weights are not used. Communities without any edge
    are ignored.

    NOTE(review): this ratio is not the volume-normalised conductance found
    in graph-theory texts; confirm it is the intended definition.

    Args:
        G: Graph exposing ``neighbors(node)``.
        partition: Mapping of node -> community id.

    Returns:
        Dict with "conductance_mean", "conductance_std", "conductance_min"
        and "conductance_max". All four are NaN when no community qualifies
        (previously ``np.min`` raised ``ValueError`` on the empty list).
    """
    communities = {}
    for node, comm_id in partition.items():
        communities.setdefault(comm_id, set()).add(node)

    conductances = []

    for nodes in communities.values():
        if len(nodes) < 2:
            continue

        internal_endpoints = 0
        external_edges = 0

        for node in nodes:
            for neighbor in G.neighbors(node):
                if neighbor in nodes:
                    internal_endpoints += 1
                else:
                    external_edges += 1

        # Each internal edge was seen from both of its endpoints.
        internal_edges = internal_endpoints / 2
        total_edges = internal_edges + external_edges

        if total_edges > 0:
            conductances.append(external_edges / total_edges)

    if not conductances:
        nan = float("nan")
        return {
            "conductance_mean": nan,
            "conductance_std": nan,
            "conductance_min": nan,
            "conductance_max": nan,
        }

    return {
        "conductance_mean": float(np.mean(conductances)),
        "conductance_std": float(np.std(conductances)),
        "conductance_min": float(np.min(conductances)),
        "conductance_max": float(np.max(conductances))
    }


def evaluate_folder_clustering(G, partition, valid_docs, embedding_key="embedding_full_text"):
    """Collect graph-level and embedding-level quality metrics for one folder.

    Args:
        G: Similarity graph of the folder.
        partition: Mapping of node id -> community id.
        valid_docs: Documents of the folder, each with an "id" key.
        embedding_key: Key of the embedding used for the embedding metrics.

    Returns:
        ``{"graph": ..., "embedding": ...}`` holding the outputs of
        ``compute_conductance`` and ``compute_embedding_metrics``.
    """
    graph_metrics = compute_conductance(G, partition)

    lookup = {doc["id"]: doc for doc in valid_docs}

    # Pair each embedding with its community label, restricted to nodes that
    # are known documents and have a community assignment.
    labelled = [
        (lookup[node].get(embedding_key), partition[node])
        for node in G.nodes()
        if node in lookup and node in partition
    ]
    # Drop documents whose embedding is missing or empty.
    labelled = [(vector, label) for vector, label in labelled if vector]

    vectors = np.array([vector for vector, _ in labelled])
    cluster_ids = np.array([label for _, label in labelled])

    return {
        "graph": graph_metrics,
        "embedding": compute_embedding_metrics(vectors, cluster_ids),
    }

In [None]:
'''11:
This cell runs the complete clustering pipeline across all folders in the dataset. 
For each folder, it builds a similarity graph, applies Louvain clustering with specified parameters 
(minimum similarity 0.25, top 6 connections per document, resolution 1), and evaluates the clustering quality.
It computes and displays average clustering metrics'''

def run_complete_pipeline(embeddings_json, output_json,
                          min_similarity=0.70, top_k=8, resolution=0.8,
                          test_folder=None):
    """Cluster every folder of the dataset and write the result to JSON.

    Args:
        embeddings_json: Path of the JSON file with documents and embeddings.
        output_json: Path the per-folder cluster summaries are written to.
        min_similarity: Edge threshold passed to the graph builder.
        top_k: Neighbour count passed to the graph builder.
        resolution: Louvain resolution.
        test_folder: When truthy, only this folder is processed.

    Returns:
        ``(all_results, metrics_results)``: per-folder cluster summaries and
        per-folder quality metrics. Folders with fewer than two usable
        documents are skipped.
    """
    documents = load_dataset_simple(embeddings_json)

    folder_names = set()
    for record in documents:
        folder_names.update(record.get("folder", []))
    all_folders = sorted(folder_names)

    if test_folder:
        all_folders = [test_folder]

    cluster_payloads, folder_metrics = {}, {}

    print(f"Processing {len(all_folders)} folders")

    for folder in tqdm(all_folders, desc="Clustering"):
        graph, folder_docs = build_folder_weighted_graph(
            documents, folder, "embedding_full_text",
            min_similarity=min_similarity, top_k=top_k,
        )

        has_enough_docs = (
            graph is not None
            and folder_docs is not None
            and len(folder_docs) >= 2
        )
        if not has_enough_docs:
            continue

        partition, clusters = apply_louvain_clustering(graph, resolution=resolution)
        cluster_payloads[folder] = create_cluster_json(documents, folder, clusters)
        folder_metrics[folder] = evaluate_folder_clustering(graph, partition, folder_docs)

    with open(output_json, "w", encoding="utf-8") as sink:
        json.dump(cluster_payloads, sink, indent=2, ensure_ascii=False)

    print(f"Saved clustering results to {output_json}")

    return cluster_payloads, folder_metrics


def summarize_metrics(metrics_results):
    """Flatten per-folder metrics into a table and print mean/std per metric.

    Args:
        metrics_results: Mapping of folder -> ``{"graph": {...},
            "embedding": {...}}``.

    Returns:
        ``(df, summary)``: ``df`` has one row per folder with columns
        "folder", "graph_<metric>" and "embedding_<metric>"; ``summary`` has
        one row per metric with columns "mean" and "std". Both are empty
        frames when ``metrics_results`` is empty (previously this raised
        ``KeyError`` because the "folder" column did not exist).
    """
    rows = [
        {
            "folder": folder,
            **{f"graph_{key}": value for key, value in metrics["graph"].items()},
            **{f"embedding_{key}": value for key, value in metrics["embedding"].items()},
        }
        for folder, metrics in metrics_results.items()
    ]

    df = pd.DataFrame(rows)

    # errors="ignore": with no folders there is no "folder" column to drop.
    numeric = df.drop(columns="folder", errors="ignore")
    summary = pd.DataFrame({
        "mean": numeric.mean(),
        "std": numeric.std()
    })

    print("\n=== Average metrics across folders ===")
    print(summary)

    return df, summary


# Cluster every folder, overriding the function defaults with a looser
# similarity threshold (0.25), 6 neighbours per document and resolution 1.
all_results, metrics_results = run_complete_pipeline(
    embeddings_json="dataset_papers_with_embeddings.json",
    output_json="all_folders_fewer_clusters.json",
    min_similarity=0.25,
    top_k=6,
    resolution=1
)

# Per-folder metrics table plus mean/std across folders (also printed).
df_metrics, summary_metrics = summarize_metrics(metrics_results)

print(f"\nProcessed {len(all_results)} folders")
print(f"Total clusters: {sum(r['num_clusters'] for r in all_results.values())}")

In [None]:
'''12:
This cell evaluates clustering quality using purity metrics.
For each cluster, it calculates purity as the fraction of documents belonging to the most common subject. 
It then reports the percentage of clusters achieving purity thresholds of ≥50%, ≥70%, and ≥90%, both per folder and averaged across all folders.''' 

def build_id_to_label_map(dataset_json_path):
    """Map document_id -> subject (ground truth).

    Every id listed in an item's "version" list is labelled with that item's
    subject, and every item's own "id" is labelled with its own subject. The
    own-id mapping is applied last, so a document's own subject wins over a
    label inherited through another item's version list.

    Previously only the "version" ids were mapped, so a document that is not
    listed in any version list was silently excluded from purity.

    Args:
        dataset_json_path: Path to the UTF-8 JSON dataset (a list of dicts).

    Returns:
        Dict mapping document id -> subject.
    """
    with open(dataset_json_path, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    id_to_label = {}
    for item in dataset:
        subject = item.get("subject")
        # "or []": tolerate an explicit null in the JSON.
        for doc_id in item.get("version") or []:
            id_to_label[doc_id] = subject

    for item in dataset:
        own_id = item.get("id")
        if own_id is not None:
            id_to_label[own_id] = item.get("subject")

    return id_to_label


def compute_cluster_purities(folder_data, id_to_label):
    """Compute purity for each cluster in a folder.

    Only keys of the exact form "cluster_<number>" holding a list of ids are
    treated as clusters. A plain ``startswith("cluster_")`` test also matched
    the "cluster_name_<n>" strings written by ``add_folder_names``, whose
    characters were then looked up as if they were document ids.

    Args:
        folder_data: One folder's entry of the cluster JSON.
        id_to_label: Mapping of document id -> ground-truth subject.

    Returns:
        List with one purity value (share of the most common subject) per
        cluster that has at least one labelled document.
    """
    purities = []

    for key, ids in folder_data.items():
        if not re.fullmatch(r"cluster_\d+", key) or not isinstance(ids, list):
            continue

        labels = [id_to_label[i] for i in ids if i in id_to_label]
        if not labels:
            continue

        top_count = Counter(labels).most_common(1)[0][1]
        purities.append(top_count / len(labels))

    return purities


def compute_purity_all_folders(cluster_json_path, dataset_json_path):
    """Compute purity per folder and global average.

    For each folder, reports the fraction of clusters whose purity reaches
    50%, 70% and 90%, then the mean and standard deviation of those
    fractions over all folders that have at least one scored cluster.

    Args:
        cluster_json_path: Path of the per-folder cluster JSON.
        dataset_json_path: Path of the dataset JSON holding the ground truth.

    Returns:
        ``{"per_folder": {...}, "average": {...}}``.
    """
    with open(cluster_json_path, "r", encoding="utf-8") as f:
        clusters_by_folder = json.load(f)

    id_to_label = build_id_to_label_map(dataset_json_path)

    # (key tag, purity cutoff) for each reported threshold.
    thresholds = ((50, 0.50), (70, 0.70), (90, 0.90))

    per_folder = {}
    fractions = {tag: [] for tag, _ in thresholds}

    for folder, folder_data in clusters_by_folder.items():
        purities = compute_cluster_purities(folder_data, id_to_label)
        if not purities:
            continue

        n_clusters = len(purities)
        entry = {"num_clusters": n_clusters}
        for tag, cutoff in thresholds:
            share = sum(p >= cutoff for p in purities) / n_clusters
            entry[f"purity_{tag}_frac"] = share
            fractions[tag].append(share)

        per_folder[folder] = entry

    average = {"num_folders": len(per_folder)}
    for tag, _ in thresholds:
        average[f"purity_{tag}_mean"] = float(np.mean(fractions[tag]))
        average[f"purity_{tag}_std"] = float(np.std(fractions[tag]))

    return {"per_folder": per_folder, "average": average}


def print_purity_results(results):
    """Print purity statistics.

    Args:
        results: Dict with an "average" entry as produced by
            ``compute_purity_all_folders``.
    """
    summary = results["average"]

    print("\n AVERAGE PURITY ACROSS ALL FOLDERS")

    print("Number of folders: {}".format(summary["num_folders"]))
    print("\nClusters with purity:")
    for level in (50, 70, 90):
        mean_share = summary[f"purity_{level}_mean"]
        spread = summary[f"purity_{level}_std"]
        print(f"  ≥ {level}%: {mean_share:.2%} ± {spread:.2%}")
    print("=" * 60)


# Score the clusters written by the clustering cell against the subjects
# stored in the original dataset file.
purity_results = compute_purity_all_folders(
    cluster_json_path="all_folders_fewer_clusters.json",
    dataset_json_path="dataset_papers_versions.json"
)

print_purity_results(purity_results)


In [None]:
'''13:
This cell generates concise titles for documents using Qwen.
It takes the first 800 characters of each document's full text and prompts the LLM to create a title of at most 10 words.
It also computes the embedding of each generated title (used later for similarity search).
Finally, it builds a concatenated title string for each cluster by joining the generated titles of all documents in that cluster (used later to name the clusters).'''

def generate_title(text, tokenizer, model, device):
    """Ask the LLM for a short title summarising the start of ``text``.

    Only the first 800 characters of ``text`` are sent to the model. The
    generated text is cut at the first line break, surrounding quotes are
    removed and every character other than alphanumerics, space and
    ``-_.,`` is dropped.

    Args:
        text: Document text; may be ``None`` or blank.
        tokenizer: Chat-template capable tokenizer.
        model: Causal LM exposing ``generate``.
        device: Device the tokenized inputs are moved to.

    Returns:
        The cleaned title. An empty string is returned without calling the
        model when ``text`` is ``None`` or blank: slicing ``None`` raised
        ``TypeError``, and an empty prompt makes the model invent a title.
    """
    if not text or not text.strip():
        return ""

    snippet = text[:800] #limit
    messages = [
        {"role": "system", "content": "Create concise titles. Output ONLY the title."},
        {"role": "user", "content": f"Create a title (max 10 words):\n\n{snippet}\n\nTitle:"}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=24, do_sample=False, num_beams=4)

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_length = inputs['input_ids'].shape[1]
    title = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    title = title.split("\n")[0].strip('"\'')
    title = "".join(c for c in title if c.isalnum() or c in " -_.,").strip()

    return title

def get_single_embedding(text, model, device):
    """Return the encoder embedding of ``text`` as a list of floats.

    This repeats the helper of the same name defined in the embedding cell
    earlier in the notebook.

    Args:
        text: The string to embed.
        model: Object exposing ``to``, ``eval`` and ``encoder.encode``.
        device: Device handed to ``model.to``.

    Returns:
        The first row of the encoder output as a Python list.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        encoded = model.encoder.encode(
            [text], convert_to_tensor=True, show_progress_bar=False
        )
        vector = encoded[0]
        return vector.cpu().numpy().tolist()

def run_pipeline(input_json, output_json, cluster_json=None, model_name="Qwen/Qwen2.5-1.5B-Instruct"):
    """Generate a title and title embedding per document, then per-cluster texts.

    Args:
        input_json: Path of the dataset JSON (documents with "full_text").
        output_json: Path the enriched dataset is written to.
        cluster_json: Optional path of the cluster JSON. When given, each
            "cluster_<n>" list gets a sibling "testo_completo_cluster_<n>"
            string joining the generated titles of its documents, and the
            file is rewritten in place.
        model_name: Hugging Face model id of the title-generating LLM.

    Relies on the notebook-level ``siamese_model`` for the title embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): add_folder_names passes `torch_dtype=` instead of `dtype=`;
    # confirm which keyword the installed transformers version expects.
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.float16 if device == "cuda" else torch.float32, device_map="auto"
    ).eval()

    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    id_to_title = {}

    for item in tqdm(data, desc="Generating titles and embeddings"):
        # "or": a JSON null would otherwise reach the slicing in generate_title.
        text = item.get("full_text") or ""

        # Do not ask the LLM to title an empty document.
        title = generate_title(text, tokenizer, llm_model, device) if text.strip() else ""
        item["generated_title"] = title

        # Generate embedding for the generated title
        if title and title.strip():
            item["embedding_generated_title"] = get_single_embedding(title, siamese_model, device)

        if "id" in item:
            id_to_title[item["id"]] = title

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # A title is valid when it is non-empty. The old check only excluded the
    # literal "Untitled", which generate_title never returns, so every
    # document was counted.
    valid = sum(
        1 for item in data
        if item.get("generated_title") and item["generated_title"] != "Untitled"
    )
    print(f"{valid}/{len(data)} titles | Saved to {output_json}")

    if cluster_json:
        with open(cluster_json, "r", encoding="utf-8") as f:
            clusters = json.load(f)

        cluster_key = re.compile(r"^cluster_(\d+)$")
        for folder in clusters.values():
            if not isinstance(folder, dict):
                continue
            # Snapshot the items: new keys are added while iterating.
            for key, ids in list(folder.items()):
                match = cluster_key.match(key)
                if match and isinstance(ids, list):
                    titles = [id_to_title[i] for i in ids if id_to_title.get(i)]
                    folder[f"testo_completo_cluster_{match.group(1)}"] = " ".join(titles)

        with open(cluster_json, "w", encoding="utf-8") as f:
            json.dump(clusters, f, ensure_ascii=False, indent=2)
        


# Generate titles for the embedded dataset and add the per-cluster title
# strings to the cluster file (rewritten in place).
run_pipeline(
    input_json="dataset_papers_with_embeddings.json",
    output_json="dataset_papers_with_titles.json",
    cluster_json="all_folders_fewer_clusters.json"  
)

In [None]:
'''14: 
This cell generates descriptive names for clusters and parent folders using QWEN
It analyzes the document titles within that cluster and creates a name that captures the common theme.'''

def _clean_folder_name(raw):
    """Normalise raw LLM output into a hyphen-separated name of at most 60 chars."""
    name = raw.strip()
    name = name.split("\n")[0].strip('"\'').replace(" ", "-")
    name = "".join(c for c in name if c.isalnum() or c in "-_")
    while "--" in name:
        name = name.replace("--", "-")
    return name.strip("-_")[:60]


def _generate_folder_name(messages, tokenizer, model, device):
    """Run one greedy chat completion for ``messages`` and return the cleaned name."""
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=15,
            num_beams=1,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    raw = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True
    )
    return _clean_folder_name(raw)


def generate_cluster_name(text, tokenizer, model, device, max_chars=1000):
    """Ask the LLM for a folder name describing a cluster of document titles.

    Args:
        text: Concatenated titles of the documents in the cluster.
        tokenizer: Chat-template capable tokenizer.
        model: Causal LM exposing ``generate``.
        device: Device the tokenized inputs are moved to.
        max_chars: Maximum number of characters of ``text`` placed in the
            prompt. The prompt is tokenized with ``max_length=512`` and the
            titles sit in the middle of it, so an unbounded title list could
            have the trailing instructions truncated away (assuming the
            tokenizer truncates on the right; confirm). The default matches
            the 10 x 100 character budget used by ``generate_parent_name``.

    Returns:
        The cleaned, hyphen-separated folder name (at most 60 characters).
    """
    text = (text or "")[:max_chars]

    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert at analyzing document collections and creating meaningful "
                "folder names. Your task is to identify the common theme or topic across "
                "multiple documents and create a concise, descriptive folder name."
            )
        },
        {
            "role": "user",
            "content": f"""Below is a list of document titles that belong together in the same folder.
Analyze their common theme and create a descriptive folder name.

Document titles:
{text}

Requirements:
- Maximum 6 words
- Use hyphens to separate words (e.g., "machine-learning-applications")
- Be specific and descriptive
- Capture the main topic or theme shared by these documents
- Use lowercase letters

Output ONLY the folder name, nothing else.

Folder name:"""
        }
    ]

    return _generate_folder_name(messages, tokenizer, model, device)


def generate_parent_name(texts, tokenizer, model, device):
    """Ask the LLM for a parent-folder name covering several clusters.

    Args:
        texts: Per-cluster title strings; only the first 10 entries are used
            and each is cut to its first 100 characters.
        tokenizer: Chat-template capable tokenizer.
        model: Causal LM exposing ``generate``.
        device: Device the tokenized inputs are moved to.

    Returns:
        The cleaned, hyphen-separated folder name (at most 60 characters).
    """
    combined = "\n".join([t[:100] for t in texts[:10]])

    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert at analyzing collections of document clusters and creating "
                "meaningful parent folder names that capture the overarching theme."
            )
        },
        {
            "role": "user",
            "content": f"""Below are summaries of multiple document clusters that belong in the same parent folder.
Analyze their common theme and create a descriptive parent folder name.

Cluster summaries:
{combined}

Requirements:
- Maximum 6 words
- Use hyphens to separate words
- Be broad enough to encompass all clusters
- Capture the overarching theme
- Use lowercase letters

Output ONLY the parent folder name, nothing else.

Parent folder name:"""
        }
    ]

    return _generate_folder_name(messages, tokenizer, model, device)


def add_folder_names(cluster_json, output_json, model_name, device):
    """Name every cluster and every parent folder with the LLM.

    For each folder of ``cluster_json`` a "parent_folder_name" is generated
    from its non-empty "testo_completo_cluster_<n>" strings, and each
    "cluster_<n>" that has such a string gets a "cluster_name_<n>". The
    result is written to ``output_json``.

    Args:
        cluster_json: Path of the cluster JSON enriched with title strings.
        output_json: Path the named clusters are written to.
        model_name: Hugging Face model id of the naming LLM.
        device: Device used for both the model and the tokenized inputs.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  
        low_cpu_mem_usage=True  
    ).eval()
    # The name generators move the inputs to `device`; the model has to live
    # there too, otherwise generation fails with a device mismatch on GPU.
    llm_model = llm_model.to(device)

    with open(cluster_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Clusters are stored under consecutive "cluster_1", "cluster_2", ... keys.
    total_clusters = 0
    for folder_data in data.values():
        cluster_idx = 1
        while f"cluster_{cluster_idx}" in folder_data:
            total_clusters += 1
            cluster_idx += 1

    print(f"Total clusters to process: {total_clusters}")

    for folder_name, folder_data in tqdm(data.items(), desc="Parent folders"):
        texts = []
        cluster_idx = 1

        while f"testo_completo_cluster_{cluster_idx}" in folder_data:
            text = folder_data[f"testo_completo_cluster_{cluster_idx}"]
            if text:
                texts.append(text)
            cluster_idx += 1

        if texts:
            folder_data["parent_folder_name"] = generate_parent_name(
                texts, tokenizer, llm_model, device
            )

    with tqdm(total=total_clusters, desc="Naming clusters") as pbar:
        for folder_name, folder_data in data.items():
            cluster_idx = 1

            while f"cluster_{cluster_idx}" in folder_data:
                text_key = f"testo_completo_cluster_{cluster_idx}"
                if text_key in folder_data:
                    folder_data[f"cluster_name_{cluster_idx}"] = generate_cluster_name(
                        folder_data[text_key], tokenizer, llm_model, device
                    )

                # Advance for every cluster so the bar reaches its total even
                # when a cluster has no title string.
                pbar.update(1)
                cluster_idx += 1

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    


# Name the clusters and parent folders; `device` is the notebook-level device
# variable (presumably defined in an earlier cell; it is not set here).
add_folder_names(
    cluster_json="all_folders_fewer_clusters.json",
    output_json="all_folders_with_names.json",
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    device= device
)

In [None]:
'''15:
This cell computes similarity scores between documents using their embeddings. 
It performs similarity search by comparing title embeddings against full-text embeddings to find the top 6 most similar documents for each paper. '''

# Number of nearest documents retrieved per query in the similarity cells.
NUM_RESULTS = 6

def compute_similarities(data: List[Dict[str, Any]], query_embedding_key: str, 
                        target_embedding_key: str, output_key: str, k: int) -> List[Dict[str, Any]]:
    """Retrieve, for every article, the ``k`` articles most similar to it.

    The query vector of each article (``query_embedding_key``) is compared by
    cosine similarity with the ``target_embedding_key`` vectors of all
    articles, the article itself included.

    Args:
        data: Article dicts; each needs an "id".
        query_embedding_key: Key of the embedding used as the query.
        target_embedding_key: Key of the embedding searched against.
        output_key: Key under which the ranked result list is stored.
        k: Number of results kept per article.

    Returns:
        A new list of shallow copies of the articles, each carrying its
        ranked results under ``output_key``. The input dicts are not
        modified; the previous ``data.copy()`` copied only the list, so
        results leaked into the caller's articles. Articles without a query
        embedding get an empty result list, and articles without a target
        embedding are left out of the search, instead of raising.
    """
    print(f"Query: {query_embedding_key} → Target: {target_embedding_key}")

    output_data = [dict(article) for article in data]

    # Positions in `data` of the articles that can be retrieved.
    target_rows = [i for i, a in enumerate(data) if a.get(target_embedding_key)]
    target_matrix = None
    if target_rows:
        target_matrix = np.array(
            [data[i][target_embedding_key] for i in target_rows], dtype=float
        )

    article_ids = [a["id"] for a in data]
    article_titles = [a.get("title", "N/A") for a in data]

    for i in tqdm(range(len(data)), desc="Similarity"):
        query_article = output_data[i]
        query_embedding = query_article.get(query_embedding_key)

        if target_matrix is None or not query_embedding:
            query_article[output_key] = []
            continue

        query_id = article_ids[i]
        query_vector = np.array(query_embedding, dtype=float).reshape(1, -1)

        similarity_scores = cosine_similarity(query_vector, target_matrix)[0]
        top_k_positions = np.argsort(similarity_scores)[::-1][:k]

        results = []
        for rank, pos in enumerate(top_k_positions, start=1):
            idx = target_rows[pos]  # map the matrix row back to the article index
            results.append({
                "rank": rank,
                "is_query_itself": (query_id == article_ids[idx]),
                "similarity_score": round(float(similarity_scores[pos]), 4),
                "target_id": article_ids[idx],
                "target_title": article_titles[idx],
                "target_subject": data[idx].get("subject", "N/A")
            })

        query_article[output_key] = results

    return output_data

# Load the articles written by the title-generation cell.
with open("dataset_papers_with_titles.json", "r", encoding="utf-8") as f:
    articles_with_embeddings = json.load(f)

# Query with the title embedding, search against the full-text embeddings.
results = compute_similarities(
    data=articles_with_embeddings,
    query_embedding_key="embedding_title",
    target_embedding_key="embedding_full_text",
    output_key="most_similar_by_title_to_text",
    k=NUM_RESULTS
)


with open("dataset_similarity_title.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(results)} papers to dataset_similarity_title.json")

# Show the top three hits of the first article as a sanity check.
example = results[0]
print(f"\nExample: {example.get('title')}")
for res in example["most_similar_by_title_to_text"][:3]:
    flag = " (SELF)" if res["is_query_itself"] else ""
    print(f"  Rank {res['rank']}: {res['similarity_score']:.4f} - {res['target_title']}{flag}")

In [None]:
'''16:
It does the same as the cell above but it compares abstract embeddings against full-text embeddings''' 

# Query with the abstract ("description") embedding, search against the
# full-text embeddings.
results_abstract = compute_similarities(
    data=articles_with_embeddings,
    query_embedding_key="embedding_description",
    target_embedding_key="embedding_full_text",
    output_key="most_similar_by_abstract_to_text",
    k=NUM_RESULTS
)

with open("dataset_similarity_abstract.json", "w", encoding="utf-8") as f:
    json.dump(results_abstract, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(results_abstract)} papers to dataset_similarity_abstract.json")

# Show the top three hits of the first article as a sanity check.
example = results_abstract[0]
print(f"\nExample: {example.get('title')}")
for res in example["most_similar_by_abstract_to_text"][:3]:
    flag = " (SELF)" if res["is_query_itself"] else ""
    print(f"  Rank {res['rank']}: {res['similarity_score']:.4f} - {res['target_title']}{flag}")

In [None]:
'''17:
It does the same as the cells above but it compares generated title against full-text embeddings''' 

# Query with the generated-title embedding, search against the full-text
# embeddings. Stored under its own name so the abstract-based results of the
# previous cell are not overwritten by results of a different method.
results_generated_title = compute_similarities(
    data=articles_with_embeddings,
    query_embedding_key="embedding_generated_title",
    target_embedding_key="embedding_full_text",
    output_key="most_similar_by_generated_title_to_text",
    k=NUM_RESULTS
)

with open("dataset_similarity_generated_title.json", "w", encoding="utf-8") as f:
    json.dump(results_generated_title, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(results_generated_title)} papers to dataset_similarity_generated_title.json")

# Show the top three hits of the first article as a sanity check.
example = results_generated_title[0]
print(f"\nExample: {example.get('title')}")
for res in example["most_similar_by_generated_title_to_text"][:3]:
    flag = " (SELF)" if res["is_query_itself"] else ""
    print(f"  Rank {res['rank']}: {res['similarity_score']:.4f} - {res['target_title']}{flag}")

In [None]:
'''18:
This cell evaluates the quality of different similarity search methods by scoring retrieved results against ground truth relationships. 
For each retrieved document, it assigns a score: 
        1.0 for exact matches or document versions, 
        0.5 for documents in the same subject category, 
        0.0 for unrelated documents. 
It compares three methods (title-based, abstract-based, and generated title-based similarity) 
and calculates mean scores and standard deviations across all queries. '''

def calculate_Irenescore(query_id: str, target_id: str, metadata: dict) -> float:
    """Score one retrieved document against the query's ground truth.

    Args:
        query_id: Id of the querying document.
        target_id: Id of the retrieved document.
        metadata: Mapping of id -> ``{"subject": ..., "versions": [...]}``.

    Returns:
        1.0 when the target is the query itself or either document lists the
        other among its versions, 0.5 when both share the same non-empty
        subject, 0.0 otherwise.
    """
    if query_id == target_id:
        return 1.0

    query_info = metadata.get(query_id, {})
    target_info = metadata.get(target_id, {})

    is_version = (
        target_id in query_info.get('versions', [])
        or query_id in target_info.get('versions', [])
    )
    if is_version:
        return 1.0

    query_subject = query_info.get('subject')
    same_subject = bool(query_subject) and query_subject == target_info.get('subject')
    return 0.5 if same_subject else 0.0


def evaluate_method(data: list, metadata: dict, similarity_key: str) -> dict:    
    """Aggregate the per-article retrieval scores of one similarity method.

    Each article's score is the sum of ``calculate_Irenescore`` over its
    results stored under ``similarity_key``; results without a "target_id"
    are ignored.

    Args:
        data: Articles carrying their ranked results.
        metadata: Ground truth passed on to ``calculate_Irenescore``.
        similarity_key: Key of the result list to evaluate.

    Returns:
        Dict with the "mean" and "std" of the article scores (0.0 when there
        are no articles) and the article "count".
    """
    scores = []

    for article in data:
        query_id = article.get('id')
        target_ids = [
            hit.get('target_id')
            for hit in article.get(similarity_key, [])
            if hit.get('target_id')
        ]
        scores.append(
            sum(calculate_Irenescore(query_id, target_id, metadata) for target_id in target_ids)
        )

    if scores:
        mean_score, std_score = np.mean(scores), np.std(scores)
    else:
        mean_score, std_score = 0.0, 0.0

    return {'mean': mean_score, 'std': std_score, 'count': len(scores)}


# Ground truth per document id, built from the notebook-level `dataset`
# loaded in the embedding cell.
metadata = {
    a['id']: {'subject': a.get('subject'), 'versions': a.get('version', [])}
    for a in dataset if a.get('id')
}


with open("dataset_similarity_title.json", "r", encoding="utf-8") as f:
    sim_title = json.load(f)

with open("dataset_similarity_abstract.json", "r", encoding="utf-8") as f:
    sim_abstract = json.load(f)

with open("dataset_similarity_generated_title.json", "r", encoding="utf-8") as f:
    sim_generated = json.load(f)

results = {
    'Title → Text': evaluate_method(sim_title, metadata, 'most_similar_by_title_to_text'),
    'Abstract → Text': evaluate_method(sim_abstract, metadata, 'most_similar_by_abstract_to_text'),
    'Generated Title → Text': evaluate_method(sim_generated, metadata, 'most_similar_by_generated_title_to_text')
}

# Best possible score per article: every retrieved result scores 1.0. Tied to
# NUM_RESULTS so the percentages stay correct if the result count changes.
max_score = float(NUM_RESULTS)

for method, res in sorted(results.items(), key=lambda x: x[1]['mean'], reverse=True):
    percentage = (res['mean'] / max_score) * 100
    print(f"\n{method}")
    print(f"Score: {res['mean']:.4f} / {max_score:.2f} ({percentage:.1f}%)")
    print(f"Std:   {res['std']:.4f}")
    print(f"Count: {res['count']}")

output = {
    method: {
        'mean_score': float(res['mean']),
        'percentage': float((res['mean'] / max_score) * 100),
        'std_score': float(res['std']),
        'articles_evaluated': res['count']
    }
    for method, res in results.items()
}

with open("evaluation_metrics.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)