In [1]:
!pip install -qU transformers torch sentence-transformers chromadb bitsandbytes tqdm rouge rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import json
import torch
import numpy as np
from typing import List, Dict, Any, Optional, Union
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from rouge import Rouge
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx
from collections import Counter

In [3]:
# Download NLTK resources if not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Getting chunked data that we created earlier**

In [4]:
!unzip /content/chunked_data.zip

Archive:  /content/chunked_data.zip
   creating: content/chunked_data/
  inflating: content/chunked_data/The-Alchemist_chunks.json  
  inflating: content/chunked_data/Ocean_ecogeochemistry_A_review_chunks.json  
  inflating: content/chunked_data/Stats_chunks.json  
  inflating: content/chunked_data/new-approaches-and-procedures-for-cancer-treatment_chunks.json  
  inflating: content/chunked_data/all_chunks.json  
  inflating: content/chunked_data/The_Plan_of_the_Giza_Pyramids_chunks.json  
  inflating: content/chunked_data/M.Sc. Applied Psychology_chunks.json  
  inflating: content/chunked_data/Dataset summaries and citations_chunks.json  


In [5]:
!mkdir chunked_data
!mv /content/content/chunked_data/* /content/chunked_data
!rm -rf /content/content/

### **Decorator to measure tokens**

In [6]:
import time
import functools
from typing import Callable, Any, Optional

def measure_token_processing(process_name: Optional[str] = None):
    """Decorator to measure token processing speed across different tasks."""

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Determine the process name
            task_name = process_name or func.__name__

            # Get the instance (self) from args
            instance = args[0] if args else None

            # Find text to tokenize - could be in different places depending on function
            text = None
            # Check args - likely the second argument for most methods
            if len(args) > 1 and isinstance(args[1], str):
                text = args[1]
            elif len(args) > 1 and isinstance(args[1], list) and all(isinstance(t, str) for t in args[1]):
                text = ' '.join(args[1])  # Join list of strings

            # Check kwargs for text
            elif 'text' in kwargs and isinstance(kwargs['text'], str):
                text = kwargs['text']
            elif 'texts' in kwargs and isinstance(kwargs['texts'], list):
                text = ' '.join(kwargs['texts'])
            elif 'prompt' in kwargs and isinstance(kwargs['prompt'], str):
                text = kwargs['prompt']

            # Find appropriate tokenizer
            tokenizer = None
            if instance and hasattr(instance, 'tokenizer'):
                tokenizer = instance.tokenizer
            elif instance and hasattr(instance, 'embedding_model') and hasattr(instance.embedding_model, 'tokenizer'):
                tokenizer = instance.embedding_model.tokenizer

            # Count tokens if we have both text and tokenizer
            input_tokens = 0
            if text and tokenizer:
                try:
                    if hasattr(tokenizer, 'encode'):
                        input_tokens = len(tokenizer.encode(text))
                    elif hasattr(tokenizer, '__call__'):
                        input_tokens = len(tokenizer(text)['input_ids'])
                except:
                    # Fallback to rough estimate (approx 4 chars per token)
                    input_tokens = len(text) // 4
            elif text:
                # Very rough approximation if no tokenizer available
                input_tokens = len(text.split())

            # Start timing
            start_time = time.perf_counter()

            # Execute the function
            result = func(*args, **kwargs)

            # End timing
            end_time = time.perf_counter()

            # Calculate tokens per second
            processing_time = end_time - start_time
            tokens_per_second = input_tokens / processing_time if processing_time > 0 else 0

            # Output measurements
            print(f"⏱️ {task_name}: {input_tokens} tokens processed in {processing_time:.2f}s ({tokens_per_second:.2f} tokens/sec)\n")

            # If result is a string, we could measure output tokens too
            if isinstance(result, str) and tokenizer:
                try:
                    output_tokens = len(tokenizer.encode(result))
                    total_tokens = input_tokens + output_tokens
                    throughput = total_tokens / processing_time if processing_time > 0 else 0
                    print(f"   Total (in+out): {total_tokens} tokens at {throughput:.2f} tokens/sec\n\n")
                except:
                    pass

            return result
        return wrapper
    return decorator

# Summarization System

A sophisticated system for document summarization with advanced evaluation capabilities:

- **Flexible Summarization Methods**: Supports abstractive, extractive, and bullet-point summarization
- **Local LLM Integration**: Utilizes models like Meta-Llama for abstractive summaries with fallback to smaller models
- **Embedding Model Support**: Employs SentenceTransformer for enhanced extractive summarization
- **Model Caching**: Reuses loaded models to optimize performance and reduce initialization time
- **Chunk-Based Processing**: Groups and processes document chunks by source for summaries
- **ROUGE Evaluation**: Provides quantitative assessment of summary quality using ROUGE metrics
- **Device Flexibility**: Automatically adapts to available hardware (CPU/GPU) for efficient computation
- **Multi-Summary Generation**: Creates multiple candidate summaries for comparison and selection
- **Confidence Scoring**: Associates confidence scores with generated summaries for quality assessment
- **LLM-Based Evaluation**: Uses the same LLM to evaluate and select the best summary from multiple candidates
- **Two-Step Summarization**: Implements a generate-then-evaluate workflow for optimal summary selection

In [7]:
# Login to HF to access LLMs
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
class SummarizationSystem:
    """System for summarizing publications and evaluating summaries using various techniques."""

    # Class-level cache for models
    _loaded_model = None
    _loaded_tokenizer = None
    _loaded_model_name = None
    _loaded_embedding_model = None
    _loaded_embedding_model_name = None

    def __init__(
        self,
        chunks_dir: str = "chunked_data",
        summaries_dir: str = "summaries",
        llm_model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",  # Default LLM model
        embedding_model: str = "nomic-ai/nomic-embed-text-v1.5",       # Default embedding model
        device: str = "auto"
    ):
        self.chunks_dir = chunks_dir
        self.summaries_dir = summaries_dir
        self.llm_model_name = llm_model_name
        self.embedding_model_name = embedding_model

        # Ensure directories exist
        os.makedirs(summaries_dir, exist_ok=True)

        # Initialize device
        if device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        print(f"Using device: {self.device}")

        # Load models with caching to avoid repeated loading
        self._load_models()

        # Initialize ROUGE metric
        self.rouge_evaluator = Rouge()

        # Results storage
        self.experiment_results = []

    def _load_models(self):
        """Load LLM and embedding models with caching."""

        # Load LLM
        if SummarizationSystem._loaded_model_name == self.llm_model_name:
            print("LLM already loaded. Reusing from cache.")
            self.tokenizer = SummarizationSystem._loaded_tokenizer
            self.model = SummarizationSystem._loaded_model
        else:
            try:
                print(f"-----Loading LLM: {self.llm_model_name}-----\n")
                self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
                if self.device == "cuda":
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.llm_model_name,
                        device_map="auto",
                        torch_dtype=torch.float16,
                        load_in_8bit=True
                    )
                else:
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.llm_model_name,
                        device_map={"": self.device}
                    )
                # Cache the loaded model
                SummarizationSystem._loaded_tokenizer = self.tokenizer
                SummarizationSystem._loaded_model = self.model
                SummarizationSystem._loaded_model_name = self.llm_model_name
            except Exception as e:
                print(f"Error loading primary LLM model: {str(e)}")
                print("Attempting to load a smaller fallback model...")
                try:
                    # Fallback to a smaller model that's more likely to work locally
                    fallback_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
                    print(f"Loading fallback model: {fallback_model}")
                    self.tokenizer = AutoTokenizer.from_pretrained(fallback_model)
                    self.model = AutoModelForCausalLM.from_pretrained(
                        fallback_model,
                        device_map="auto" if self.device == "cuda" else {"": self.device}
                    )
                    self.llm_model_name = fallback_model
                    # Update cache
                    SummarizationSystem._loaded_tokenizer = self.tokenizer
                    SummarizationSystem._loaded_model = self.model
                    SummarizationSystem._loaded_model_name = self.llm_model_name
                except Exception as e2:
                    print(f"Error loading fallback model: {str(e2)}")
                    print("Proceeding with alternative summarization methods only.")
                    self.model = None
                    self.tokenizer = None

        # Create pipeline if model loaded successfully
        if self.model is not None and self.tokenizer is not None:
            self.generator = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device_map="auto" if self.device == "cuda" else {"": self.device}
            )
        else:
            self.generator = None

        # Load embedding model if needed for extractive methods
        if SummarizationSystem._loaded_embedding_model_name == self.embedding_model_name:
            print("Embedding model already loaded. Reusing from cache.")
            self.embedding_model = SummarizationSystem._loaded_embedding_model
        else:
            try:
                print(f"-----Loading embedding model: {self.embedding_model_name}-----\n")
                self.embedding_model = SentenceTransformer(self.embedding_model_name, device="cpu", trust_remote_code=True)
                SummarizationSystem._loaded_embedding_model = self.embedding_model
                SummarizationSystem._loaded_embedding_model_name = self.embedding_model_name
            except Exception as e:
                print(f"Error loading embedding model: {str(e)}")
                print("Using basic NLP methods for extractive summarization.")
                self.embedding_model = None

    def load_chunks(self, filename: str = "all_chunks.json") -> List[Dict[str, Any]]:
        """Load document chunks from a JSON file."""

        filepath = os.path.join(self.chunks_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                chunks = json.load(f)
            print(f"Loaded {len(chunks)} chunks from {filepath}\n")
            return chunks
        except Exception as e:
            print(f"Error loading chunks from {filepath}: {str(e)}")
            return []

    def group_chunks_by_source(self, chunks: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
        """Group chunks by their source document."""

        grouped = {}
        for chunk in chunks:
            source = chunk.get("source", "unknown")
            if source not in grouped:
                grouped[source] = []
            grouped[source].append(chunk)

        # Sort chunks by chunk number within each source
        for source in grouped:
            grouped[source].sort(key=lambda x: x.get("chunk_number", 0))

        return grouped

    @measure_token_processing("Summary Generation")
    def _generate_llm_summary(self, text: str, method: str = "abstractive", max_length: int = 500) -> str:
        """Generate a summary using the loaded LLM."""

        if self.generator is None:
            return "LLM not available for summarization."

        # Different prompt templates based on summarization method
        if method == "abstractive":
            prompt = f"""<|system|>
                        You are an expert summarizer. Create a comprehensive abstractive summary of the following text.
                        Focus on the main ideas, key findings, and important details. Ensure the summary is coherent and flows well.
                        </s>

                        <|user|>
                        TEXT TO SUMMARIZE:
                        {text}

                        Please create a detailed yet concise summary of the text above.
                        </s>

                        <|assistant|>
                        """
        elif method == "bullet_points":
            prompt = f"""<|system|>
                        You are an expert summarizer. Create a summary of the following text in bullet point format.
                        Focus on the main ideas, key findings, and important details. Each bullet point should capture one key concept.
                        </s>

                        <|user|>
                        TEXT TO SUMMARIZE:
                        {text}

                        Please create a bullet point summary of the key concepts in the text.
                        </s>

                        <|assistant|>
                        """
        else:  # Default to standard summarization
            prompt = f"""<|system|>
                        You are an expert summarizer. Summarize the following text effectively.
                        </s>

                        <|user|>
                        TEXT TO SUMMARIZE:
                        {text}

                        Please summarize this text.
                        </s>

                        <|assistant|>
                        """

        try:
            # Calculate input token length to determine appropriate max_new_tokens
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            input_length = input_ids.size(1)

            # Set a reasonable max_new_tokens based on available context window and desired summary length
            context_window = 6000  # Typical for Llama models
            max_new_tokens = min(max_length, context_window - input_length)

            # Generate summary
            outputs = self.generator(
                prompt,
                max_new_tokens=max_new_tokens,
                temperature=0.3,  # Lower temperature for more focused summaries
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )

            # Extract the generated text
            generated_text = outputs[0]["generated_text"]

            # Remove the prompt from the response
            summary = generated_text[len(prompt):].strip()

            # Clean up potential model artifacts
            summary = summary.split("</s>")[0].split("<|user|>")[0].split("<|system|>")[0].strip()

            return summary

        except Exception as e:
            print(f"Error generating summary with LLM: {str(e)}")
            return "Error generating summary."

    def summarize_text(self, text: str, method: str = "abstractive", max_length: int = 500) -> str:
        """Generate a summary using the specified method."""

        if len(text.strip()) == 0:
            return ""

        # Apply the selected summarization method
        if method == "abstractive" and self.generator is not None:
            return self._generate_llm_summary(text, method="abstractive", max_length=max_length)
        elif method == "bullet_points" and self.generator is not None:
            return self._generate_llm_summary(text, method="bullet_points", max_length=max_length)
        else:
            # Default to abstractive if requested method isn't available
            print(f"Method {method} not available with current models. Using abstractive summarization.")
            return self._generate_llm_summary(text, method="abstractive", max_length=max_length)

    def summarize_document(self, source_chunks: List[Dict[str, Any]], method: str = "abstractive") -> Dict[str, Any]:
        """Generate a summary for an entire document from its chunks."""

        if not source_chunks:
            return {"summary": "", "method": method}

        # Combine chunks into a single text
        combined_text = ""
        source = source_chunks[0].get("source", "unknown")

        for chunk in source_chunks:
            combined_text += chunk.get("text", "") + " "
            tables = chunk.get("tables")
            if tables and isinstance(tables, list):
                combined_text += " ".join(map(str, tables)) + " "

        # Generate summary
        summary = self.summarize_text(combined_text, method=method)

        return {
            "source": source,
            "summary": summary,
            "method": method,
            "chunk_count": len(source_chunks)
        }

    def evaluate_summary(self, reference: str, summary: str) -> Dict[str, Any]:
        """Evaluate a summary using ROUGE metrics."""
        if not reference or not summary:
            return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}

        try:
            scores = self.rouge_evaluator.get_scores(summary, reference)[0]
            return {
                "rouge-1": scores["rouge-1"]["f"],
                "rouge-2": scores["rouge-2"]["f"],
                "rouge-l": scores["rouge-l"]["f"]
            }
        except Exception as e:
            print(f"Error calculating ROUGE scores: {str(e)}")
            return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}


    @measure_token_processing("Summary Generation")
    def generate_multiple_summaries(self, text: str, count: int = 3, max_length: int = 500) -> List[str]:
        """Generate multiple summaries for the same text."""

        summaries = []

        if self.generator is None:
            print("LLM not available for generating multiple summaries.")
            return ["LLM not available for summarization."] * count

        # Create prompt for generating multiple summaries
        prompt = f"""<|system|>
                  You are an expert summarizer. Provide exactly {count} different short summaries of the following text.
                  Number each summary clearly as "Summary 1:", "Summary 2:", etc.
                  Each summary should be clear, concise, and capture the key points of the text.
                  Aim for around 2-3 sentences for each summary.
                  </s>

                  <|user|>
                  TEXT TO SUMMARIZE:
                  {text}

                  Please create exactly {count} different summaries of the text above, numbered clearly.
                  </s>

                  <|assistant|>
                  """

        try:
            # Calculate tokens and determine max_new_tokens
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            input_length = input_ids.size(1)

            context_window = 6000  # Typical for Llama models
            max_new_tokens = min(max_length * count * 2, context_window - input_length)

            # Generate summaries
            outputs = self.generator(
                prompt,
                max_new_tokens=max_new_tokens,
                temperature=0.7,  # Higher temperature for more variation
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )

            # Extract the generated text
            generated_text = outputs[0]["generated_text"][len(prompt):].strip()

            # Clean up and parse the multiple summaries
            clean_text = generated_text.split("</s>")[0].split("<|user|>")[0].split("<|system|>")[0].strip()

            # Split by Summary headings
            summaries = []
            lines = clean_text.split('\n')
            current_summary = ""

            for line in lines:
                if line.strip().startswith("Summary") and ":" in line:
                    if current_summary:
                        summaries.append(current_summary.strip())
                    current_summary = ""
                else:
                    current_summary += line + "\n"

            # Add the last summary if exists
            if current_summary:
                summaries.append(current_summary.strip())

            # Ensure we have the requested number of summaries
            if len(summaries) < count:
                print(f"Only generated {len(summaries)} summaries instead of {count}.")
                # Duplicate the last summary to reach the desired count if needed
                while len(summaries) < count:
                    summaries.append(summaries[-1] if summaries else "")
            elif len(summaries) > count:
                # Take only the first 'count' summaries
                summaries = summaries[:count]

            return summaries

        except Exception as e:
            print(f"Error generating multiple summaries: {str(e)}")
            return ["Error generating summary."] * count


    @measure_token_processing("Summary Generation")
    def generate_confidence_scored_summaries(self, text: str, count: int = 3) -> List[Dict[str, Any]]:
        """Generate summaries with confidence scores."""

        if self.generator is None:
            print("LLM not available for generating confidence-scored summaries.")
            return [{"summary": "LLM not available", "confidence": 0.0}] * count

        prompt = f"""<|system|>
                  You are a summarization system that provides summaries with confidence scores.
                  You must respond with EXACTLY {count} summaries, each with a confidence score.
                  Format each summary as follows:

                  SUMMARY 1:
                  [The summary text goes here]
                  CONFIDENCE: [A number between 0.0 and 1.0]

                  SUMMARY 2:
                  [The summary text goes here]
                  CONFIDENCE: [A number between 0.0 and 1.0]

                  And so on. Use this exact format - with "SUMMARY" and "CONFIDENCE" labels.
                  </s>

                  <|user|>
                  TEXT TO SUMMARIZE:
                  {text}

                  Please provide exactly {count} summaries with confidence scores using the required format.
                  </s>

                  <|assistant|>
                  """

        try:
            # Generate response
            outputs = self.generator(
                prompt,
                max_new_tokens=2000,
                temperature=0.3,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )

            # Extract and process the generated text
            generated_text = outputs[0]["generated_text"][len(prompt):].strip()
            clean_text = generated_text.split("</s>")[0].split("<|user|>")[0].split("<|system|>")[0].strip()

            # Parse the formatted response
            summaries = []
            sections = clean_text.split("\nSUMMARY ")

            # Process first section separately if it doesn't start with "SUMMARY"
            start_idx = 0
            if not sections[0].strip().startswith("SUMMARY"):
                start_idx = 1

            for i in range(start_idx, len(sections)):
                section = sections[i].strip()
                if not section:
                    continue

                # Extract confidence score
                confidence_parts = section.split("CONFIDENCE:")
                summary_text = confidence_parts[0]

                if len(confidence_parts) > 1:
                    # Try to extract the confidence score
                    try:
                        confidence_str = confidence_parts[1].strip().split("\n")[0].strip()
                        confidence = float(confidence_str)
                        # Ensure confidence is between 0 and 1
                        confidence = max(0.0, min(1.0, confidence))
                    except (ValueError, IndexError):
                        confidence = 0.5
                else:
                    confidence = 0.5

                # Clean up the summary text
                if ":" in summary_text and summary_text.split(":", 1)[0].strip().isdigit():
                    summary_text = summary_text.split(":", 1)[1].strip()
                else:
                    summary_text = summary_text.strip()

                summaries.append({"text": summary_text, "confidence": confidence})

            # Ensure we have the requested number of summaries
            while len(summaries) < count:
                summaries.append({"text": "", "confidence": 0.0})
            if len(summaries) > count:
                summaries = summaries[:count]

            return summaries

        except Exception as e:
            print(f"Error generating confidence-scored summaries: {str(e)}")
            return [{"text": "Error generating summary.", "confidence": 0.0}] * count

    def evaluate_summaries_with_llm(self, original_text: str, summaries: List[str]) -> Dict[str, Any]:
        """Use the LLM to evaluate and rank multiple summaries."""

        if self.generator is None or not summaries:
            return {"best_summary_index": 0, "explanation": "LLM not available for evaluation"}

        # Format summaries for the prompt
        summaries_text = ""
        for i, summary in enumerate(summaries):
            summaries_text += f"Summary {i+1}:\n{summary}\n\n"

        prompt = f"""<|system|>
                  You are an expert at evaluating summaries. You will be given an original text and
                  several summaries of that text. Your task is to evaluate the summaries and determine
                  which one is the best in terms of clarity, completeness, and relevance.

                  You MUST begin your response with "BEST SUMMARY: X" where X is the number of the best summary.
                  Then provide your explanation on a new line.
                  </s>

                  <|user|>
                  ORIGINAL TEXT:
                  {original_text}

                  SUMMARIES:
                  {summaries_text}

                  Please evaluate these summaries and determine which one is the best.
                  Start with "BEST SUMMARY: X" where X is the number, then explain your choice.
                  </s>

                  <|assistant|>
                  """

        try:
            outputs = self.generator(
                prompt,
                max_new_tokens=1500,
                temperature=0.3,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )

            # Extract and process the evaluation
            evaluation = outputs[0]["generated_text"][len(prompt):].strip()
            evaluation = evaluation.split("</s>")[0].split("<|user|>")[0].split("<|system|>")[0].strip()

            # Extract the best summary number from the formatted response
            best_summary_index = 0  # Default to first summary
            explanation = evaluation

            if evaluation.startswith("BEST SUMMARY:"):
                first_line = evaluation.split("\n")[0].strip()
                number_part = first_line.replace("BEST SUMMARY:", "").strip()

                try:
                    index = int(number_part) - 1  # Convert to 0-based index
                    if 0 <= index < len(summaries):
                        best_summary_index = index
                        # Remove the first line from explanation
                        explanation = "\n".join(evaluation.split("\n")[1:]).strip()
                except ValueError:
                    pass

            return {
                "best_summary_index": best_summary_index,
                "best_summary": summaries[best_summary_index],
                "explanation": explanation
            }

        except Exception as e:
            print(f"Error evaluating summaries with LLM: {str(e)}")
            return {"best_summary_index": 0, "explanation": f"Error: {str(e)}"}


    @measure_token_processing("Summary Generation")
    def generate_and_evaluate_summaries(self, text: str, count: int = 3) -> Dict[str, Any]:
        """Generate multiple summaries and evaluate them to select the best one."""

        # Generate multiple summaries
        summaries = self.generate_multiple_summaries(text, count)

        # Evaluate to find the best one
        evaluation = self.evaluate_summaries_with_llm(text, summaries)

        best_index = evaluation.get("best_summary_index", 0)
        best_summary = summaries[best_index]

        # Calculate ROUGE scores for the best summary if we have a reference
        rouge_scores = {}
        if hasattr(self, "reference_summary") and self.reference_summary:
            rouge_scores = self.evaluate_summary(self.reference_summary, best_summary)

        return {
            "all_summaries": summaries,
            "best_summary": best_summary,
            "best_summary_index": best_index,
            "evaluation_explanation": evaluation.get("explanation", ""),
            "rouge_scores": rouge_scores
        }


    @measure_token_processing("Summary Generation")
    def two_step_summarization(self, text: str) -> Dict[str, Any]:
        """Perform a two-step summarization process:
        1. Generate multiple summaries
        2. Evaluate and select the best summary
        """
        # Step 1: Generate summaries
        summaries = self.generate_multiple_summaries(text, count=3)

        # Step 2: Evaluate and select the best one
        result = self.evaluate_summaries_with_llm(text, summaries)

        return {
            "original_text_length": len(text),
            "summaries": summaries,
            "selected_summary": result.get("best_summary", summaries[0]),
            "evaluation": result.get("explanation", "")
        }

In [8]:
print("-----Initializing Summarization System...-----\n")

summarizer = SummarizationSystem()

-----Initializing Summarization System...-----

Using device: cuda
-----Loading LLM: meta-llama/Meta-Llama-3-8B-Instruct-----



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


-----Loading embedding model: nomic-ai/nomic-embed-text-v1.5-----



modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [9]:
# Load chunks
chunks = summarizer.load_chunks()
if not chunks:
    print("No chunks found. Pipeline aborted.")

Loaded 599 chunks from chunked_data/all_chunks.json



In [10]:
# Group chunks by source
grouped_chunks = summarizer.group_chunks_by_source(chunks)

In [11]:
grouped_chunks.keys()

dict_keys(['new-approaches-and-procedures-for-cancer-treatment.pdf', 'The_Plan_of_the_Giza_Pyramids.pdf', 'The-Alchemist.pdf', 'Stats.docx', 'Ocean_ecogeochemistry_A_review.pdf', 'Dataset summaries and citations.docx', 'M.Sc. Applied Psychology.docx'])

In [13]:
grouped_chunks['Stats.docx'][-1]['text']

'Example 2 Unfinished Attic Ceiling Batt Insulation (Retrofit Installation Adder) Example for calculating the low, mid, and high retail price along with the associated labor for replacing (retrofitting) ceiling insulation in an unfinished attic with an R-Value of 15, using fiberglass batt insulation. The numbers in red correspond to the different coefficients in the Machine Readable CSV file for the performance metric and the low, mid, and high regressions: Where A is the R-value coefficient and C is the intercept value (constant). To produce the total installed cost, use the retrofit labor adder, (if this was for new construction, the new construction adder would be used): Therefore, the median material price is $0.86 per square foot and the labor cost is $1.00 per square foot for a total installed cost of $1.86 per square foot.'

#### **Abstractive Summary**

In [14]:
abstractive_summary = summarizer.summarize_text(grouped_chunks['Stats.docx'][-1]['text'], 'abstractive')

⏱️ Summary Generation: 182 tokens processed in 24.55s (7.41 tokens/sec)

   Total (in+out): 333 tokens at 13.56 tokens/sec




In [15]:
print(abstractive_summary)

Here is a comprehensive abstractive summary of the provided text:

The example demonstrates the calculation of the low, mid, and high retail prices for retrofitting ceiling insulation in an unfinished attic with an R-Value of 15, using fiberglass batt insulation. The calculation involves coefficients A (R-value) and C (intercept value) from a Machine Readable CSV file. The retrofit labor adder is used to produce the total installed cost. The median material price is $0.86 per square foot, and the labor cost is $1.00 per square foot, resulting in a total installed cost of $1.86 per square foot. This example provides a detailed breakdown of the costs involved in retrofitting ceiling insulation in an unfinished attic.


In [16]:
summarizer.evaluate_summary(grouped_chunks['Stats.docx'][-1]['text'], abstractive_summary)

{'rouge-1': 0.5889570502691107,
 'rouge-2': 0.4210526266255002,
 'rouge-l': 0.5398772956678837}

#### **Bullet Point Summary**

In [17]:
bullet_point_summary = summarizer.summarize_text(grouped_chunks['Stats.docx'][-1]['text'], 'bullet_points')

⏱️ Summary Generation: 182 tokens processed in 22.90s (7.95 tokens/sec)

   Total (in+out): 345 tokens at 15.06 tokens/sec




In [18]:
print(bullet_point_summary)

Here is a summary of the text in bullet point format:

• The text describes a retrofit installation adder for calculating the cost of replacing ceiling insulation in an unfinished attic with an R-Value of 15 using fiberglass batt insulation.
• The cost calculation involves using coefficients from a Machine Readable CSV file, including the R-value coefficient (A) and intercept value (C).
• The median material price for the insulation is $0.86 per square foot.
• The labor cost for the installation is $1.00 per square foot.
• The total installed cost is the sum of the material and labor costs, which is $1.86 per square foot.
• The retrofit labor adder is used to calculate the total installed cost, whereas the new construction adder would be used for new construction projects.


In [19]:
summarizer.evaluate_summary(grouped_chunks['Stats.docx'][-1]['text'], bullet_point_summary)

{'rouge-1': 0.6097560926085664,
 'rouge-2': 0.38016528426166246,
 'rouge-l': 0.6097560926085664}

### Summary from The Plan of the Giza_Pyramids, but for first few chunks

In [12]:
for grouped_chunk in grouped_chunks['The_Plan_of_the_Giza_Pyramids.pdf'][:5]:
    combined_text = ""
    combined_text += grouped_chunk.get("text", "") + " "
    tables = grouped_chunk.get("tables")
    if tables and isinstance(tables, list):
        combined_text += " ".join(map(str, tables)) + " "

In [14]:
abstractive_summary = summarizer.summarize_text(combined_text, 'abstractive')
print(abstractive_summary)

⏱️ Summary Generation: 352 tokens processed in 42.17s (8.35 tokens/sec)

   Total (in+out): 657 tokens at 15.58 tokens/sec


Here is a comprehensive abstractive summary of the text:

The text discusses the dimensions and orientations of the three pyramids at Giza, as determined by Petrie. The dimensions of the pyramid bases are provided in Table I, along with the average variations in the lengths of the sides and the orientations of the pyramids with respect to true north. The distances separating the centers of the pyramids were computed by Petrie along axes constructed parallel to the mean azimuth of the Second and Great Pyramids, and are given in Table II.

The tables provide detailed information on the dimensions and orientations of the three pyramids: the Great Pyramid, the Second Pyramid, and the Third Pyramid. The dimensions are given in inches and royal cubits, and the orientations are given in azimuth degrees and minutes. The tables show that the Great Pyramid has a base lengt

In [22]:
summarizer.evaluate_summary(combined_text, abstractive_summary)

{'rouge-1': 0.5347593532889131,
 'rouge-2': 0.4191176420612565,
 'rouge-l': 0.5347593532889131}

In [23]:
bullet_summary = summarizer.summarize_text(combined_text, 'bullet_points')
print(bullet_summary)

⏱️ Summary Generation: 352 tokens processed in 40.22s (8.75 tokens/sec)

   Total (in+out): 550 tokens at 13.68 tokens/sec


Here is a summary of the text in bullet point format:

• The dimensions of the three pyramid bases were determined by Petrie, including the lengths of the sides and the orientations with respect to true north.
• The distances separating the centers of the pyramids were computed by Petrie along axes constructed parallel to the mean azimuth of the Second and Great Pyramids.
• The dimensions of the pyramids are as follows:
	+ Great Pyramid: 9068.8 inches ± 0.6, 439.81 royal cubits, azimuth -3' 43”
	+ Second Pyramid: 8474.9 inches ± 1.5, 411.00 royal cubits, azimuth -5' 26”
	+ Third Pyramid: 4153.6 inches ± 3.0, 201.44 royal cubits, azimuth 14' 03”

I hope this summary meets your requirements! Let me know if you have any further requests.


In [24]:
summarizer.evaluate_summary(combined_text, bullet_summary)

{'rouge-1': 0.6137566087567539,
 'rouge-2': 0.37944663532940687,
 'rouge-l': 0.5925925875927325}

### Now let's try some strategic approaches

**Generating Multiple Summaries**

In [15]:
summaries = summarizer.generate_multiple_summaries(combined_text, count=3)

⏱️ Summary Generation: 352 tokens processed in 29.64s (11.87 tokens/sec)



In [24]:
for i, summary in enumerate(summaries):
    print(f"Summary {i+1}:", summary)
    print()

Summary 1: The text discusses the dimensions and orientations of the three pyramids at Giza, as determined by Petrie. The dimensions of the pyramid bases are given in inches and royal cubits, while the azimuths (orientations) of the pyramids with respect to true north are also provided. The text includes tables to summarize the data.

Summary 2: Petrie's measurements of the Giza pyramids reveal the dimensions of the three pyramid bases, with the Great Pyramid having the largest base (9068.8 inches) and the Third Pyramid having the smallest (4153.6 inches). The azimuths of the pyramids vary, with the Great Pyramid having an azimuth of -3' 43” and the Third Pyramid having an azimuth of 14' 03”.

Summary 3: The text presents the results of Petrie's measurements of the Giza pyramids, including the dimensions and orientations of the three pyramids. The dimensions are given in inches and royal cubits, while the azimuths are provided in degrees and minutes. The data is presented in tables, al

**Evaluate Summaries with LLM**

In [17]:
evaluation = summarizer.evaluate_summaries_with_llm(combined_text, summaries)

In [22]:
for key, value in evaluation.items():
    print(f"{key}: {value}\n")

best_summary_index: 0

best_summary: The text discusses the dimensions and orientations of the three pyramids at Giza, as determined by Petrie. The dimensions of the pyramid bases are given in inches and royal cubits, while the azimuths (orientations) of the pyramids with respect to true north are also provided. The text includes tables to summarize the data.

explanation: Summary 1 is the best because it accurately captures the main points of the original text, including the dimensions of the pyramid bases, the azimuths of the pyramids, and the inclusion of tables to summarize the data. The summary is clear and concise, providing a good overview of the content without leaving out important details. Additionally, it does not introduce any new information or try to interpret the data, which is important for a summary. The other summaries, while providing some accurate information, also introduce errors or try to interpret the data, making them less effective summaries.



**Generate Confidence Scored Summaries**

In [29]:
summaries = summarizer.generate_confidence_scored_summaries(combined_text, count=3)

⏱️ Summary Generation: 352 tokens processed in 33.29s (10.57 tokens/sec)



In [31]:
for summary in summaries:
    print(f"Confidence: {summary['confidence']}")
    print(f"Text: {summary['text']}\n")

Confidence: 0.8
Text: SUMMARY 1:
                   The Plan of the Giza Pyramids discusses the dimensions of the three pyramids, including the Great Pyramid, Second Pyramid, and Third Pyramid. The dimensions are given in inches and royal cubits, with an average variation in the lengths of the sides and orientations with respect to true north.

Confidence: 0.0
Text: 

Confidence: 0.0
Text: 



It's not giving more summaries 😶. But we can surely make it better by some modifications in prompt. But I am leaving it for now

**Two Step Summarization**

In [35]:
summaries = summarizer.two_step_summarization(combined_text)

⏱️ Summary Generation: 352 tokens processed in 28.37s (12.41 tokens/sec)

⏱️ Summary Generation: 352 tokens processed in 48.74s (7.22 tokens/sec)



In [37]:
for key, value in summaries.items():
    print(f"{key}: {value}\n")

original_text_length: 1005

summaries: ['The text describes the dimensions and orientations of the three pyramids at Giza, as measured by Petrie. The table shows the dimensions of the pyramids in inches and royal cubits, as well as their azimuth (orientation) with respect to true north. The distances between the centers of the pyramids were also calculated by Petrie.', 'The article provides information on the sizes and orientations of the Great, Second, and Third Pyramids of Giza. Petrie measured the pyramids and recorded their dimensions in inches and royal cubits, as well as their azimuth. The text also includes tables showing the measurements and orientations of the pyramids.', "Petrie's measurements of the Giza Pyramids are presented in the text, including their dimensions in inches and royal cubits, as well as their azimuth with respect to true north. The tables provided show the measurements and orientations of the Great, Second, and Third Pyramids, which were used to calculate t