In [2]:

# Step 1: mount Google Drive into the Colab runtime and define where the
# project's input files live on the mounted drive.
print("### Step 1: Setting up and Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')
import os


# Root folder of the project on the mounted drive and its data sub-folder.
PROJECT_PATH = "/content/drive/MyDrive/GraduationProject"
DATA_PATH = os.path.join(PROJECT_PATH, "data")
# JSON file that Step 4 loads into a DataFrame of books.
CLASSIFIED_BOOKS_PATH = os.path.join(DATA_PATH, "classified_books.json")
# JSON file whose contents Step 4 passes to the summarizer as `params`.
BEST_PARAMS_PATH = os.path.join(DATA_PATH, "best_summary_params.json")
print("--- Drive Mounted and Paths Defined.")


# Step 2: install third-party packages into the notebook runtime.
print("\n### Step 2: Installing Libraries...")
# IPython shell escape (not plain Python); -q keeps pip's output quiet.
# NOTE(review): sentencepiece is not imported directly in this cell —
# presumably it is required by the Pegasus tokenizer; confirm before removing.
!pip install -q transformers torch pandas sentencepiece rouge_score
print("--- Libraries Installed.")


# Step 3: import the libraries used by the summarizer and the evaluation loop.
print("\n### Step 3: Importing Libraries and Defining Final Functions...")
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
import json
import logging
from tqdm.notebook import tqdm
from rouge_score import rouge_scorer

# Emit INFO-and-above records with a timestamp and level prefix; `logger` is
# the module-level logger used by SummarizationModelHandler.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class SummarizationModelHandler:
    """Singleton wrapper around a Pegasus summarization pipeline.

    Every call to ``SummarizationModelHandler()`` returns the same instance,
    so the model is loaded at most once per successful initialization.

    Attributes are set in ``__new__``:
      * ``summarizer_pipeline`` -- the ``transformers`` pipeline, or ``None``
        while the model has not been loaded successfully.
      * ``tokenizer`` -- the tokenizer matching the model, or ``None``.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.summarizer_pipeline = None
            instance.tokenizer = None
            cls._instance = instance
        # A failed load (e.g. an interrupted download) must not leave a
        # permanently unusable singleton behind: try again on the next request.
        if cls._instance.summarizer_pipeline is None:
            cls._instance._initialize_model()
        return cls._instance

    def _initialize_model(self):
        """Load the tokenizer and the summarization pipeline.

        Uses the first CUDA device when one is available, otherwise the CPU.
        Loading errors are logged rather than raised; ``summarizer_pipeline``
        then stays ``None`` and ``summarize_text`` reports the failure.
        """
        try:
            device = 0 if torch.cuda.is_available() else -1
            model_name = "google/pegasus-large"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.summarizer_pipeline = pipeline("summarization", model=model_name, tokenizer=self.tokenizer, device=device)
            logger.info(f"Summarizer ({model_name}) loaded on {'GPU' if device != -1 else 'CPU'}.")
        except Exception as e:
            # Best-effort boundary: keep the notebook alive and report the cause.
            logger.error(f"CRITICAL: Could not load model: {e}.", exc_info=True)

    def summarize_text(self, text: str, params: dict, ratio: float = 0.4) -> str:
        """Generate a summary of ``text``.

        Args:
            text: Source text. It is truncated to the tokenizer's
                ``model_max_length`` before generation.
            params: Extra keyword arguments forwarded to ``model.generate``
                (``None`` is treated as no extra arguments).
            ratio: Desired summary length as a fraction of the (truncated)
                input token count.

        Returns:
            The decoded summary, or a message starting with ``"Error:"`` when
            the model is not loaded, the input is not a non-empty string, or
            tokenization/generation fails. This method does not raise for
            those cases.
        """
        if not self.summarizer_pipeline:
            return "Error: Summarizer not initialized."
        # Missing values read from a DataFrame arrive as None/NaN; reject them
        # here instead of letting the tokenizer raise.
        if not isinstance(text, str) or not text.strip():
            return "Error: Input text must be a non-empty string."
        try:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            )
            input_ids = inputs['input_ids']
            num_input_tokens = len(input_ids[0])
            target_token_count = int(num_input_tokens * ratio)
            # Allow +/-30% around the target, but never below 40/60 tokens.
            min_len = max(int(target_token_count * 0.7), 40)
            max_len = max(int(target_token_count * 1.3), 60)
            summary_ids = self.summarizer_pipeline.model.generate(
                input_ids.to(self.summarizer_pipeline.device),
                min_length=min_len,
                max_length=max_len,
                **(params or {}),
            )
            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Summarization error with params {params}: {e}", exc_info=True)
            return "Error: Could not generate summary."


def calculate_abstractiveness(original_text, generated_summary):
    """Return the percentage of summary words that do not occur in the source.

    Both texts are lower-cased and split with Python's built-in ``str.split()``
    (whitespace only), so punctuation stays attached to its word.

    Args:
        original_text: The text that was summarized.
        generated_summary: The summary produced for ``original_text``.

    Returns:
        A float between 0 and 100; ``0.0`` when the summary contains no words.
    """
    source_vocabulary = set(original_text.lower().split())
    summary_words = generated_summary.lower().split()
    if len(summary_words) == 0:
        return 0.0
    unseen_words = [word for word in summary_words if word not in source_vocabulary]
    # Multiply by 100 to express the ratio as a percentage.
    return (len(unseen_words) / len(summary_words)) * 100

print("--- All Functions and Classes are Ready ---")


# Step 4: summarize a reproducible random sample of books and report, per book
# and on average, ROUGE-L and abstractiveness of each generated summary.
print("\n### Step 4: Running Detailed Summarization Evaluation...")


df_books = pd.read_json(CLASSIFIED_BOOKS_PATH)
with open(BEST_PARAMS_PATH, 'r', encoding='utf-8') as f:
    best_params = json.load(f)

summarizer = SummarizationModelHandler()
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


# Evaluate at most 20 books; the fixed seed keeps the sample reproducible.
sample_size = min(20, len(df_books))
sample_df = df_books.sample(n=sample_size, random_state=42)

print(f"\nEvaluating on a sample of {sample_size} books using parameters: {best_params}")


total_rougeL = 0
total_abstractiveness = 0
# Number of books that actually produced a summary; skipped books must not
# count toward the averages.
evaluated_count = 0

for _, book_row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Summarizing Books for Evaluation"):
    summary = summarizer.summarize_text(book_row['content'], params=best_params)

    # summarize_text reports failures as messages that begin with "Error:".
    # Checking the prefix (not a substring) keeps genuine summaries that merely
    # contain the word "Error".
    if summary.startswith("Error:"):
        print(f"\n--- Book: {book_row['title']} --- \nSKIPPED: FAILED TO GENERATE SUMMARY")
        continue

    # The book's own text is used as the ROUGE target; the summary is the prediction.
    scores = scorer.score(book_row['content'], summary)
    abstractiveness = calculate_abstractiveness(book_row['content'], summary)

    total_rougeL += scores['rougeL'].fmeasure
    total_abstractiveness += abstractiveness
    evaluated_count += 1

    print(f"\n\n--- Book: {book_row['title']} ---")
    print(">> Generated Summary:")
    print(summary)
    print("\n>> Quality Scores:")
    print(f"   - ROUGE-L (Accuracy): {scores['rougeL'].fmeasure:.4f}")
    print(f"   - Abstractiveness (Creativity): {abstractiveness:.2f}%")
    print("-" * 60)


print("\n" + "="*25 + " FINAL EVALUATION RESULTS " + "="*25)
if evaluated_count:
    avg_rougeL = total_rougeL / evaluated_count
    avg_abstractiveness = total_abstractiveness / evaluated_count
    print(f"Average ROUGE-L Score across {evaluated_count} books: {avg_rougeL:.4f}")
    print(f"Average Abstractiveness across {evaluated_count} books: {avg_abstractiveness:.2f}%")
    if evaluated_count < sample_size:
        print(f"({sample_size - evaluated_count} of {sample_size} sampled books were skipped.)")
else:
    # Empty dataset or every summary failed: there is nothing to average.
    avg_rougeL = float('nan')
    avg_abstractiveness = float('nan')
    print("No summaries were generated successfully; averages are unavailable.")
print("\n--- Evaluation Complete! ---")

### Step 1: Setting up and Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Drive Mounted and Paths Defined.

### Step 2: Installing Libraries...
--- Libraries Installed.

### Step 3: Importing Libraries and Defining Final Functions...
--- All Functions and Classes are Ready ---

### Step 4: Running Detailed Summarization Evaluation...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Device set to use cuda:0



Evaluating on a sample of 20 books using parameters: {'do_sample': True, 'temperature': 0.95, 'top_p': 0.95, 'repetition_penalty': 1.3}


Summarizing Books for Evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.




--- Book: Humans Are Underrated ---
>> Generated Summary:
with knowledge being available 247 at arm s reach, social skills are what sets us apart from computers. download pdf for example, while a computer can instantly analyze millions of legal cases, provide all the required literature and predict the outcome of the case better than we do, it can never connect with the defendant and persuade her to act in her own best interest. i just saw a computer ambushed by iraqi soldiers, instead of escalating the situation, they knelt down and pointed their guns to the ground a gesture of respect. he just saw a computer ambushed by iraqi soldiers, instead of escalating the situation, they knelt down and pointed their guns to the ground a gesture of respect. i just saw a computer ambushed by iraqi soldiers, instead of escalating the situation, they knelt down and pointed their guns to the ground a gesture of respect. if you want to save this summary for later, download the free pdf and read it 