In [None]:
# Mount Google Drive so files under MyDrive are reachable below /content/drive.
print("### Step 1: Setting up and Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')

import os

# Project root on the mounted Drive and the data files derived from it.
PROJECT_PATH = "/content/drive/MyDrive/GraduationProject"
DATA_PATH = os.path.join(PROJECT_PATH, "data")
# Input: the books JSON later passed to load_data().
CLASSIFIED_BOOKS_PATH = os.path.join(DATA_PATH, "classified_books.json")
# Output: where the winning generation parameters are written as JSON.
BEST_PARAMS_PATH = os.path.join(DATA_PATH, "best_summary_params.json")

print("--- Drive Mounted and Paths Defined.")


print("\n### Step 2: Installing Libraries...")
# IPython/Colab shell escape (not plain Python): quietly pip-installs the
# third-party packages imported further down in this cell.
!pip install -q transformers torch pandas rouge_score sentencepiece
print("--- Libraries Installed.")


print("\n### Step 3: Importing Libraries and Defining Functions...")
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
from tqdm.notebook import tqdm
import itertools
import json
import logging

from rouge_score import rouge_scorer

# Configure logging at INFO level with a timestamped format, then create the
# logger used by the classes and functions defined below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SummarizationModelHandler:
    """Process-wide singleton wrapping a Hugging Face summarization pipeline.

    The first construction loads the tokenizer and pipeline for ``model_name``.
    Every later construction returns that same instance without reloading,
    whatever ``model_name`` is passed (a warning is logged on a mismatch).
    """

    _instance = None

    # Class-level defaults: the attributes exist even when loading fails
    # part-way, so summarize_text() can detect an unusable handler instead of
    # raising AttributeError.
    tokenizer = None
    summarizer_pipeline = None
    model_name = None

    def __new__(cls, model_name):
        if cls._instance is None:
            cls._instance = super(SummarizationModelHandler, cls).__new__(cls)
            cls._instance._initialize_model(model_name)
        elif cls._instance.model_name != model_name:
            # The singleton keeps the first model; make that visible to callers.
            logger.warning(
                "SummarizationModelHandler already initialised with %r; "
                "ignoring requested model %r.",
                cls._instance.model_name, model_name,
            )
        return cls._instance

    def _initialize_model(self, model_name):
        """Load the tokenizer and summarization pipeline for ``model_name``.

        Uses GPU device 0 when CUDA is available, otherwise the CPU. Loading
        errors are logged and swallowed; in that case ``summarizer_pipeline``
        stays ``None`` and ``summarize_text`` returns ``"Error"``.
        """
        self.model_name = model_name
        self.tokenizer = None
        self.summarizer_pipeline = None
        try:
            device = 0 if torch.cuda.is_available() else -1
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.summarizer_pipeline = pipeline(
                "summarization", model=model_name, tokenizer=self.tokenizer, device=device)
            logger.info(f"Summarizer ({model_name}) loaded on {'GPU' if device != -1 else 'CPU'}.")
        except Exception as e:
            logger.error(f"CRITICAL: Could not load model: {e}.", exc_info=True)

    def summarize_text(self, text, params, ratio=0.5):
        """Generate a summary of ``text`` whose length is about ``ratio`` of the input.

        Args:
            text: Source text; it is truncated to the tokenizer's maximum length.
            params: Extra keyword arguments forwarded to ``model.generate``.
            ratio: Target summary length as a fraction of the (truncated)
                input token count.

        Returns:
            The decoded summary, or the sentinel string ``"Error"`` when the
            model is not loaded or tokenization/generation fails.
        """
        if self.summarizer_pipeline is None or self.tokenizer is None:
            return "Error"
        try:
            # Tokenization is inside the try so that bad input (e.g. a
            # non-string value) yields "Error" rather than aborting the caller.
            max_model_length = self.tokenizer.model_max_length
            inputs = self.tokenizer(
                text, return_tensors="pt", truncation=True, max_length=max_model_length)
            num_input_tokens = inputs['input_ids'].shape[1]
            target_token_count = int(num_input_tokens * ratio)
            # Allow +/-30% around the target, with floors of 40 and 60 tokens.
            min_len = max(int(target_token_count * 0.7), 40)
            max_len = max(int(target_token_count * 1.3), 60)

            device = self.summarizer_pipeline.device
            summary_ids = self.summarizer_pipeline.model.generate(
                inputs['input_ids'].to(device),
                attention_mask=inputs['attention_mask'].to(device),
                min_length=min_len,
                max_length=max_len,
                **params,
            )
            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Summarization error: {e}")
            return "Error"

# Helper functions
def load_data(file_path):
    """Read the JSON file at ``file_path`` into a pandas DataFrame.

    Prints how many records were loaded. On any exception the error is
    printed and ``None`` is returned instead of a DataFrame.
    """
    try:
        books = pd.read_json(file_path)
        print(f"Successfully loaded {len(books)} books.")
    except Exception as err:
        print(f"Error loading data: {err}")
        return None
    return books

def calculate_abstractiveness(original_text, generated_summary):
    """Return the percentage of summary tokens that never occur in the source.

    Both texts are lower-cased and split on whitespace. An empty summary
    scores 0.0.
    """
    source_vocab = set(original_text.lower().split())
    summary_words = generated_summary.lower().split()
    if len(summary_words) == 0:
        return 0.0
    unseen_words = [word for word in summary_words if word not in source_vocab]
    return (len(unseen_words) / len(summary_words)) * 100

print("--- Functions and Classes Ready.")



print("\n### Step 4: Starting Grid Search for Best Hyperparameters...")

# Generation settings to explore; every combination of these values is tried.
param_grid = {
    'do_sample': [True],
    'temperature': [0.8, 0.95],
    'top_p': [0.92, 0.95],
    'repetition_penalty': [1.2, 1.3]
}

df_books = load_data(CLASSIFIED_BOOKS_PATH)
if df_books is not None:
    # Clamp to the dataset size: DataFrame.sample raises ValueError when asked
    # for more rows than exist (without replacement).
    sample_size = min(10, len(df_books))

    summarizer = SummarizationModelHandler(model_name="google/pegasus-large")
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Expand the grid into a list of keyword-argument dicts, one per combination.
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # `results` keeps the per-combination scores for inspection; it is not
    # otherwise used in this cell.
    best_score, best_params, results = -1, None, []
    sample_df = df_books.sample(n=sample_size, random_state=42)

    for params in tqdm(param_combinations, desc="Testing Combinations"):
        # Generation samples randomly (do_sample=True); reseed for every
        # combination so all of them are compared under the same random draws
        # and the search is reproducible.
        torch.manual_seed(42)

        total_rougeL, total_abstractiveness, count = 0, 0, 0
        for _, row in sample_df.iterrows():
            summary = summarizer.summarize_text(row['content'], params, ratio=0.4)
            # summarize_text returns exactly "Error" on failure. Compare for
            # equality so a valid summary containing the word "Error" is kept.
            if summary == "Error":
                continue

            # ROUGE-L is measured against the source text itself; no separate
            # reference summaries are used here.
            total_rougeL += scorer.score(row['content'], summary)['rougeL'].fmeasure
            total_abstractiveness += calculate_abstractiveness(row['content'], summary)
            count += 1

        if count > 0:
            avg_rougeL = total_rougeL / count
            avg_abstractiveness = total_abstractiveness / count
            # Weighted blend: 70% ROUGE-L (0..1) + 30% abstractiveness rescaled
            # from a percentage to 0..1.
            combined_score = (0.7 * avg_rougeL) + (0.3 * (avg_abstractiveness / 100.0))
            results.append({'params': params, 'score': combined_score, 'rouge_L': avg_rougeL, 'abstractiveness': avg_abstractiveness})
            if combined_score > best_score:
                best_score = combined_score
                best_params = params

    print("\n--- Grid Search Finished ---")
    if best_params is not None:
        print("--- BEST HYPERPARAMETERS FOUND ---")
        print(json.dumps(best_params, indent=4))

        with open(BEST_PARAMS_PATH, 'w', encoding='utf-8') as f:
            json.dump(best_params, f, indent=4)
        print(f"\nBest parameters saved to your Google Drive at: {BEST_PARAMS_PATH}")
    else:
        print("Could not determine best parameters.")

print("\n--- All Done! ---")

### Step 1: Setting up and Mounting Google Drive...
Mounted at /content/drive
--- Drive Mounted and Paths Defined.

### Step 2: Installing Libraries...
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Device set to use cuda:0


Testing Combinations:   0%|          | 0/8 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.



--- Grid Search Finished ---
--- BEST HYPERPARAMETERS FOUND ---
{
    "do_sample": true,
    "temperature": 0.95,
    "top_p": 0.95,
    "repetition_penalty": 1.3
}

Best parameters saved to your Google Drive at: /content/drive/MyDrive/GraduationProject/data/best_summary_params.json

--- All Done! ---
