In [None]:
pip install pandas sumy transformers torch datasets


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64

In [None]:
!pip install nltk scikit-learn
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Download the 'punkt_tab' resource

import os
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import numpy as np

# Load SpaCy model
# NOTE(review): `nlp` is not referenced by any cell visible in this part of the
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load("en_core_web_sm")

def extractive_summary_tfidf(text, num_sentences=3):
    """
    Extractive summarization using TF-IDF and cosine similarity.

    Each sentence is scored by the sum of its cosine similarities to every
    sentence of the text (TF-IDF vectors are fitted on the sentences
    themselves). The `num_sentences` best-scoring sentences are returned in
    their original order, joined by single spaces.

    Args:
        text: Text to summarise.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The summary string. `text` is returned unchanged when it contains no
        more than `num_sentences` sentences; "" is returned when
        `num_sentences` is zero or negative.
    """
    # Guard first: with num_sentences == 0 the slice `[-0:]` below would
    # select *every* sentence instead of none.
    if num_sentences <= 0:
        return ""

    # Sentence tokenization
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text  # Not enough sentences to summarize

    # TF-IDF vectorization
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when the sentences yield an empty vocabulary (no usable
        # tokens). There is nothing to rank on, so fall back to the leading
        # sentences rather than aborting a whole batch run.
        return ' '.join(sentences[:num_sentences])

    # Sentence scores: row sums of the pairwise cosine-similarity matrix
    scores = cosine_similarity(tfidf_matrix).sum(axis=1)

    # Top-N indices, re-sorted so the summary keeps the original sentence order
    top_indices = np.sort(np.argsort(scores)[-num_sentences:])

    return ' '.join(sentences[i] for i in top_indices)

# Load dataset
df = pd.read_csv('/content/dataset_nlp new.csv', encoding='latin-1')

# Apply Extractive Summarization.
# Missing abstracts are blanked before the string coercion: str(NaN) would
# otherwise feed the literal text "nan" to the summariser. The "abstract"
# column itself is left untouched.
df["extractive_summary"] = (
    df["abstract"].fillna("").astype(str).apply(extractive_summary_tfidf)
)

# Save Extractive Summaries
os.makedirs("/mnt/data", exist_ok=True)
df.to_csv("/mnt/data/extractive_summaries.csv", index=False)

print("✅ Extractive summaries (TF-IDF + Cosine Similarity) saved as extractive_summaries.csv")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Extractive summaries (TF-IDF + Cosine Similarity) saved as extractive_summaries.csv


In [None]:
# Download the saved extractive-summary CSV through Colab's `files` helper
# (the `google.colab` package is only importable inside Google Colab).
from google.colab import files
files.download("/mnt/data/extractive_summaries.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os
import pandas as pd

# Choose the execution device up front: CUDA when PyTorch reports a GPU, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained "t5-small" tokenizer and model; the model is placed on `device`.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)


def abstractive_summary_batch(texts, max_length=100, batch_size=8, min_length=50):
    """
    Summarise `texts` with the module-level T5 `model` / `tokenizer`, batch by batch.

    Args:
        texts: Sequence of input strings.
        max_length: Upper bound on the generated summary length, in tokens.
        batch_size: Number of texts tokenised and generated together.
        min_length: Lower bound on the generated length, in tokens. It is
            capped at `max_length` so a small `max_length` cannot contradict it.

    Returns:
        List of decoded summaries, one per input text, in input order.

    Raises:
        ValueError: If `batch_size` is not positive (a negative step would
            otherwise make the loop silently produce no summaries).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    min_length = min(min_length, max_length)

    summaries = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        # Inputs are truncated to 512 tokens and padded to the longest item in the batch.
        inputs = tokenizer(
            ["summarize: " + text for text in batch],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            # Hand over the padding mask explicitly instead of leaving
            # generate() to infer it from the pad token.
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summaries.extend(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
    return summaries

# Read the raw dataset again for this cell.
df = pd.read_csv('/content/dataset_nlp new.csv', encoding='latin-1')

# Summarise every abstract (coerced to str) in batches and attach the result as a new column.
abstract_texts = df["abstract"].astype(str).tolist()
df["abstractive_summary"] = abstractive_summary_batch(abstract_texts)

# Write the augmented frame under /mnt/data, creating the directory when absent.
abstractive_csv_path = "/mnt/data/abstractive_summaries.csv"
os.makedirs("/mnt/data", exist_ok=True)
df.to_csv(abstractive_csv_path, index=False)
print("✅ Faster abstractive summaries saved as abstractive_summaries.csv")
print(os.path.abspath(abstractive_csv_path))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Faster abstractive summaries saved as abstractive_summaries.csv
/mnt/data/abstractive_summaries.csv


In [None]:
# Download the saved abstractive-summary CSV through Colab's `files` helper
# (the `google.colab` package is only importable inside Google Colab).
from google.colab import files
files.download("/mnt/data/abstractive_summaries.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install datasets



In [None]:
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import pandas as pd

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the dataset and keep only rows that have both an abstract and a title.
# dropna already guarantees neither column contains NaN, so no fillna is needed.
df = pd.read_csv('/content/dataset_nlp new.csv', encoding='latin-1')
df = df.dropna(subset=["abstract", "title"])

# Keep the two columns used for fine-tuning, coerce every cell to str so the
# tokenizer never receives a non-string value, then rename and convert to a dataset.
df = (
    df[["abstract", "title"]]
    .astype(str)
    .rename(columns={"abstract": "text", "title": "summary"})
)
dataset = Dataset.from_pandas(df.reset_index(drop=True))

# Preprocessing function
def preprocess_function(examples):
    """
    Tokenise a batch of examples for T5 fine-tuning.

    Args:
        examples: Batch mapping with a "text" list (model inputs) and a
            "summary" list (targets).

    Returns:
        The tokenizer output for the "summarize: "-prefixed texts
        (padded/truncated to 512 tokens) with an added "labels" entry: the
        target token ids (padded/truncated to 150 tokens) in which every
        padding id is replaced by -100.
    """
    inputs = ["summarize: " + str(text) for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    # Targets are coerced to str the same way the inputs are.
    targets = [str(summary) for summary in examples["summary"]]
    labels = tokenizer(text_target=targets, max_length=150, truncation=True, padding="max_length")

    # -100 is the label value ignored by the loss, so padding does not contribute to it.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_seq]
        for label_seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split (80 % train / 20 % evaluation, fixed seed for a repeatable split)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Training args (no evaluation_strategy)
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train, then evaluate on the held-out split. The metrics are kept and printed:
# a bare `trainer.evaluate()` in the middle of a cell displays nothing.
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Persist the final weights and the tokenizer in the output directory so the
# fine-tuned model can be reloaded from "./t5_finetuned" directly instead of
# from an intermediate checkpoint sub-directory.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

print("✅ Model fine-tuned successfully!")


Map:   0%|          | 0/2122 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdicapriyogandhi[0m ([33mdicapriyogandhi-indian-institute-of-technology-patna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,3.1188
1000,2.7539


✅ Model fine-tuned successfully!


In [None]:
import os

model_path_fine_tuned = "./t5_finetuned"

# Report whether the fine-tuning output directory is present and, if it is, list its entries.
if not os.path.exists(model_path_fine_tuned):
    print("❌ Model directory not found. Fine-tuning might have failed.")
else:
    print("✅ Fine-tuned model directory exists.")
    print("📂 Files inside:", os.listdir(model_path_fine_tuned))


✅ Fine-tuned model directory exists.
📂 Files inside: ['checkpoint-1275']


In [None]:
pip install rouge-score pandas torch transformers


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=1dd99388677bc87f795874032086c0e727899e0af038a572a37557fb6019ecc5
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import pandas as pd
from rouge_score import rouge_scorer

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming switched on.
_ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(_ROUGE_TYPES, use_stemmer=True)

# Read back the CSVs written by the extractive and abstractive summarization cells.
extractive_df = pd.read_csv("/mnt/data/extractive_summaries.csv")
abstractive_df = pd.read_csv("/mnt/data/abstractive_summaries.csv")

def evaluate_summaries(df, summary_col, reference_col="abstract"):
    """
    Computes ROUGE Precision, Recall, and F1-score for the summaries.

    Every row's summary is scored against its reference with the module-level
    `scorer`; rows where either value is missing are skipped.

    Args:
        df: DataFrame holding both columns.
        summary_col: Name of the column with the generated summaries.
        reference_col: Name of the column with the reference texts
            (default "abstract").

    Returns:
        dict mapping each ROUGE type ("rouge1", "rouge2", "rougeL") to a dict
        with the mean "precision", "recall" and "f1" over the scored rows.
        Every value is NaN when no row could be scored.
    """
    rouge_types = ("rouge1", "rouge2", "rougeL")
    # (output key, attribute on the scorer's result object)
    metrics = (("precision", "precision"), ("recall", "recall"), ("f1", "fmeasure"))

    totals = {rt: {key: 0.0 for key, _ in metrics} for rt in rouge_types}
    scored_rows = 0

    for ref, summary in zip(df[reference_col], df[summary_col]):
        if pd.isna(ref) or pd.isna(summary):  # Skip missing values
            continue
        # Coerce to str so a non-string cell cannot break the scorer.
        scores = scorer.score(str(ref), str(summary))
        scored_rows += 1
        for rt in rouge_types:
            for key, attr in metrics:
                totals[rt][key] += getattr(scores[rt], attr)

    # Nothing scored: the mean is undefined, so report NaN instead of dividing by zero.
    if scored_rows == 0:
        return {rt: {key: float("nan") for key, _ in metrics} for rt in rouge_types}

    return {
        rt: {key: totals[rt][key] / scored_rows for key, _ in metrics}
        for rt in rouge_types
    }

# Score both summary sets. Each call uses evaluate_summaries' default reference
# column ("abstract").
extractive_scores = evaluate_summaries(extractive_df, "extractive_summary")
abstractive_scores = evaluate_summaries(abstractive_df, "abstractive_summary")

# Print one table per method: rows are precision / recall / f1, columns are the ROUGE types.
for _heading, _method_scores in (
    ("\n🔹 **Extractive Summarization Scores:**", extractive_scores),
    ("\n🔹 **Abstractive Summarization Scores:**", abstractive_scores),
):
    print(_heading)
    print(pd.DataFrame(_method_scores))



🔹 **Extractive Summarization Scores:**
             rouge1    rouge2    rougeL
precision  1.000000  0.989174  1.000000
recall     0.595819  0.588146  0.595819
f1         0.726092  0.717157  0.726092

🔹 **Abstractive Summarization Scores:**
             rouge1    rouge2    rougeL
precision  0.985047  0.936118  0.965976
recall     0.364035  0.341841  0.357057
f1         0.514045  0.483952  0.504249
