In [None]:
# Step 1: Install Required Libraries
!pip install spacy transformers datasets torch sentencepiece
!python -m spacy download en_core_web_sm
!pip install datasets

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Step 2: Mount Google Drive
# Mounts Drive at /content/drive; the dataset archive unzipped in Step 3 is
# read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 3: Unzip Dataset (replace path with your dataset location)
!unzip "/content/drive/MyDrive/archive (6).zip" -d "/content/data"

Archive:  /content/drive/MyDrive/archive (6).zip
replace /content/data/cnn_dailymail/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Step 4: Import Required Libraries
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import defaultdict

In [None]:
# Step 5: Load and Explore Dataset
# Read the three CSV splits in train / validation / test order.
train_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/data/cnn_dailymail/train.csv',
        '/content/data/cnn_dailymail/validation.csv',
        '/content/data/cnn_dailymail/test.csv',
    )
)

# Report the (rows, columns) shape of every split.
print("Training data shape:", train_data.shape)
print("Validation data shape:", validation_data.shape)
print("Test data shape:", test_data.shape)

# Last expression of the cell: preview the first rows of the training split.
train_data.head()

Training data shape: (287113, 3)
Validation data shape: (13368, 3)
Test data shape: (11490, 3)


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# Step 6: Text Preprocessing
def clean_text(text):
    """Normalise whitespace in ``text``.

    Newlines, tabs and runs of spaces are collapsed into single spaces and
    leading/trailing whitespace is dropped.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas
            represents a missing CSV cell) are treated as empty text.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # `text != text` is only true for NaN; guard missing cells so that
    # `.apply(clean_text)` does not fail on an incomplete row.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # str.split() with no argument splits on any whitespace (including '\n'),
    # so a separate newline replacement is unnecessary.
    return ' '.join(text.split())

In [None]:
# Apply cleaning to all datasets
# Each split gets a new 'cleaned_article' column derived from its 'article' column.
train_data['cleaned_article'] = train_data['article'].map(clean_text)
validation_data['cleaned_article'] = validation_data['article'].map(clean_text)
test_data['cleaned_article'] = test_data['article'].map(clean_text)

In [None]:
# Step 7: Extractive Summarization with spaCy
# Load the "en_core_web_sm" pipeline once at module level; extractive_summary()
# below uses this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
def extractive_summary(text, num_sentences=3, lead_sentences=3, lead_bonus=50):
    """Build an extractive summary from the highest-scoring sentences.

    A sentence's score is the sum of the document-wide frequencies of its
    tokens (stop words and non-alphabetic tokens contribute 0). The first
    ``lead_sentences`` sentences additionally receive a flat ``lead_bonus``.
    The selected sentences are returned in their original document order.

    Args:
        text: Document to summarise.
        num_sentences: Maximum number of sentences to keep.
        lead_sentences: How many leading sentences receive the position bonus.
        lead_bonus: Score added to each of those leading sentences.

    Returns:
        The selected sentences joined by single spaces; an empty string when
        ``num_sentences`` is not positive or the text yields no sentences.
    """
    if num_sentences <= 0:
        # A negative bound in the slice below would pick sentences from the
        # wrong end of the ranking instead of returning nothing.
        return ''

    doc = nlp(text)
    sentences = list(doc.sents)

    # Document-wide frequency of every alphabetic, non-stop-word token.
    word_freq = defaultdict(int)
    for token in doc:
        lowered = token.text.lower()
        if lowered not in STOP_WORDS and token.text.isalpha():
            word_freq[lowered] += 1

    # Score the sentence spans of the already-parsed doc directly; running the
    # pipeline again on each sentence string would repeat the parsing work.
    scored = []
    for index, sentence in enumerate(sentences):
        score = sum(word_freq.get(token.text.lower(), 0) for token in sentence)
        if index < lead_sentences:
            score += lead_bonus
        scored.append((index, score))

    # sorted() is stable, so ties keep document order.
    ranked = sorted(scored, key=lambda item: item[1], reverse=True)
    top_indices = sorted(index for index, _ in ranked[:num_sentences])
    return ' '.join(sentences[index].text for index in top_indices)

In [None]:
# Test extractive summarization
# Summarise the first cleaned test article and print it under a heading line.
sample_text = test_data['cleaned_article'].iloc[0]
print("Extractive Summary:", extractive_summary(sample_text), sep="\n")

Extractive Summary:
With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches.


In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
# Load pre-trained Pegasus model and tokenizer
# Tokenizer and model are loaded from the same checkpoint name; generate_summary()
# and fine_tune_model() below use these module-level `tokenizer` and `model` objects.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
# Function to generate summaries
def generate_summary(text, max_length=128):
    """Generate an abstractive summary of ``text`` with the loaded Pegasus model.

    Args:
        text: Source text to summarise.
        max_length: Upper bound on the length (in tokens) of the generated summary.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Truncate the input to what the checkpoint's position embeddings can
    # address instead of a fixed 1024.
    # NOTE(review): google/pegasus-xsum appears to be configured for 512
    # positions — confirm against model.config.
    max_input_length = min(1024, model.config.max_position_embeddings)
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding="longest",
    )
    # Keep the input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so padded positions are ignored when a
        # batch of texts is summarised.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    # Only the first sequence is decoded, matching single-text usage.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Fine-tuning the model
def fine_tune_model():
    """Fine-tune the loaded Pegasus model on CNN/DailyMail and save the result.

    Loads the "cnn_dailymail" dataset (config "3.0.0"), tokenizes articles as
    inputs and highlights as labels, trains with ``Trainer`` and writes the
    model and tokenizer to ``./fine_tuned_pegasus``.
    """
    # Local import: the file's top-level transformers import does not include
    # the seq2seq collator.
    from transformers import DataCollatorForSeq2Seq

    # Load a dataset for fine-tuning (e.g., CNN/DailyMail)
    dataset = load_dataset("cnn_dailymail", "3.0.0")

    # Truncate inputs to what the checkpoint's position embeddings can address.
    max_input_length = min(1024, model.config.max_position_embeddings)

    def preprocess_function(examples):
        """Tokenize a batch of articles and attach tokenized highlights as labels."""
        # No padding here: padding to the longest item of each map() chunk
        # produces different lengths per chunk; the collator pads per batch.
        model_inputs = tokenizer(
            examples["article"], max_length=max_input_length, truncation=True
        )
        # `text_target` replaces the deprecated as_target_tokenizer() context.
        labels = tokenizer(
            text_target=examples["highlights"], max_length=128, truncation=True
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    # Pads inputs per batch and pads labels with -100 so that padding
    # positions are ignored by the loss.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
    )

    # Initialize Trainer (its keyword names are train_dataset / eval_dataset).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
    )

    # Fine-tune the model
    trainer.train()
    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_pegasus")
    tokenizer.save_pretrained("./fine_tuned_pegasus")

In [None]:
# Example usage
# The recorded output shows both summaries, so the guarded branch below runs
# in this notebook as well as when the code is executed as a script.
if __name__ == "__main__":
    # Generate a summary using the pre-trained model
    text = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem-solving".
    """
    summary = generate_summary(text)
    print("Generated Summary:", summary)

    # Fine-tune the model (uncomment to run fine-tuning)
    # fine_tune_model()
# Second example: these statements sit outside the guard above and rebind the
# module-level names `text` and `summary` to the linguistics example.
text = "To begin, A career that an individual can apply for with a linguistics degree is a professor. A professorial career in linguistics is the highest level of expertise in the field of study. Professors of linguistics are crucial for the development of linguistic research, teaching potential linguists, and our knowledge of language. The influence they have on the next generation of language researchers and scholars is so much more than just the lecture hall. Typical tasks include creating and delivering educational courses, carrying out original research, and publishing academic works. A Ph.D. in linguistics or a closely related subject that shows a solid understanding of language ideas, techniques, and practices is one of the required academic credentials. More basic skills are also important like critical thinking, good communication, and the ability to guide and inspire others."
summary = generate_summary(text)
print("Generated Summary:", summary)

Generated Summary: Artificial intelligence is the study of machines that mimic the human mind, such as learning and problem-solving.
Generated Summary: linguistics is a branch of science that deals with the study of language and the way it affects our daily lives.
