In [None]:
# Install required libraries
!pip install transformers datasets rouge-score scikit-learn


In [None]:
# Import necessary libraries
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from rouge_score import rouge_scorer
from sklearn.model_selection import KFold
import torch


In [None]:
# Step 1: Load and Preprocess Datasets

def load_pubmed_dataset():
    """Fetch the ``ccdv/pubmed-summarization`` dataset in its ``section`` configuration.

    Returns:
        Whatever ``load_dataset`` yields for that dataset/configuration pair.
    """
    return load_dataset("ccdv/pubmed-summarization", "section")

def load_arxiv_dataset():
    """Fetch the ``ccdv/arxiv-summarization`` dataset with its default configuration.

    Returns:
        Whatever ``load_dataset`` yields for that dataset.
    """
    return load_dataset("ccdv/arxiv-summarization")

def preprocess_data(pubmed_dataset, arxiv_dataset):
    """Merge the ``"train"`` splits of both corpora and standardise the column names.

    Args:
        pubmed_dataset: Mapping that provides a ``"train"`` split (PubMed).
        arxiv_dataset: Mapping that provides a ``"train"`` split (arXiv).

    Returns:
        The concatenated dataset with ``"article"`` renamed to ``"text"`` and
        ``"abstract"`` renamed to ``"summary"``.
    """
    train_splits = [pubmed_dataset["train"], arxiv_dataset["train"]]
    merged = concatenate_datasets(train_splits)

    # Applied in order: source column name -> name used by the rest of the notebook.
    column_renames = (("article", "text"), ("abstract", "summary"))
    for old_name, new_name in column_renames:
        merged = merged.rename_column(old_name, new_name)
    return merged


In [None]:
# Step 2: Train the Model

def train_model(train_dataset, val_dataset):
    """Fine-tunes ``facebook/bart-large-cnn`` on the given splits and saves its weights.

    The whole function lives in a single cell: a ``def`` cannot be continued
    in a later notebook cell.

    Args:
        train_dataset: Dataset with ``"text"`` (input) and ``"summary"`` (target)
            columns, used for training.
        val_dataset: Dataset with the same two columns, evaluated once per epoch.

    Returns:
        tuple: ``(model, tokenizer)`` - the fine-tuned model and the tokenizer
        that was used to encode the data.

    Side effects:
        Writes Trainer output to ``./results``, logs to ``./logs`` and the model
        ``state_dict`` to ``summarization_model.pkl`` in the working directory.
    """
    import inspect

    model_name = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    def tokenize_data(examples):
        """Tokenizes a batch of texts and summaries and builds the ``labels`` field."""
        inputs = tokenizer(examples["text"], max_length=1024, truncation=True, padding="max_length")
        targets = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
        # Padding positions in the labels are replaced by -100 so that the loss
        # ignores them; otherwise the model is also trained to predict padding.
        pad_id = tokenizer.pad_token_id
        inputs["labels"] = [
            [token_id if token_id != pad_id else -100 for token_id in label_ids]
            for label_ids in targets["input_ids"]
        ]
        return inputs

    train_dataset = train_dataset.map(tokenize_data, batched=True)
    val_dataset = val_dataset.map(tokenize_data, batched=True)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
    # The notebook installs an unpinned version, so use whichever name the
    # installed TrainingArguments actually accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    strategy_arg = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=2,
        logging_dir="./logs",
        **{strategy_arg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    # Only the weights (state_dict) are stored, serialised with torch.save;
    # reloading requires re-creating the model and calling load_state_dict.
    torch.save(model.state_dict(), 'summarization_model.pkl')

    return model, tokenizer


In [None]:

# Step 3: Evaluate on Test Dataset

def evaluate_model(model, tokenizer, test_dataset):
    """Evaluates the trained model with ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    For every example a summary is generated with beam search and scored
    against the reference; the per-example F-measures are then averaged and
    printed.

    Args:
        model: Sequence-to-sequence model exposing ``generate``, ``eval`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Iterable of mappings with ``"text"`` and ``"summary"`` keys.

    Returns:
        dict: Average scores under the keys ``"ROUGE-1"``, ``"ROUGE-2"`` and ``"ROUGE-L"``.

    Raises:
        ValueError: If ``test_dataset`` yields no examples (nothing to average).
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    results = []

    # Disable dropout for generation and remember where the weights live so the
    # inputs can be placed on the same device (the model may be on a GPU after training).
    model.eval()
    device = model.device

    for example in test_dataset:
        text = example["text"]
        reference_summary = example["summary"]

        # Generate summary
        inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=128,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True,
            )
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Calculate ROUGE scores (reference first, prediction second)
        scores = scorer.score(reference_summary, generated_summary)
        results.append({
            "ROUGE-1": scores["rouge1"].fmeasure,
            "ROUGE-2": scores["rouge2"].fmeasure,
            "ROUGE-L": scores["rougeL"].fmeasure,
        })

    if not results:
        raise ValueError("test_dataset is empty; cannot compute average ROUGE scores")

    # Aggregate results
    avg_rouge1 = sum(r["ROUGE-1"] for r in results) / len(results)
    avg_rouge2 = sum(r["ROUGE-2"] for r in results) / len(results)
    avg_rougeL = sum(r["ROUGE-L"] for r in results) / len(results)

    print(f"Average ROUGE-1: {avg_rouge1}")
    print(f"Average ROUGE-2: {avg_rouge2}")
    print(f"Average ROUGE-L: {avg_rougeL}")

    return {"ROUGE-1": avg_rouge1, "ROUGE-2": avg_rouge2, "ROUGE-L": avg_rougeL}


In [None]:
# Main Execution

if __name__ == "__main__":
    # Load datasets
    pubmed_dataset = load_pubmed_dataset()
    arxiv_dataset = load_arxiv_dataset()

    # Preprocess data
    train_val_dataset = preprocess_data(pubmed_dataset, arxiv_dataset)

    # Load the test dataset (CompScholar) before training so that a missing file
    # or unexpected column names are reported before the expensive training step.
    compscholar_df = pd.read_csv("Brain Dead CompScholar Dataset.csv")
    missing_columns = {"text", "summary"} - set(compscholar_df.columns)
    if missing_columns:
        # evaluate_model reads example["text"] and example["summary"].
        raise ValueError(
            f"CompScholar CSV is missing required column(s): {sorted(missing_columns)}"
        )
    test_dataset = Dataset.from_pandas(compscholar_df)

    # Implement k-fold cross-validation and select one fold
    k = 5  # Number of folds (adjust as needed)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Only the first fold is used to reduce dataset size, so take the first
    # split directly instead of looping over all folds and breaking.
    fold = 0
    train_index, val_index = next(kf.split(range(len(train_val_dataset))))
    print(f"Training on fold {fold + 1}...")
    train_dataset = train_val_dataset.select(train_index)
    val_dataset = train_val_dataset.select(val_index)

    # Train on the selected fold, then evaluate on the CompScholar test set
    model, tokenizer = train_model(train_dataset, val_dataset)
    evaluate_model(model, tokenizer, test_dataset)