In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers datasets sentencepiece torch accelerate
!pip install nltk rouge-score




In [6]:
import json
import pandas as pd
import numpy as np
import textstat
from datasets import Dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

# 🚀 STEP 1: Load ArXiv JSONL Dataset (Line-by-Line Processing)
arxiv_path = "/root/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/224/arxiv-metadata-oai-snapshot.json"

def load_jsonl(file_path, max_records=10000):
    data = []
    with open(file_path, "r") as f:
        for i, line in enumerate(f):
            try:
                data.append(json.loads(line))
                if i >= max_records:
                    break  # Load only a limited subset for efficiency
            except json.JSONDecodeError:
                print(f"❌ Skipping bad JSON line {i}")
    return pd.DataFrame(data)

arxiv_df = load_jsonl(arxiv_path)
print(f"✅ Loaded {len(arxiv_df)} records from ArXiv dataset")

# 🚀 STEP 2: Ensure Required Columns Exist
required_columns = ["title", "abstract"]
for col in required_columns:
    if col not in arxiv_df.columns:
        raise KeyError(f"❌ Missing required column: {col}")

# 🚀 STEP 3: Convert Data to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(arxiv_df)

# 🚀 STEP 4: Summarization Function (Placeholder - Replace with Model)
def dummy_summarizer(text):
    return " ".join(text.split()[:50])  # Just takes first 50 words as a basic summary

hf_dataset = hf_dataset.map(lambda x: {"summary": dummy_summarizer(x["abstract"])})

# 🚀 STEP 5: Evaluation Metrics
def evaluate_summaries(refs, preds):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = [scorer.score(r, p) for r, p in zip(refs, preds)]
    avg_rouge = {key: np.mean([s[key].fmeasure for s in rouge_scores]) for key in ["rouge1", "rouge2", "rougeL"]}

    bleu_scores = [sentence_bleu([r.split()], p.split()) for r, p in zip(refs, preds)]
    avg_bleu = np.mean(bleu_scores)

    readability = np.mean([textstat.flesch_reading_ease(p) for p in preds])

    return {"ROUGE": avg_rouge, "BLEU": avg_bleu, "Readability": readability}

# 🚀 STEP 6: Run Evaluation
refs = hf_dataset["abstract"][:100]  # First 100 abstracts
preds = hf_dataset["summary"][:100]
eval_results = evaluate_summaries(refs, preds)
print("🔹 Evaluation Results:", eval_results)

✅ Loaded 10001 records from ArXiv dataset


Map:   0%|          | 0/10001 [00:00<?, ? examples/s]

🔹 Evaluation Results: {'ROUGE': {'rouge1': np.float64(0.6502532003000658), 'rouge2': np.float64(0.6467719905238495), 'rougeL': np.float64(0.6502532003000658)}, 'BLEU': np.float64(0.39630247168280763), 'Readability': np.float64(28.107900000000004)}


In [8]:
from datasets import load_dataset
import pandas as pd
import os
import nltk
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

nltk.download('punkt')

# Load Hugging Face datasets efficiently
def load_huggingface_datasets():
    datasets = {}
    dataset_names = ["ncbi/pubmed", "ccdv/pubmed-summarization", "arxiv-community/arxiv_dataset"]
    report = "Dataset Loading Summary:\n"

    for name in dataset_names:
        print(f"Loading dataset: {name}")
        try:
            datasets[name] = load_dataset(name, split="train", streaming=True)  # Streaming for efficiency
            report += f"✅ Loaded dataset: {name}\n"
        except Exception as e:
            report += f"❌ Error loading {name}: {e}\n"

    return datasets, report

# Preprocess dataset entries
def preprocess_dataset(dataset, source, limit=5000):
    processed_data = []
    count = 0

    for entry in dataset:
        if count >= limit:
            break

        abstract = entry.get("abstract", "")
        title = entry.get("title", "")
        article = entry.get("article", "") if source != "arxiv-community/arxiv_dataset" else entry.get("body_text", "")

        processed_data.append({"title": title, "abstract": abstract, "article": article, "source": source})
        count += 1

    return pd.DataFrame(processed_data), f"Processed {count} records from {source}\n"

# Load Google Drive dataset
def load_google_drive_dataset(file_path):
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        return df.rename(columns={"Paper Title": "title", "Abstract": "abstract", "Document": "article"}), "Loaded Google Drive dataset\n"
    else:
        return pd.DataFrame(), "Google Drive dataset not found\n"

# Evaluate summaries using ROUGE and BLEU
def evaluate_summaries(reference_summaries, generated_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, gen) for ref, gen in zip(reference_summaries, generated_summaries)]
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_summaries], [gen.split() for gen in generated_summaries])

    avg_rouge = {
        "rouge1": sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores),
        "rouge2": sum(score['rouge2'].fmeasure for score in rouge_scores) / len(rouge_scores),
        "rougeL": sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores)
    }

    return avg_rouge, bleu_score

# Generate PDF report
def generate_pdf_report(report_text, output_file="summarization_report.pdf"):
    c = canvas.Canvas(output_file, pagesize=letter)
    width, height = letter

    y_position = height - 40
    for line in report_text.split('\n'):
        c.drawString(40, y_position, line)
        y_position -= 20

    c.save()
    print(f"PDF report saved as {output_file}")

# Main execution
def main():
    report_text = "Summarization Process Report\n\n"
    datasets, dataset_report = load_huggingface_datasets()
    report_text += dataset_report + "\n"

    preprocessed_dfs = []
    for name in datasets:
        if datasets[name]:
            df, preprocess_report = preprocess_dataset(datasets[name], name)
            preprocessed_dfs.append(df)
            report_text += preprocess_report + "\n"

    google_drive_file_path = "/content/drive/MyDrive/brain dead/Brain Dead CompScholar Dataset.csv"
    google_drive_df, drive_report = load_google_drive_dataset(google_drive_file_path)
    google_drive_df["source"] = "google_drive"
    report_text += drive_report + "\n"

    final_dataset = pd.concat(preprocessed_dfs + [google_drive_df], ignore_index=True)
    final_dataset.to_csv("merged_summarization_dataset.csv", index=False)
    report_text += "Merged dataset saved as 'merged_summarization_dataset.csv'\n\n"

    if 'abstract' in final_dataset.columns and 'article' in final_dataset.columns:
        rouge_scores, bleu_score = evaluate_summaries(final_dataset['abstract'], final_dataset['article'])
        report_text += f"Evaluation Results:\nROUGE: {rouge_scores}\nBLEU: {bleu_score}\n"

    generate_pdf_report(report_text)

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'reportlab'