In [19]:
# ============================
# Step 1: Install Dependencies
# ============================
!pip install transformers datasets

# =======================
# Step 2: Load Your CSV
# =======================
import pandas as pd

# Load from uploaded path
df = pd.read_csv("/content/AI_scopus.csv")

# ✅ Combine title + abstract as input
df['full_text'] = df['Title'].fillna('') + '. ' + df['Abstract'].fillna('')

# ✅ Use abstract as the summary target
df['summary'] = df['Abstract'].fillna('')

# Filter to only the relevant data
df = df[['full_text', 'summary']].dropna().reset_index(drop=True)

print(f"✅ Dataset ready with {len(df)} entries.")
df.head()

# ========================================
# Step 3: Convert to Hugging Face Dataset
# ========================================
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

# ============================
# Step 4: Tokenization Utility
# ============================
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def preprocess(example):
    inputs = tokenizer(example['full_text'], max_length=1024, padding='max_length', truncation=True)
    targets = tokenizer(example['summary'], max_length=256, padding='max_length', truncation=True)
    inputs['labels'] = targets['input_ids']
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=['full_text', 'summary'])

# ======================
# Step 5: Load the Model
# ======================
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# ===============================
# Step 6: Set Training Parameters
# ===============================
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,
    logging_steps=10,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# ========================
# Step 7: Train the Model
# ========================
trainer.train()

# ===========================
# Step 8: Test on One Sample
# ===========================
sample_text = df['full_text'].iloc[0]
true_summary = df['summary'].iloc[0]

# Generate summary
inputs = tokenizer(sample_text, return_tensors='pt', max_length=1024, truncation=True)
summary_ids = model.generate(inputs['input_ids'], max_length=256, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n🔹 ORIGINAL TEXT (Preview):\n", sample_text[:500], "...\n")
print("🔹 TRUE SUMMARY:\n", true_summary)
print("🔹 GENERATED SUMMARY:\n", generated_summary)


✅ Dataset ready with 1387 entries.


Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muditnaryan7906[0m ([33muditnaryan7906-iit-patna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.0437
20,0.7571
30,0.3039
40,0.1927
50,0.1114
60,0.0728
70,0.0499
80,0.0337
90,0.0288
100,0.0241


KeyboardInterrupt: 

In [18]:
# Step 1: Imports
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import torch

# Step 2: Load the CNN/DailyMail summarization dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Step 3: Load the tokenizer and model (facebook/bart-base)
model_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 4: Preprocessing function
def preprocess_function(examples):
    inputs = examples["article"]
    targets = examples["highlights"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Step 5: Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Step 6: Use smaller subset for faster training
small_train_dataset = tokenized_datasets["train"].select(range(2000))
small_eval_dataset = tokenized_datasets["validation"].select(range(500))

# Step 7: Load ROUGE metric
rouge = evaluate.load("rouge")

# Step 8: Define metric function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

# Step 9: Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 10: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Set to 'wandb' if using Weights & Biases
    fp16=torch.cuda.is_available()  # Use mixed precision if GPU available
)

# Step 11: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 12: Train the model
trainer.train()

# Step 13: Evaluate the model
metrics = trainer.evaluate()
print(metrics)


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'