In [None]:
!pip install transformers
!pip install datasets
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=30b0d7909052a44241914cea5f007f1f03665e38cde7460bd98f0aaea2e6c7f1
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rouge import Rouge
import rouge_score

In [None]:
# Colab only: mount Google Drive at /content/drive so that files stored there
# can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the JSON-lines file on the mounted Drive (one record per line).
ELIFE_JSONL_PATH = "/content/drive/MyDrive/eLife_rouge.jsonl"

# Parse every line of the file as one row of the DataFrame.
df = pd.read_json(ELIFE_JSONL_PATH, lines=True)

In [None]:
import json
import pandas as pd
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# ROUGE metric object, kept at module level under the name `rouge`.
rouge = load_metric("rouge")

# Checkpoint name shared by the config, the tokenizer and the model.
model_name = "gpt2"

# Start from the checkpoint's configuration and attach text-generation defaults.
config = GPT2Config.from_pretrained(model_name)
config.task_specific_params = {
    'text-generation': dict(do_sample=True, max_length=50, temperature=0.7),
}

# Use the end-of-sequence token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Language-model head on top of the pretrained weights, built with the config above.
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

def build_inputs(text, summary, tokenizer, max_length=1024):
    """Encode one (article, summary) pair as a single causal-LM training example.

    A decoder-only LM head scores ``labels`` against the *same* positions of
    ``input_ids`` (the one-token shift happens inside the model), so both must
    describe one token sequence. The example is laid out as::

        article tokens + EOS + summary tokens + EOS + padding

    ``labels`` is a copy of that sequence in which the article, the separating
    EOS and the padding are replaced by -100, so only the summary and its
    closing EOS contribute to the loss.

    The summary may use at most half of the window (minus the two EOS slots);
    the article is truncated to whatever room is left.

    Args:
        text: Source article.
        summary: Target lay summary.
        tokenizer: Object exposing ``encode``, ``eos_token_id`` and
            ``pad_token_id``.
        max_length: Fixed length of every returned tensor; must be >= 4.

    Returns:
        Dict with keys ``input_ids``, ``attention_mask`` and ``labels``; each
        value is a 1-D ``torch.long`` tensor of length ``max_length`` (no
        leading batch axis, so a collator can stack examples directly).

    Raises:
        ValueError: If ``max_length`` is smaller than 4.
    """
    if max_length < 4:
        raise ValueError("max_length must be at least 4")

    ignore_index = -100  # label value skipped by the cross-entropy loss
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id

    # Two slots are reserved for the separator EOS and the closing EOS.
    summary_budget = (max_length - 2) // 2
    summary_ids = list(
        tokenizer.encode(
            summary,
            add_special_tokens=False,
            truncation=True,
            max_length=summary_budget,
        )
    )[:summary_budget]

    article_budget = max_length - 2 - len(summary_ids)
    article_ids = list(
        tokenizer.encode(
            text,
            add_special_tokens=False,
            truncation=True,
            max_length=article_budget,
        )
    )[:article_budget]

    input_ids = article_ids + [eos_id] + summary_ids + [eos_id]
    labels = [ignore_index] * (len(article_ids) + 1) + summary_ids + [eos_id]

    n_real = len(input_ids)
    n_pad = max_length - n_real
    attention_mask = [1] * n_real + [0] * n_pad
    input_ids = input_ids + [pad_id] * n_pad
    labels = labels + [ignore_index] * n_pad

    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long),
    }


def compute_metrics(eval_pred):
    """Compute ROUGE F-scores and exact-match accuracy for an evaluation pass.

    Predictions are the token-level argmax of the evaluation logits. Because a
    causal LM's output at position ``i`` predicts token ``i + 1``, predictions
    are shifted against the labels before decoding. Positions whose label is
    -100 (ignored by the loss) are blanked out in both predictions and labels,
    so only the scored region is compared.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            logits (one more axis than ``labels``) or token ids.

    Returns:
        Dict with ``rouge1_f``, ``rouge2_f``, ``rougeL_f`` and ``accuracy``.
        Pairs that the ROUGE scorer rejects (e.g. an empty hypothesis) count
        as 0.0 towards the averages instead of aborting the evaluation.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the logits.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry one extra (vocabulary) axis compared with the labels.
    pred_ids = predictions.argmax(axis=-1) if predictions.ndim > labels.ndim else predictions

    # Flatten any stray singleton axes to (examples, sequence).
    seq_len = labels.shape[-1]
    pred_ids = pred_ids.reshape(-1, seq_len)
    labels = labels.reshape(-1, seq_len)

    # Align "prediction made at position i" with "token at position i + 1".
    pred_ids = pred_ids[:, :-1]
    labels = labels[:, 1:]

    # -100 is not a vocabulary id; replace it with the pad id, which is a
    # special token and is dropped by skip_special_tokens below.
    pad_id = tokenizer.pad_token_id
    scored = labels != -100
    pred_ids = np.where(scored, pred_ids, pad_id)
    labels = np.where(scored, labels, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    n_examples = len(labels_str)
    f_totals = {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}
    exact_matches = 0

    scorer = Rouge()  # distinct name: do not shadow the module-level `rouge`
    for hyp, ref in zip(pred_str, labels_str):
        exact_matches += int(hyp == ref)
        try:
            pair_scores = scorer.get_scores(hyp, ref)[0]
        except ValueError:
            # The scorer raises on an empty hypothesis/reference; score it as 0.
            continue
        for key in f_totals:
            f_totals[key] += pair_scores[key]["f"]

    denom = n_examples if n_examples else 1  # avoid dividing by zero on an empty set

    results = {
        "rouge1_f": f_totals["rouge-1"] / denom,
        "rouge2_f": f_totals["rouge-2"] / denom,
        "rougeL_f": f_totals["rouge-l"] / denom,
        "accuracy": exact_matches / denom,
    }

    return results

# Training configuration
training_args = TrainingArguments(
    output_dir="./lay_summary_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    # Evaluation produces one logit per vocabulary entry per position. Keep the
    # eval batches small and move each batch's outputs to the CPU right away
    # instead of accumulating the whole evaluation set on the accelerator.
    per_device_eval_batch_size=2,
    eval_accumulation_steps=1,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    # Log once per epoch so the training loss is reported next to the
    # validation loss; the step-based default interval is longer than this
    # short run, which would leave the training-loss column empty.
    logging_strategy="epoch",
    learning_rate=5e-5,
)

# Work on the first 100 records and hold out 20% of them for evaluation.
# The fixed random_state makes the split repeatable.
subset_df = df.head(100)
train_df, eval_df = train_test_split(subset_df, test_size=0.2, random_state=42)


def _encode_frame(frame):
    """Run build_inputs on every (article, lay_summary) pair of ``frame``, in row order."""
    return [
        build_inputs(article, lay_summary, tokenizer)
        for article, lay_summary in zip(frame['article'], frame['lay_summary'])
    ]


train_dataset = _encode_frame(train_df)
eval_dataset = _encode_frame(eval_df)

# Wire model, data and the custom metric function into the Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune; evaluation runs according to training_args.
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./lay_summary_model")


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
