In [1]:
# Install required packages
!pip install transformers==4.36.2 rouge-score==0.1.2 pandas torch

# Import libraries
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer
import torch

# Load the pretrained T5 checkpoint: the tokenizer and the seq2seq model
# both come from the same checkpoint name.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Function to summarize text using T5
def t5_summarize(text, max_input_length=512, max_length=100, min_length=30,
                 length_penalty=2.0, num_beams=4):
    """Summarize ``text`` with the module-level T5 ``model`` and ``tokenizer``.

    The text is prefixed with ``"summarize: "``, tokenized (truncated to
    ``max_input_length`` tokens), moved to ``device`` and decoded with beam
    search.

    Args:
        text: The document to summarize.
        max_input_length: Token limit applied to the prefixed input; longer
            inputs are truncated.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        length_penalty: ``length_penalty`` forwarded to ``model.generate``.
        num_beams: Number of beams forwarded to ``model.generate``.

    Returns:
        The decoded summary with special tokens stripped, or ``""`` when
        ``text`` is not a string or contains only whitespace.
    """
    # A missing cell read from a CSV is not a str (pandas yields NaN), and
    # "summarize: " + NaN would raise TypeError; blank text has nothing to
    # summarize. Return an empty summary instead of calling the model.
    if not isinstance(text, str) or not text.strip():
        return ""

    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the tokenizer's mask explicitly rather than letting generate()
        # infer it from the pad token.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    # generate() returns a batch; there is a single input, so take row 0.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to compute ROUGE scores
def compute_rouge(summary, reference):
    """Score ``summary`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A new stemming ``RougeScorer`` is built on every call.

    Args:
        summary: The generated text to evaluate.
        reference: The text the summary is compared against.

    Returns:
        The result of ``RougeScorer.score`` for the three metrics.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The scorer is called with the reference first and the summary second.
    scores = rouge.score(reference, summary)
    return scores

# Load Organic and Synthetic datasets
organic_df = pd.read_csv('/content/organic_data.csv')
synthetic_df = pd.read_csv('/content/synthetic_data.csv')

# Column order of every result record / result DataFrame.
RESULT_COLUMNS = ['id', 'original', 'summary', 'rouge']


def summarize_and_score(df, text_col='text', reference_col='text', id_col='id'):
    """Summarize every row of ``df`` and score each summary with ROUGE.

    Args:
        df: DataFrame holding the documents to summarize.
        text_col: Column containing the text passed to ``t5_summarize``.
        reference_col: Column containing the text the summary is scored
            against. Defaults to the source text itself.
        id_col: Column copied into the ``'id'`` field of each record.

    Returns:
        A list with one dict per row, keyed by ``'id'``, ``'original'``,
        ``'summary'`` and ``'rouge'``.

    Raises:
        KeyError: If any of the requested columns is absent from ``df``.
    """
    # NOTE(review): with the default reference_col the summary is scored
    # against its own source text, not against a separate reference summary.
    # Pass reference_col explicitly if the data carries reference summaries.
    missing = [col for col in (id_col, text_col, reference_col) if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    records = []
    for _, row in df.iterrows():
        summary = t5_summarize(row[text_col])
        rouge = compute_rouge(summary, row[reference_col])
        records.append({
            'id': row[id_col],
            'original': row[text_col],
            'summary': summary,
            'rouge': rouge,
        })
    return records


# Summarize and Evaluate Organic Data
organic_results = summarize_and_score(organic_df)

# Summarize and Evaluate Synthetic Data
synthetic_results = summarize_and_score(synthetic_df)

# Convert results to DataFrames.
# Fixing the columns keeps them present even when a dataset has no rows, so
# later column selection does not fail on an empty result list.
organic_summary_df = pd.DataFrame(organic_results, columns=RESULT_COLUMNS)
synthetic_summary_df = pd.DataFrame(synthetic_results, columns=RESULT_COLUMNS)

# Show results: print a heading for each dataset, then render the id,
# summary and ROUGE columns of its result frame.
shown_columns = ['id', 'summary', 'rouge']
for heading, frame in (
    ("Organic Data Summarization Results:", organic_summary_df),
    ("Synthetic Data Summarization Results:", synthetic_summary_df),
):
    print(heading)
    display(frame[shown_columns])


Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score==0.1.2
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.12

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Organic Data Summarization Results:


Unnamed: 0,id,summary,rouge
0,1,the Document Academy is a platform for researc...,"{'rouge1': (1.0, 1.0, 1.0), 'rouge2': (1.0, 1...."
1,2,natural language processing has become crucial...,"{'rouge1': (0.5, 1.0, 0.6666666666666666), 'ro..."


Synthetic Data Summarization Results:


Unnamed: 0,id,summary,rouge
0,1,researchers discuss the role of documents in s...,"{'rouge1': (0.6538461538461539, 1.0, 0.7906976..."
1,2,NLP plays a vital role in automating language ...,"{'rouge1': (0.5, 1.0, 0.6666666666666666), 'ro..."
