In [None]:
from transformers import pipeline
from text_rank.evaluation import *
import torch

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Summarization pipeline built from the "KamilAin/bart-base-booksum" checkpoint.
summarizer = pipeline(
    task="summarization",
    model="KamilAin/bart-base-booksum",
    device=device,
)

# Documents to summarize, loaded through the project's read_data helper.
data = read_data("small_datasets/CNNML_tiny.csv")

In [None]:
# Summarize the first record with deterministic decoding (do_sample=False).
# truncation=True clips over-long inputs to the tokenizer's maximum length;
# without it a long document can exceed the model's input limit and fail.
summary = summarizer(
    str(data[0]),
    max_length=70,
    min_length=30,
    do_sample=False,
    truncation=True,
)
# Print the summarized text
print(summary[0]['summary_text'])

In [None]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from datasets import load_dataset

# Text prepended to every article in preprocess_function and to the example
# text at inference time, so both go through the model in the same format.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        examples: Batch mapping with an ``"article"`` list of source texts and
            a ``"highlights"`` list of target summaries.

    Returns:
        The tokenizer output for the prefixed articles (truncated/padded to
        1024 tokens) with an added ``"labels"`` entry holding the target token
        ids (truncated/padded to 128), where padding positions are replaced
        by -100.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Targets are padded to a fixed length. Replace pad ids with -100 so the
    # cross-entropy loss skips them instead of training the model to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Fast BART tokenizer; preprocess_function reads it as a module-level name.
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

# CNN/DailyMail 3.0.0: keep the training and validation splits.
dataset = load_dataset("cnn_dailymail", '3.0.0')
train_dataset, val_dataset = dataset["train"], dataset["validation"]

# Tokenize both splits in batches with the same preprocessing function.
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

In [None]:
from transformers import Trainer, TrainingArguments
# Choose the device, then load the pretrained BART checkpoint onto it.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model.to(device)

# Training hyperparameters, collected in one mapping and unpacked below.
training_config = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 10,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "max_steps": 2000,
    "do_eval": True,
    "evaluation_strategy": "epoch",
    "save_total_limit": 5,
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Fine-tune, then save the resulting weights for the inference cell.
trainer.train()
model.save_pretrained("./bart_cnn_dailymail_finetuned")

In [None]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from text_rank.evaluation import *
def generate_summary(text, model, tokenizer, max_length=30, num_beams=4):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: Source document to summarize.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode ``text`` and decode the output.
        max_length: Maximum length passed to ``model.generate`` (default 30,
            the value previously hard-coded).
        num_beams: Beam width passed to ``model.generate`` (default 4).

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    # Put the encoded inputs on the model's device so generation also works
    # when the model lives on the GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly rather than leaving generate() to infer it.
        attention_mask=inputs['attention_mask'],
        num_beams=num_beams,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Reload the fine-tuned weights; the tokenizer still comes from the base checkpoint.
model_path = "./bart_cnn_dailymail_finetuned"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

# Build the model input from the first record, using the same prefix as training.
data = read_data("small_datasets/CNNML_tiny.csv")
first_record = str(data[0])
example_text = prefix + first_record

summary = generate_summary(example_text, model, tokenizer)
print(summary)

In [None]:
from datasets import load_dataset, concatenate_datasets

def data_download(output_path='./tiny_CNN_DM/full_dataset.csv'):
    """Download the tiny CNN/DailyMail dataset and save all splits as one CSV.

    Loads ``kmyoo/cnn-dailymail-v1-tiny``, concatenates its ``train``,
    ``validation`` and ``test`` splits (in that order) and writes the result
    to ``output_path``.

    Args:
        output_path: Destination CSV file. Defaults to the path previously
            hard-coded, ``./tiny_CNN_DM/full_dataset.csv``.
    """
    import os

    dataset = load_dataset('kmyoo/cnn-dailymail-v1-tiny')
    merged_dataset = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])

    # Create the target directory first so a fresh checkout does not fail on
    # the write.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    merged_dataset.to_csv(output_path)

In [None]:
# Fetch the dataset and write the merged CSV under ./tiny_CNN_DM/
# (data_download calls load_dataset and to_csv).
data_download()

In [42]:
import pandas as pd

# Read the merged CSV produced by data_download() into a DataFrame.
csv_file_path = './tiny_CNN_DM/full_dataset.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [43]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 5/6 train, 1/6 test split is the same on every run.
RANDOM_SEED = 42
train_df, test_df = train_test_split(df, test_size=1/6, random_state=RANDOM_SEED)

In [45]:
# Write each split to its own CSV without the DataFrame index column.
for split_frame, split_path in (
    (train_df, './tiny_CNN_DM/train_dataset.csv'),
    (test_df, './tiny_CNN_DM/test_dataset.csv'),
):
    split_frame.to_csv(split_path, index=False)