In [55]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

In [48]:
# Load the dataset (replace 'your_dataset_name' with the actual name)
dataset = pd.read_csv(r'
ews.csv')

In [49]:
# Perform train-test split (80% train, 20% test)
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

In [53]:
train_df

Unnamed: 0,id,article,highlights
29,001c839e1d76c400129f6c2799957c74e9895815,A Hero teacher who saved the life of a serious...,"Teacher Ray Coe gave pupil Alya Ahmed Ali,13, ..."
535,018e3016e48a4aaf442a2db16d4b1143d3b662ea,Police investigating the disappearance of Made...,"Police want to speak to ex-pat Robert Murat, P..."
695,02060be908e4894aa26922fbc14a2aea1a4163f1,The full scale of spending on private contract...,£3.2m spent on Tasers with millions more on ot...
557,01a033acb35063f347289196cb89e8afaef802c2,(EW.com) -- You don't need to look very hard t...,The Tony Awards hold the dearest spot in Neil ...
836,0264c489ba126938bc91c07211df95804e41ff67,"By . Nazia Parveen . PUBLISHED: . 18:09 EST, 1...",The three girls were 'shaken like dolls' by tw...
...,...,...,...
106,0057863e126ceb0f22053aa1570a14977e5803ff,By . Matt Barlow . Follow @@Matt_Barlow_DM . N...,Neymar out of the tournament after injury late...
270,00d48bf5089a093edf12ef5f3836780e97be6be0,"(CNN) -- These days, no fashion house portfoli...",Tommy Hilfiger and Karl Lagerfeld are the late...
860,0276ac7a8bc00bdc47e1e99774ee7d5017f0f0a8,Arsenal midfielder Mikel Arteta was visibly di...,Borussia Dortmund score either side of half ti...
435,01462daeb1d2b20447926ffd47368bdf0116db4a,A onetime California school official was sente...,"Andrea Cardosa, a former assistant principal, ..."


In [56]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [57]:
# Load the pre-trained tokenizer and model for summarization (e.g., T5 or BART)
model_name = "facebook/bart-base"  # Or "t5-small" for smaller model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [58]:
# Preprocessing function for tokenizing inputs and outputs
def preprocess_function(examples):
    # Tokenize input (article) and output (highlights)
    inputs = tokenizer(examples['article'], max_length=1024, padding='max_length', truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['highlights'], max_length=128, padding='max_length', truncation=True)
    
    inputs['labels'] = labels['input_ids']
    return inputs

In [59]:
# Apply the preprocessing to both training and testing datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [60]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True  # To enable text generation during evaluation
)

# Initialize Seq2SeqTrainer for summarization tasks
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)



In [61]:
# Start training
trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.348780393600464, 'eval_runtime': 1.1453, 'eval_samples_per_second': 174.63, 'eval_steps_per_second': 11.351, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.543712854385376, 'eval_runtime': 1.1481, 'eval_samples_per_second': 174.2, 'eval_steps_per_second': 11.323, 'epoch': 2.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.33656644821167, 'eval_runtime': 1.3084, 'eval_samples_per_second': 152.854, 'eval_steps_per_second': 9.936, 'epoch': 3.0}
{'train_runtime': 49.2765, 'train_samples_per_second': 48.705, 'train_steps_per_second': 3.044, 'train_loss': 4.096498209635417, 'epoch': 3.0}


TrainOutput(global_step=150, training_loss=4.096498209635417, metrics={'train_runtime': 49.2765, 'train_samples_per_second': 48.705, 'train_steps_per_second': 3.044, 'total_flos': 1463367499776000.0, 'train_loss': 4.096498209635417, 'epoch': 3.0})

In [62]:
# Start evaluation
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.33656644821167,
 'eval_runtime': 1.3859,
 'eval_samples_per_second': 144.306,
 'eval_steps_per_second': 9.38,
 'epoch': 3.0}

In [63]:
# Save the trained model
trainer.save_model("./trained_model")


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [64]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_metric
import torch

# Load the fine-tuned model and tokenizer (adjust paths as necessary)
model = AutoModelForSeq2SeqLM.from_pretrained('./trained_model')
tokenizer = AutoTokenizer.from_pretrained('./trained_model')

In [67]:
# Function to generate summaries
def generate_summary(batch):
    # Tokenize the articles (batch input)
    inputs = tokenizer(batch['article'], max_length=1024, padding='max_length', truncation=True, return_tensors="pt")
    
    # Move inputs to GPU if available
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    # Generate summaries for the entire batch
    with torch.no_grad():
        summary_ids = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)
    
    # Decode the generated summaries (batch decoding)
    batch['generated_summary'] = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
    
    return batch


# Apply the generation function to the test dataset
test_results = tokenized_test_dataset.map(generate_summary, batched=True, batch_size=16)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [68]:
# Display a few generated summaries with actual summaries
for i in range(5):
    print(f"Article {i+1}:\n", test_results[i]['article'])
    print(f"Generated Summary {i+1}:\n", test_results[i]['generated_summary'])
    print(f"Actual Summary {i+1}:\n", test_results[i]['highlights'])
    print("---------------------------------------------------------")

Article 1:
 Experts have used a radical new satellites to capture unique psychedelic images of the ruptures in the Earth's crust caused by the Napa earthquake. Radar images from the UK's Sentinel-1A satellite have been used to map the biggest earthquake that has shaken northern California in 25 years. The images reveal the rupture is larger than previously thought. By processing two Sentinel-1A images, which were acquired on 7 August and 31 August 2014 over this wine-producing region, an interferogram was generated. The two round shapes around Napa valley, which are visible in the central part of the image, show how the ground moved during the quake. The satellite uses a technique called 'Synthetic aperture radar interferometry'. This uses two or more satellite radar images of the same area are combined to detect large-scale surface changes. Small changes on the ground modify the reflected radar signal and lead to rainbow-coloured fringes in the 'interferogram'. Each colour cycle corre