In [None]:
# Part 1: Importing the Dataset and Libraries

# Import necessary libraries
import pandas as pd
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Location of the input CSV (replace with your file path)
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the
# CSV was written with its index; consider index_col=0 if that column is unwanted.
csv_path = '/content/Text Summarization Data - Text Summarization Data.csv'
df = pd.read_csv(csv_path)

# Show the first rows as the cell output to confirm the load worked
df.head()


Using device: cuda


Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


In [None]:
# Part 2: Model Loading and Summarization (inference only; no training is performed)

# Identifier of the pre-trained checkpoint shared by the tokenizer and the model
model_name = "facebook/bart-large-cnn"

# Load the tokenizer, then the model, from that same checkpoint
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Place the model on the selected device (GPU if available, otherwise CPU)
model = model.to(device)

# Define a Summarization Function with GPU support
def summarize_text(text, model, tokenizer, device, max_input_length=1024, max_output_length=150):
    """Summarize a single piece of text using beam search generation.

    Args:
        text: The text to summarize. ``None``, a float NaN (how pandas
            represents a missing CSV cell) or a blank string is treated as
            "nothing to summarize".
        model: Object exposing ``generate(input_ids, ...)``; in this notebook
            the BART model loaded above.
        tokenizer: Callable that encodes ``text`` into tensors and exposes
            ``decode(ids, skip_special_tokens=...)``.
        device: Device the encoded tensors are moved to before generation.
            It should be the device the model lives on.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_output_length: Passed to ``generate`` as ``max_length``.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarize (the tokenizer and model are not called in that case).
    """
    # A missing cell reaches us as None or as float NaN (NaN is the only float
    # that is not equal to itself). Without this guard the tokenizer raises on
    # the first missing row and aborts the whole DataFrame.apply run.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or (isinstance(text, str) and not text.strip()):
        return ""

    # Tokenize the input text, truncating anything beyond max_input_length tokens
    inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True, padding=True)

    # Move input tensors to the selected device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate the summary. The attention mask is forwarded explicitly so that
    # generate() does not have to infer which positions are padding.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        max_length=max_output_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    # Only one text was encoded, so the first sequence is the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Run every entry of the 'highlights' column through the summarizer, one row at a
# time, and store the results in a new 'summarized_highlights' column
df['summarized_highlights'] = df['highlights'].apply(
    summarize_text, args=(model, tokenizer, device)
)


In [None]:
# Optional: show the original highlights next to their generated summaries
# for the first few rows, to check that the summarization worked
df.loc[:, ['highlights', 'summarized_highlights']].head()

In [None]:

# Write the DataFrame, now including the summarized highlights, to a new CSV file
# (index=False keeps the row index out of the file)
output_csv = 'summarized_highlights.csv'
df.to_csv(output_csv, index=False)

# Optional: report completion
print("Summarization complete. The summarized highlights have been saved to 'summarized_highlights.csv'.")