In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re

# Function to clean and preprocess individual articles
def preprocess_article(raw_text):
    """
    Turn one raw article into a single lowercase, whitespace-normalised line.

    Steps, in order:
      1. The "[SN]...[SN]" section markers are rewritten to short tags
         (<URL>, <TITLE>, <FIRST>, <BODY>).
      2. Every run of whitespace (including newlines) becomes one space.
      3. Anything from a <URL> tag up to the next <TITLE> tag is dropped,
         leaving just the <TITLE> tag.
      4. The text is lowercased (tags included) and outer whitespace is
         stripped.

    Args:
        raw_text: Raw article text as a string.

    Returns:
        The cleaned string.
    """
    # Applied in this order; each raw marker maps to one structural tag.
    marker_tokens = (
        ("[SN]URL[SN]", "<URL>"),
        ("[SN]TITLE[SN]", "<TITLE>"),
        ("[SN]FIRST-SENTENCE[SN]", "<FIRST>"),
        ("[SN]RESTBODY[SN]", "<BODY>"),
    )
    cleaned = raw_text
    for marker, token in marker_tokens:
        cleaned = cleaned.replace(marker, token)

    # Collapse whitespace first so the URL span sits on one line.
    cleaned = re.compile(r"\s+").sub(" ", cleaned)

    # Non-greedy match: only the text between a <URL> and the nearest
    # following <TITLE> is removed.
    cleaned = re.compile(r"<URL>.*?<TITLE>", re.DOTALL).sub("<TITLE>", cleaned)

    return cleaned.lower().strip()

# Function to preprocess summaries
def preprocess_summary(raw_text):
    """
    Turn one raw summary into a single lowercase, whitespace-normalised line.

    Only the URL and TITLE markers are rewritten to tags here; any other
    "[SN]...[SN]" marker is left as-is (and lowercased with the rest of the
    text). The span from a <URL> tag up to the next <TITLE> tag is removed,
    so the result normally starts at the title tag.

    Args:
        raw_text: Raw summary text as a string.

    Returns:
        The cleaned string.
    """
    # Marker -> tag substitution.
    tagged = raw_text.replace("[SN]URL[SN]", "<URL>").replace("[SN]TITLE[SN]", "<TITLE>")

    # Any whitespace run (newlines too) becomes a single space.
    single_line = re.sub(r"\s+", " ", tagged)

    # Drop the URL section, keeping the <TITLE> tag that terminates it.
    without_url = re.sub(r"<URL>.*?<TITLE>", "<TITLE>", single_line, flags=re.DOTALL)

    return without_url.lower().strip()

# Function to split one raw dataset file into its summary and article parts
import re

def process_dataset(input_file, article_file, summary_file,
                    summary_line_count=5, article_start_line=6):
    """
    Split one raw data file into a cleaned summary file and a cleaned article file.

    The first ``summary_line_count`` lines of ``input_file`` form the summary
    part and everything from line index ``article_start_line`` onward forms
    the article part. With the defaults (5 and 6) the single line at index 5
    is dropped, which matches the previous hard-coded behaviour.

    Inputs shorter than expected no longer raise ``IndexError``: whatever
    lines exist are used, and a missing part is written as an empty file.

    Args:
        input_file: Path (str or path-like) of the raw file, read as UTF-8.
        article_file: Path the cleaned article text is written to (UTF-8).
        summary_file: Path the cleaned summary text is written to (UTF-8).
        summary_line_count: Number of leading lines that make up the summary.
        article_start_line: Zero-based index of the first article line.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Read everything before opening the outputs: opening with "w" truncates,
    # so a failed read must not leave behind empty output files.
    with open(input_file, "r", encoding="utf-8") as infile:
        fc = infile.readlines()

    # Slicing tolerates short files and avoids repeated string concatenation.
    fs = preprocess_summary("".join(fc[:summary_line_count]))
    fa = preprocess_article("".join(fc[article_start_line:]))

    with open(article_file, "w", encoding="utf-8") as article_out:
        article_out.write(fa)
    with open(summary_file, "w", encoding="utf-8") as summary_out:
        summary_out.write(fs)

In [None]:
import shutil
from google.colab import files

# Folder to zip and download. This is a relative path, so it is resolved
# against the current working directory of the kernel.
# NOTE(review): the processing loop in this notebook writes its results to
# '/content/drive/MyDrive/processed_data' -- confirm this relative
# 'processed_data' is the folder that should be archived.
folder_name = 'processed_data'

# Create '<folder_name>.zip' containing the contents of the folder
shutil.make_archive(folder_name, 'zip', folder_name)

# Trigger a browser download of the zip file from the Colab runtime
files.download(f'{folder_name}.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Peek at one raw file to see its "[SN]...[SN]" marker layout.
# The context manager closes the handle even if reading fails, and the
# explicit encoding matches how process_dataset reads these files.
with open("/content/drive/MyDrive/xsum-data/10000983.summary","r", encoding="utf-8") as file:
    print(file.read())

[SN]URL[SN]
http://web.archive.org/web/20110610181825/http://www.bbc.co.uk/newsbeat/10000983

[SN]TITLE[SN]
Anger over 'US criticism' of NHS

[SN]FIRST-SENTENCE[SN]
It's being called "evil" and a "death panel" where bureaucrats decide who lives and who dies. Any ideas what it could be?

[SN]RESTBODY[SN]
Well, it's how the idea of Britain's NHS is being described over in America.
It's all part of a backlash against Barack Obama's planned changes to the US health system.
There's been an angry response from Newsbeat listeners who point out the UK is above the US in healthcare league tables.
Fiona from Derby says the NHS saved her life. "I almost died of Hodgkin's Lymphoma aged 20," she said. "I've since gone on to have two children. I'm owed their lives as much as my own."
Gareth in Hampshire has had a personal experience of the health system in the US. "I lived in the USA and the healthcare system is designed for the rich," he texted.
"My two-year-old son had to go to the emergency room 

In [None]:
import os
directory = '/content/drive/MyDrive/xsum-data'  # Make sure this is the correct path
output_dir = '/content/drive/MyDrive/processed_data'

# The output files are opened for writing, which does not create missing
# parent directories, so make sure the destination folder exists.
os.makedirs(output_dir, exist_ok=True)

# Run every regular file in `directory` through process_dataset, writing
# "<base>_summary.txt" and "<base>_article.txt" into `output_dir`.
with os.scandir(directory) as entries:  # closes the directory handle when done
  for filename in entries:
    # Sub-directories cannot be opened as text files; skip them.
    if not filename.is_file():
      continue
    # Use the entry's real name instead of parsing repr(DirEntry), which
    # breaks for names containing an apostrophe.
    fname = filename.name
    base_filename = os.path.splitext(os.path.basename(fname))[0]
    summary_filename = os.path.join(output_dir, f"{base_filename}_summary.txt")
    article_filename = os.path.join(output_dir, f"{base_filename}_article.txt")
    process_dataset(filename, article_filename, summary_filename)

In [None]:
# Show one processed article to sanity-check the cleaning step.
# process_dataset writes its outputs as UTF-8, so read it back the same way;
# the context manager closes the file even if read/print raises.
with open("/content/drive/MyDrive/processed_data/10000983_article.txt","r", encoding="utf-8") as f:
    print(f.read())

<first> it's being called "evil" and a "death panel" where bureaucrats decide who lives and who dies. any ideas what it could be? <body> well, it's how the idea of britain's nhs is being described over in america. it's all part of a backlash against barack obama's planned changes to the us health system. there's been an angry response from newsbeat listeners who point out the uk is above the us in healthcare league tables. fiona from derby says the nhs saved her life. "i almost died of hodgkin's lymphoma aged 20," she said. "i've since gone on to have two children. i'm owed their lives as much as my own." gareth in hampshire has had a personal experience of the health system in the us. "i lived in the usa and the healthcare system is designed for the rich," he texted. "my two-year-old son had to go to the emergency room for a high fever. "the hospital charged $10,000 (â£6,076). he only had three injections and a quick check up." john is a medical student at the university of manchester