### Downloading Dataset from HF

In [1]:
#fetch the dataset from the hugging face hub via the `datasets` library
from datasets import load_dataset

#hub identifier of the dataset used throughout this notebook
DATASET_NAME = "multi_news"
dataset = load_dataset(DATASET_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 3.83k/3.83k [00:00<00:00, 20.8MB/s]
Downloading metadata: 100%|██████████| 2.82k/2.82k [00:00<00:00, 13.8MB/s]
Downloading readme: 100%|██████████| 10.6k/10.6k [00:00<00:00, 22.6MB/s]
Downloading data: 100%|██████████| 548M/548M [00:53<00:00, 10.3MB/s]
Downloading data: 100%|██████████| 58.8M/58.8M [00:06<00:00, 9.07MB/s]
Downloading data: 100%|██████████| 66.9M/66.9M [00:07<00:00, 9.11MB/s]
Downloading data: 100%|██████████| 7.30M/7.30M [00:01<00:00, 4.18MB/s]
Downloading data: 100%|██████████| 69.0M/69.0M [00:17<00:00, 3.87MB/s]
Downloading data: 100%|██████████| 7.31M/7.31M [00:00<00:00, 9.86MB/s]
Downloading data files: 100%|██████████| 3/3 [01:41<00:00, 33.80s/it]
Generating train split: 100%|██████████| 44972/44972 [00:02<00:00, 19747.53 examples/s]
Generating validation split: 100%|██████████| 5622/5622 [00:00<00:00, 20398.41 examples/s]
Generating test split: 100%|██████████| 5622/562

In [9]:
import pandas as pd

#columns copied from every split into its DataFrame
text_columns = ('document', 'summary')

#one DataFrame per split (train, validation, test)
train_df = pd.DataFrame({col: dataset['train'][col] for col in text_columns})
validation_df = pd.DataFrame({col: dataset['validation'][col] for col in text_columns})
test_df = pd.DataFrame({col: dataset['test'][col] for col in text_columns})

#destination of each CSV export (current working directory)
train_csv_path = 'train.csv'
validation_csv_path = 'validation.csv'
test_csv_path = 'test.csv'

#write every split without the pandas index column
for frame, csv_path in (
    (train_df, train_csv_path),
    (validation_df, validation_csv_path),
    (test_df, test_csv_path),
):
    frame.to_csv(csv_path, index=False)

print(f"Train data saved to {train_csv_path}")
print(f"Validation data saved to {validation_csv_path}")
print(f"Test data saved to {test_csv_path}")

Train data saved to train.csv
Validation data saved to validation.csv
Test data saved to test.csv


### Converting Data To Suitable Format

In [10]:
#function to split documents and calculate length
def split_and_count(documents: str, delimiter: str = "|||||"):
    """Split a concatenated multi-document string and count its parts.

    Parameters
    ----------
    documents : str
        Several source documents joined into one string by ``delimiter``.
    delimiter : str, optional
        Separator placed between documents. Defaults to ``"|||||"``, so
        existing single-argument callers (e.g. ``Series.map``) are unaffected.

    Returns
    -------
    tuple
        ``(documents_list, num_documents)`` where ``documents_list`` is the
        list of segments with surrounding whitespace stripped and
        ``num_documents`` is ``len(documents_list)``.

    Raises
    ------
    ValueError
        If ``delimiter`` is an empty string (raised by ``str.split``).

    Notes
    -----
    Empty segments are kept and counted: an empty input yields ``([""], 1)``
    and a trailing delimiter adds a final ``""`` entry.
    """
    #split documents by the delimiter and strip whitespace
    documents_list = [doc.strip() for doc in documents.split(delimiter)]
    #the count is simply the number of segments produced by the split
    return documents_list, len(documents_list)

In [11]:
import os


def _to_multi_document_frame(split_df):
    """Return a frame with columns ``documents``, ``num_documents``, ``summary``.

    Each row's raw ``document`` string is parsed with ``split_and_count``.
    The input frame is not modified; a new frame is returned. A frame that
    no longer has a ``document`` column is treated as already converted and
    only has its columns re-selected, so re-running this cell does not raise
    ``KeyError``.
    """
    if 'document' in split_df.columns:
        #each element of `parsed` is the (documents_list, num_documents) tuple
        parsed = split_df['document'].map(split_and_count)
        split_df = split_df.drop(columns=['document']).assign(
            documents=parsed.map(lambda pair: pair[0]),
            num_documents=parsed.map(lambda pair: pair[1]),
        )
    return split_df[['documents', 'num_documents', 'summary']]


#apply the split_and_count function to the train, validation, and test splits
train_df = _to_multi_document_frame(train_df)
validation_df = _to_multi_document_frame(validation_df)
test_df = _to_multi_document_frame(test_df)

#define file paths for CSV files
train_csv_path = 'multi_news/train.csv'
validation_csv_path = 'multi_news/validation.csv'
test_csv_path = 'multi_news/test.csv'

#to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(train_csv_path), exist_ok=True)

#save the modified DataFrames as CSV files
#NOTE: CSV has no list type, so each `documents` list is written as its string
#representation; reading the file back yields strings, not lists
train_df.to_csv(train_csv_path, index=False)
validation_df.to_csv(validation_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print(f"Train data saved to {train_csv_path}")
print(f"Validation data saved to {validation_csv_path}")
print(f"Test data saved to {test_csv_path}")

Train data saved to multi_news/train.csv
Validation data saved to multi_news/validation.csv
Test data saved to multi_news/test.csv


### Sample File Generation

In [15]:
import pandas as pd

#locations of the full CSV files written earlier
train_csv_path = 'multi_news/train.csv'
validation_csv_path = 'multi_news/validation.csv'
test_csv_path = 'multi_news/test.csv'

#locations of the down-sampled copies
sample_train_csv_path = 'multi_news/sample_train.csv'
sample_validation_csv_path = 'multi_news/sample_validation.csv'
sample_test_csv_path = 'multi_news/sample_test.csv'

#reload every split from disk
train_df = pd.read_csv(train_csv_path)
validation_df = pd.read_csv(validation_csv_path)
test_df = pd.read_csv(test_csv_path)

#draw a fixed-size random subset of each split; the shared seed keeps the
#selection reproducible across runs
sampling_seed = 42
train_sample_size = 1000
eval_sample_size = 250
sample_train_df = train_df.sample(n=train_sample_size, random_state=sampling_seed)
sample_validation_df = validation_df.sample(n=eval_sample_size, random_state=sampling_seed)
sample_test_df = test_df.sample(n=eval_sample_size, random_state=sampling_seed)

#write the sampled DataFrames as CSV files, without the pandas index column
for sampled_frame, destination in (
    (sample_train_df, sample_train_csv_path),
    (sample_validation_df, sample_validation_csv_path),
    (sample_test_df, sample_test_csv_path),
):
    sampled_frame.to_csv(destination, index=False)

print(f"Sample training data saved to {sample_train_csv_path}")
print(f"Sample validation data saved to {sample_validation_csv_path}")
print(f"Sample test data saved to {sample_test_csv_path}")

Sample training data saved to multi_news/sample_train.csv
Sample validation data saved to multi_news/sample_validation.csv
Sample test data saved to multi_news/sample_test.csv
