# Split the Data

In [3]:
# Splitting the dataset into parts lets it be processed in batches, possibly across multiple kernels or devices

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os

# Read only the date, ticker symbol, and the four summary columns from the raw CSV.
data_path = "news_data/direct_data/nasdaq_external_data.csv"
wanted_columns = [
    "Date",
    "Stock_symbol",
    "Lsa_summary",
    "Luhn_summary",
    "Textrank_summary",
    "Lexrank_summary",
]
df = pd.read_csv(data_path, usecols=wanted_columns, dtype=np.bytes_)

# Shrink the frame before writing it out: rows with any missing value and
# exact duplicate rows are removed. The element counts (rows x columns)
# printed before and after show how much usable data is actually present.
initial = df.size
print(f"Initial number of elements: {initial}")

df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

remaining = df.size
print(f"Post drop number of elements: {remaining}")
print(f"Overall number of elements dropped (including missing elements): {initial - remaining}")

# Make sure the destination folder for the parts exists.
output_dir = "news_data/splits"
os.makedirs(output_dir, exist_ok=True)

# Cut the cleaned frame into num_splits consecutive row ranges.
num_splits = 5
total_rows = len(df)
split_size = total_rows // num_splits

# Row offsets where each part begins, followed by the overall end offset.
# Every part holds split_size rows except the last one, which also takes
# whatever remainder the integer division left over.
boundaries = [part * split_size for part in range(num_splits)]
boundaries.append(total_rows)

row_ranges = zip(boundaries[:-1], boundaries[1:])
for part_no, (first_row, stop_row) in tqdm(enumerate(row_ranges, start=1), total=num_splits):
    part_file = os.path.join(output_dir, f"nasdaq_external_{part_no}.csv")
    # Positional row slice; the index is not written to the CSV.
    df.iloc[first_row:stop_row].to_csv(part_file, index=False)

print("Split complete ✅")

Initial number of elements: 93295794
Post drop number of elements: 14861412
Overall number of elements dropped (including missing elements): 78434382


  0%|          | 0/5 [00:00<?, ?it/s]

Split complete ✅
