In [None]:
import pandas as pd
import os
import math
import json

class DataSplitter:
    """Helpers for breaking a large newline-delimited JSON file into CSV parts (pandas)."""

    @staticmethod
    def split_ndjson_to_csv(
        input_path: str,
        output_dir: str,
        chunk_size: int = 300000,
    ):
        """Split an NDJSON file into numbered CSV files of at most ``chunk_size`` rows.

        Every non-blank line of ``input_path`` is parsed as one JSON record. All
        records are loaded into a single DataFrame, which is then written out as
        ``part_1.csv``, ``part_2.csv``, ... inside ``output_dir``.

        Args:
            input_path: Path to the UTF-8 encoded NDJSON file to read.
            output_dir: Directory that receives the CSV parts; created if missing.
            chunk_size: Maximum number of rows per CSV file. Must be positive.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative.
        """
        # Fail fast: zero used to surface as ZeroDivisionError only after the whole
        # file had been parsed, and a negative value silently wrote nothing while
        # reporting a negative file count.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        with open(input_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline) are skipped, not parsed.
            data = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(data)

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        num_chunks = math.ceil(len(df) / chunk_size)
        for i in range(num_chunks):
            start = i * chunk_size
            # iloc clamps the end of the slice, so the last part may be shorter.
            chunk = df.iloc[start:start + chunk_size]
            output_path = os.path.join(output_dir, f"part_{i+1}.csv")
            chunk.to_csv(output_path, index=False)
        print(f"Saved {num_chunks} CSV files in {output_dir}")

In [None]:
# Split the NDJSON event file into CSV parts of up to 300,000 rows each,
# written to ./Bigdataset/BreakData.
DataSplitter.split_ndjson_to_csv(
    './Bigdataset/mini_sparkify_event_data.json',
    './Bigdataset/BreakData',
    chunk_size=300000,
)

<a href="https://colab.research.google.com/github/NolanMM/PROG74000-25W-Sec1-Applications-of-Artificial-Intelligence-and-Machine-Learning/blob/5-Final-Project/Project/Break_Large_File_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install polars[gpu]

In [1]:
# Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive are available to the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import polars as pl
import os
import math

class DataSplitter:
    """Helpers for breaking a large newline-delimited JSON file into CSV parts (Polars)."""

    @staticmethod
    def split_ndjson_to_csv(
        input_path: str,
        output_dir: str,
        chunk_size: int = 300000,
        engine: str = "gpu"
    ):
        """Split an NDJSON file into numbered CSV files of at most ``chunk_size`` rows.

        The file is scanned lazily with ``pl.scan_ndjson`` and then fully
        materialised with ``collect(engine=engine)``. The resulting frame is
        written out as ``part_1.csv``, ``part_2.csv``, ... inside ``output_dir``.

        Args:
            input_path: Path to the NDJSON file to read.
            output_dir: Directory that receives the CSV parts; created if missing.
            chunk_size: Maximum number of rows per CSV file. Must be positive.
            engine: Value forwarded to ``LazyFrame.collect(engine=...)``.
                NOTE(review): the default "gpu" presumably needs the
                ``polars[gpu]`` extra installed in the kernel - confirm.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative.
        """
        # Validate before the (potentially expensive) collect: zero would end in a
        # ZeroDivisionError and a negative value would silently write nothing.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        df = pl.scan_ndjson(input_path)
        result = df.collect(engine=engine)

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        num_chunks = math.ceil(result.height / chunk_size)
        for i in range(num_chunks):
            # slice() truncates at the end of the frame, so the last part may be shorter.
            chunk = result.slice(i * chunk_size, chunk_size)
            chunk.write_csv(os.path.join(output_dir, f"part_{i+1}.csv"))

        print(f"✅ Saved {num_chunks} CSV files in {output_dir}")


✅ Saved 88 CSV files in /content/drive/MyDrive/PROG74000-25W-Project/Break_data


In [None]:
# Split the NDJSON file stored on the mounted Drive into CSV parts of up to
# 300,000 rows each, written to the Break_data folder next to it.
DataSplitter.split_ndjson_to_csv(
    '/content/drive/MyDrive/PROG74000-25W-Project/bigdata.json',
    '/content/drive/MyDrive/PROG74000-25W-Project/Break_data',
    chunk_size=300000,
)