In [1]:
import pandas as pd
import os

def split_csv_file(input_filepath: str, output_directory: str, chunk_size: int = 20000,
                   file_prefix: str = 'patientDEMOGRAPHICS-part-') -> list:
    """
    Splits a large CSV file into smaller CSV files, each with a specified number of rows.

    The input is streamed in chunks so the whole file is never held in memory.
    Every output file starts with the header row and is named
    '<file_prefix><k>.csv', where k counts up from 1.

    Cell values are read as plain text with missing-value detection disabled,
    so they are written back as they appeared in the input (leading zeros,
    empty fields and tokens such as 'NA' are not reinterpreted).

    Args:
        input_filepath (str): The path to the input CSV file.
        output_directory (str): The directory where the split files will be saved.
            It is created if it does not exist.
        chunk_size (int): The number of rows for each split file (excluding the header).
        file_prefix (str): Leading part of every output file name.

    Returns:
        list: Paths of the files that were written, in the order they were created.
    """

    # Create the output directory if it doesn't exist. exist_ok=True keeps this
    # safe if another process creates the directory between the check and the call.
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)
        print(f"Created output directory: {output_directory}")

    print(f"Splitting '{input_filepath}' into chunks of {chunk_size} rows...")
    written_files = []

    # dtype=str + na_filter=False: a splitter must not change the data. With
    # pandas' default type inference each chunk is typed independently, so the
    # same column could be written as "5" in one file and "5.0" in another.
    reader = pd.read_csv(input_filepath, chunksize=chunk_size, dtype=str, na_filter=False)
    try:
        for part_number, chunk in enumerate(reader, start=1):
            output_filename = os.path.join(output_directory, f'{file_prefix}{part_number}.csv')

            # to_csv writes the column names as the first row by default, so
            # every part file carries its own header.
            chunk.to_csv(output_filename, index=False)

            print(f"Saved {len(chunk)} rows to '{output_filename}'")
            written_files.append(output_filename)
    finally:
        # Release the input file handle even if writing a chunk fails.
        reader.close()

    print("CSV splitting complete!")
    return written_files

# --- Settings for this run ---
rows_per_file = 20000  # data rows per output file (the header row is not counted)
input_csv_file = 'Data/patientDEMOGRAPHICS-11th-July-2025.csv'
output_dir = 'Data/SplitFiles'  # split_csv_file creates this directory when it is missing

# --- Split the source file ---
split_csv_file(
    input_filepath=input_csv_file,
    output_directory=output_dir,
    chunk_size=rows_per_file,
)

Created output directory: Data/SplitFiles
Splitting 'Data/patientDEMOGRAPHICS-11th-July-2025.csv' into chunks of 20000 rows...
Saved 20000 rows to 'Data/SplitFiles\patientDEMOGRAPHICS-part-1.csv'
Saved 20000 rows to 'Data/SplitFiles\patientDEMOGRAPHICS-part-2.csv'
Saved 20000 rows to 'Data/SplitFiles\patientDEMOGRAPHICS-part-3.csv'
Saved 7378 rows to 'Data/SplitFiles\patientDEMOGRAPHICS-part-4.csv'
CSV splitting complete!
