In [18]:
import csv
import pandas as pd
import os

def split_csv(input_file, output_prefix, rows_per_file, output_dir="good_data"):
    """Split a CSV file into numbered chunk files, repeating the header in each.

    Chunks are written to ``output_dir`` as ``{output_prefix}_{n}.csv`` with
    ``n`` counting up from 1. Every chunk starts with the header row of the
    input and holds at most ``rows_per_file`` data rows. An input with no
    rows at all, or with only a header row, produces no chunk files.

    Args:
        input_file: The path to the input CSV file.
        output_prefix: The prefix for the output file names.
        rows_per_file: The maximum number of data rows per output file.
            Must be at least 1.
        output_dir: The folder the chunk files are written to. It is created
            (including parents) when missing. Defaults to ``"good_data"``.

    Raises:
        ValueError: If ``rows_per_file`` is less than 1.
    """
    # A non-positive chunk size would otherwise never trigger a rollover and
    # silently write every row into a single file.
    if rows_per_file < 1:
        raise ValueError(f"rows_per_file must be at least 1, got {rows_per_file!r}")

    os.makedirs(output_dir, exist_ok=True)

    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back unchanged.
    with open(input_file, 'r', newline='') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)  # None signals a completely empty file
        if header is None:
            return

        output_file = None
        writer = None
        row_count = 0
        file_count = 1
        try:
            for row in reader:
                if row_count == 0:
                    # Start a new chunk lazily, only once there is a row for it.
                    output_path = os.path.join(output_dir, f"{output_prefix}_{file_count}.csv")
                    output_file = open(output_path, 'w', newline='')
                    writer = csv.writer(output_file)
                    writer.writerow(header)
                writer.writerow(row)
                row_count += 1
                if row_count >= rows_per_file:
                    output_file.close()
                    output_file = None
                    row_count = 0
                    file_count += 1
        finally:
            # Close a partially filled chunk, also when a read/write error
            # interrupts the loop.
            if output_file is not None:
                output_file.close()

def create_directory(directory_name):
    """Create a directory (and any missing parents) if it doesn't exist.

    Prints a one-line message saying whether the directory was created or
    was already present.

    Args:
        directory_name: The path of the directory to create.

    Raises:
        FileExistsError: If ``directory_name`` exists but is not a directory.
    """
    # Attempt the creation directly instead of checking first, so nothing can
    # create the path between a check and the mkdir call.
    try:
        os.makedirs(directory_name)
    except FileExistsError:
        # A regular file at this path is not a usable directory; surface it
        # rather than reporting success.
        if not os.path.isdir(directory_name):
            raise
        print(f"Directory '{directory_name}' already exists.")
    else:
        print(f"Directory '{directory_name}' created successfully.")


In [26]:
# Make sure the "good_data" folder that split_csv writes into is present.
create_directory("good_data")

Directory 'good_data' already exists.


In [27]:
# Example usage: prepare the output folders, then split the test data
# into chunks of 1000 rows each.
#file_ = pd.read_csv('../data/fraudTrain.csv')
create_directory("good_data")
create_directory("bad_data")

input_file = '../data/testData.csv'
output_prefix = 'split_data'
rows_per_file = 1000

split_csv(
    input_file=input_file,
    output_prefix=output_prefix,
    rows_per_file=rows_per_file,
)

Directory 'good_data' already exists.
Directory 'bad_data' already exists.
