In [None]:
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import math

# Filter a fixed window of the cleaned dataset down to the rows flagged as
# bug-introducing and save them to a separate CSV.

# Use pathlib for better path management
input_file = Path("../data/final_dataset_with_full_diffs_CLEANED.csv")
output_file = Path("../data/filtered_output.csv")

# Ensure the output directory exists
output_file.parent.mkdir(parents=True, exist_ok=True)

# --- Parameters ---
chunk_size = 100  # Rows per chunk; adjust based on available memory
skip_rows = 8000  # Number of data rows to skip right after the header
process_limit = 15000  # Maximum number of data rows to read after skipping

# --- Data Processing ---
filtered_rows = []

# Upper bound on the number of chunks, used only as the tqdm total. If the
# file holds fewer than skip_rows + process_limit data rows, fewer chunks
# arrive and the progress bar stops short of 100%.
total_chunks = math.ceil(process_limit / chunk_size)

# Read only the desired window of data instead of skipping rows manually in
# the loop. range(1, skip_rows + 1) drops file lines 1..skip_rows (0-indexed)
# and keeps line 0, so the header is still parsed as the column names.
# The reader is used as a context manager so the underlying file handle is
# closed even if the loop raises (e.g. the filter column is missing).
with pd.read_csv(
    input_file,
    chunksize=chunk_size,
    skiprows=range(1, skip_rows + 1),
    nrows=process_limit,  # Limit the number of rows read after skipping
    low_memory=False,  # Avoid mixed dtype inference while parsing
) as chunk_iterator:
    print(
        f"Processing {process_limit} rows from '{input_file}' starting after {skip_rows} rows..."
    )

    # Use tqdm to create a progress bar
    for chunk in tqdm(chunk_iterator, total=total_chunks, desc="Filtering chunks"):
        # Keep only rows whose flag equals True (missing values do not match)
        filtered_chunk = chunk[chunk["is_bug_introducing"].eq(True)]
        if not filtered_chunk.empty:
            filtered_rows.append(filtered_chunk)

# --- Save Results ---
if filtered_rows:
    # Concatenate all filtered chunks into a single DataFrame
    result_df = pd.concat(filtered_rows, ignore_index=True)

    # Save the filtered DataFrame to a new CSV file
    result_df.to_csv(output_file, index=False)

    print(f"\nSuccessfully saved {len(result_df)} filtered rows to '{output_file}'")
else:
    # Nothing is written in this branch; a file left at output_file by an
    # earlier run is not removed.
    print("\nNo rows matched the filter condition. No output file was created.")


Processing 15000 rows from 'data\final_dataset_with_full_diffs_CLEANED.csv' starting after 8000 rows...


Filtering chunks:   0%|          | 0/150 [00:00<?, ?it/s]


Successfully saved 5412 filtered rows to 'data\filtered_output.csv'


In [None]:
import pandas as pd

# Create a small sample of the cleaned dataset by copying its leading rows
# into a separate CSV file.

# --- Configuration (on your local machine) ---
CLEAN_SOURCE_CSV = "../data/final_dataset_with_full_diffs_CLEANED.csv"  # Large, cleaned source file
DATA_SUBSET_PATH = "../data/data_subset_10k.csv"  # Small sample file written below
NUM_ROWS = 10000  # Number of leading data rows to copy

print(f"Reading the first {NUM_ROWS} rows from '{CLEAN_SOURCE_CSV}'...")

# Parse at most NUM_ROWS data rows from the top of the source file, then
# write them out without the DataFrame index column.
df_subset = pd.read_csv(filepath_or_buffer=CLEAN_SOURCE_CSV, nrows=NUM_ROWS)
df_subset.to_csv(path_or_buf=DATA_SUBSET_PATH, index=False)

# Report completion; one print call emits the same three lines as before.
print(
    "--- Success! ---",
    f"A small sample file has been created at: '{DATA_SUBSET_PATH}'",
    "You can now upload this small file to RunPod.",
    sep="\n",
)

Reading the first 10000 rows from './data/final_dataset_with_full_diffs_CLEANED.csv'...
--- Success! ---
A small sample file has been created at: './data/data_subset_10k.csv'
You can now upload this small file to RunPod.
