# Data Segregation

By separating large diffs from small diffs, we can manage memory load and speed by using a different tokenization approach for each group.

In [1]:
import csv
import os
import sys
from tqdm.notebook import tqdm

In [None]:
# --- Configuration ---
# Input CSV to split; the segregation cell looks up a column named 'diff' in its header.
LARGE_CSV_PATH = "data/final_dataset_with_diffs.csv"
# Destination for rows whose diff is at or below the size limit.
SMALL_DIFFS_PATH = "data/small_diffs.csv"
# Destination for rows whose diff exceeds the size limit.
LARGE_DIFFS_PATH = "data/large_diffs.csv"
NUL_BYTE_LOG_PATH = "log/nul_byte_lines.txt" # File to log line numbers of skipped rows
SIZE_LIMIT_MB = 10  # Threshold in MiB; converted to bytes as SIZE_LIMIT_MB * 1024 * 1024

In [3]:
# --- Setup ---
# Byte threshold above which a diff is routed to the "large" output file.
size_limit_bytes = SIZE_LIMIT_MB * 1024 * 1024
# Increase the CSV field size limit to handle large diffs.
# csv.field_size_limit may reject sys.maxsize with OverflowError, so the
# value is divided by 10 until the call is accepted.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = int(max_int / 10)

# --- Main Segregation Loop ---
print(f"Segregating '{LARGE_CSV_PATH}'...")

small_count = 0
large_count = 0
nul_count = 0           # lines skipped because they contain a NUL byte
parse_error_count = 0   # lines skipped because csv.reader rejected them
mismatch_count = 0      # lines skipped because the column count != header

try:
    # Create the output/log directories up front; otherwise opening the
    # outputs raises FileNotFoundError, which would be indistinguishable
    # from a missing input file.
    for out_path in (SMALL_DIFFS_PATH, LARGE_DIFFS_PATH, NUL_BYTE_LOG_PATH):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # The input is opened first so that a missing input does not truncate
    # any existing output file.
    with open(LARGE_CSV_PATH, 'r', encoding='utf-8') as infile, \
         open(SMALL_DIFFS_PATH, 'w', newline='', encoding='utf-8') as small_outfile, \
         open(LARGE_DIFFS_PATH, 'w', newline='', encoding='utf-8') as large_outfile, \
         open(NUL_BYTE_LOG_PATH, 'w', encoding='utf-8') as nul_log_file:

        # Parse the header with the same csv rules as the data rows so that
        # quoted column names and the per-row column count stay consistent.
        header_line = infile.readline()
        header = next(csv.reader([header_line.lstrip('\ufeff')]), [])
        if 'diff' not in header:
            raise ValueError(f"'diff' column not found in the header of '{LARGE_CSV_PATH}'")
        diff_col_index = header.index('diff')

        small_writer = csv.writer(small_outfile)
        large_writer = csv.writer(large_outfile)
        small_writer.writerow(header)
        large_writer.writerow(header)

        # NOTE(review): rows are parsed one physical line at a time, so a
        # quoted field containing a raw newline is seen as several short
        # lines and dropped as column mismatches. Confirm that the dataset
        # escapes newlines inside diffs; mismatch_count makes this visible.
        # Data starts on line 2 because line 1 is the header.
        for line_num, line in enumerate(tqdm(infile, desc="Scanning Rows"), start=2):

            # Check for NUL bytes before handing the line to the csv parser.
            if '\x00' in line:
                nul_log_file.write(f"{line_num}\n")
                nul_count += 1
                continue

            # Only parsing errors are treated as "bad row"; write failures
            # (e.g. a full disk) propagate to the outer handler instead of
            # being silently counted as skipped rows.
            try:
                parsed_row = next(csv.reader([line]))
            except csv.Error:
                nul_log_file.write(f"{line_num}\n")
                parse_error_count += 1
                continue

            if len(parsed_row) != len(header):
                mismatch_count += 1
                continue

            diff_text = parsed_row[diff_col_index]

            # Route the row by the UTF-8 byte size of its diff.
            if len(diff_text.encode('utf-8', 'ignore')) > size_limit_bytes:
                large_writer.writerow(parsed_row)
                large_count += 1
            else:
                small_writer.writerow(parsed_row)
                small_count += 1

    print("\n--- Segregation Complete --- ✅")
    print(f"Wrote {small_count} rows to '{SMALL_DIFFS_PATH}'")
    print(f"Wrote {large_count} rows to '{LARGE_DIFFS_PATH}'")
    if nul_count > 0:
        print(f"Skipped {nul_count} problematic rows containing NUL bytes. See '{NUL_BYTE_LOG_PATH}'.")
    if parse_error_count > 0:
        print(f"Skipped {parse_error_count} rows that could not be parsed as CSV. See '{NUL_BYTE_LOG_PATH}'.")
    if mismatch_count > 0:
        print(f"Skipped {mismatch_count} rows whose column count did not match the header.")

except FileNotFoundError as err:
    if err.filename == LARGE_CSV_PATH:
        print(f"ERROR: Input file not found at '{LARGE_CSV_PATH}'")
    else:
        print(f"ERROR: Could not open '{err.filename}': {err.strerror}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Segregating 'final_dataset_with_diffs.csv'...


Scanning Rows: 0it [00:00, ?it/s]


--- Segregation Complete --- ✅
Wrote 158890 rows to 'small_diffs.csv'
Wrote 0 rows to 'large_diffs.csv'
Skipped 1186 problematic rows containing NUL bytes. See 'nul_byte_lines.txt'.


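❌ Failed: this script can't handle the NUL bytes — it only skips the affected rows instead of cleaning them, so the dataset is cleaned first in the next section.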

### Dataset cleaning

In [None]:
import csv
import os
import sys
from tqdm.notebook import tqdm

# --- Configuration ---
# Raw input CSV; read in binary mode by the cleaning cell.
LARGE_CSV_PATH = "data/final_dataset_with_diffs.csv"
# Output CSV: a copy of the input with every NUL byte removed.
CLEAN_CSV_PATH = "data/final_dataset_with_full_diffs_CLEANED.csv"
# Text log receiving one entry per input line that contained a NUL byte.
NUL_COMMIT_LOG_PATH = "log/nul_commit_hashes.log"

In [7]:
# --- Setup ---
# csv.field_size_limit may reject sys.maxsize with OverflowError, so the
# value is divided by 10 until the call is accepted.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = int(max_int / 10)

# Make sure the destination directories exist before opening the outputs.
for out_path in (CLEAN_CSV_PATH, NUL_COMMIT_LOG_PATH):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# --- Main Cleaning Loop with Error Logging ---
print(f"Starting to clean '{LARGE_CSV_PATH}' and log problematic commits...")
nul_rows_found = 0

# The file is processed as raw bytes so that NUL bytes and undecodable
# sequences never abort the copy.
with open(LARGE_CSV_PATH, 'rb') as infile, \
     open(CLEAN_CSV_PATH, 'wb') as outfile, \
     open(NUL_COMMIT_LOG_PATH, 'w', encoding='utf-8') as logfile:

    header_bytes = infile.readline()
    outfile.write(header_bytes)
    # 'utf-8-sig' drops a leading BOM so the first column name still matches.
    header_str = header_bytes.decode('utf-8-sig', 'ignore').strip()
    header_list = header_str.split(',')

    try:
        commit_hash_index = header_list.index('commit_hash')
    except ValueError:
        print("ERROR: 'commit_hash' column not found in the header.")
        raise

    for line_bytes in tqdm(infile, desc="Scanning and Cleaning"):
        if b'\x00' not in line_bytes:
            # Clean lines are copied through unchanged.
            outfile.write(line_bytes)
            continue

        nul_rows_found += 1

        # Best-effort hash extraction with a plain comma split instead of the
        # strict csv parser. The split is capped so only the leading columns
        # are separated, and only the hash field itself is decoded.
        # NOTE(review): this assumes the columns before 'commit_hash' contain
        # no commas and that the line is the first physical line of its
        # record — confirm against the dataset.
        parts = line_bytes.split(b',', commit_hash_index + 1)
        if len(parts) > commit_hash_index:
            hash_bytes = parts[commit_hash_index].replace(b'\x00', b'')
            # strip() removes the line terminator when the hash is the last
            # column, keeping the log at one entry per line.
            commit_hash = hash_bytes.decode('utf-8', 'replace').strip()
            logfile.write(f"{commit_hash}\n")
        else:
            logfile.write("Unparseable_row_short_after_split\n")

        # Remove every NUL byte and write the line to the output.
        outfile.write(line_bytes.replace(b'\x00', b''))

print("\n--- Cleaning Complete --- ✅")
print(f"Clean file saved to '{CLEAN_CSV_PATH}'.")
if nul_rows_found > 0:
    print(f"Found and cleaned {nul_rows_found} rows containing NULL bytes.")
    print(f"The commit hashes of these rows have been attempted to be saved to '{NUL_COMMIT_LOG_PATH}'.")

Starting to clean 'final_dataset_with_diffs.csv' and log problematic commits...


Scanning and Cleaning: 0it [00:00, ?it/s]


--- Cleaning Complete --- ✅
Clean file saved to 'final_dataset_with_full_diffs_CLEANED.csv'.
Found and cleaned 898 rows containing NULL bytes.
The commit hashes of these rows have been attempted to be saved to 'nul_commit_hashes.log'.


## Segregation process

In [8]:
import pandas as pd
from tqdm.notebook import tqdm
import os

# --- Configuration ---
# Paths use the data/ directory, matching the earlier configuration cells,
# so this stage reads the file the cleaning stage writes.
CLEAN_CSV_PATH = "data/final_dataset_with_full_diffs_CLEANED.csv"
SMALL_DIFFS_PATH = "data/small_diffs.csv"
LARGE_DIFFS_PATH = "data/large_diffs.csv"
SIZE_LIMIT_MB = 10  # Threshold in MiB; diffs above it go to LARGE_DIFFS_PATH
CHUNK_SIZE = 500 # Process 500 rows at a time

In [9]:
# --- Setup ---
# Byte threshold above which a diff is routed to the "large" output file.
size_limit_bytes = SIZE_LIMIT_MB * 1024 * 1024

print(f"Segregating '{CLEAN_CSV_PATH}' using pandas chunking...")

try:
    # With chunksize set, read_csv returns an iterator of DataFrames, so the
    # whole file is never held in memory at once.
    csv_reader = pd.read_csv(CLEAN_CSV_PATH, chunksize=CHUNK_SIZE)

    # Use tqdm to track progress over the chunks
    for i, chunk_df in enumerate(tqdm(csv_reader, desc="Processing Chunks")):
        if i == 0:
            # Start both outputs from scratch with a header row. Writing the
            # header here (rather than with the first chunk's data) means a
            # file whose first rows arrive in a later chunk still gets its
            # header, and re-running the cell does not append duplicates.
            header_only = chunk_df.iloc[0:0]
            header_only.to_csv(SMALL_DIFFS_PATH, mode='w', header=True, index=False)
            header_only.to_csv(LARGE_DIFFS_PATH, mode='w', header=True, index=False)

        # UTF-8 byte size of each diff in the chunk (missing values are
        # stringified by astype(str) before measuring).
        diff_sizes = chunk_df['diff'].astype(str).str.encode('utf-8', 'ignore').str.len()

        # Split the chunk into small and large diffs based on the size limit
        large_df = chunk_df[diff_sizes > size_limit_bytes]
        small_df = chunk_df[diff_sizes <= size_limit_bytes]

        # Headers are already in place, so data rows are always appended bare.
        if not small_df.empty:
            small_df.to_csv(SMALL_DIFFS_PATH, mode='a', header=False, index=False)

        if not large_df.empty:
            large_df.to_csv(LARGE_DIFFS_PATH, mode='a', header=False, index=False)

    print("\n--- Segregation Complete --- ✅")
    print(f"Small diffs saved to '{SMALL_DIFFS_PATH}'")
    print(f"Large diffs saved to '{LARGE_DIFFS_PATH}'")

except FileNotFoundError:
    print(f"ERROR: Clean file not found at '{CLEAN_CSV_PATH}'. Please ensure the cleaning step was completed successfully.")
except Exception as e:
    print(f"An unexpected error occurred during segregation: {e}")

Segregating 'final_dataset_with_full_diffs_CLEANED.csv' using pandas chunking...


Processing Chunks: 0it [00:00, ?it/s]


--- Segregation Complete --- ✅
Small diffs saved to 'small_diffs.csv'
Large diffs saved to 'large_diffs.csv'
