In [1]:
import pandas as pd
import os
from tqdm import tqdm
from loguru import logger
from typing import List

### Configuration


In [None]:
# Mirror log records to a file (rotated at 500 MB) so that each run can be
# traced and debugged afterwards.
# Re-running this cell would otherwise register a second file handler and
# duplicate every line in the log file, so the handler added by a previous
# execution of this cell is removed before a new one is added.
LOG_FILE = "../logs/missing_value_analysis.log"
_previous_handler_id = globals().get("_LOG_HANDLER_ID")
if _previous_handler_id is not None:
    try:
        logger.remove(_previous_handler_id)
    except ValueError:
        # The handler was already removed elsewhere; nothing to clean up.
        pass
_LOG_HANDLER_ID = logger.add(LOG_FILE, rotation="500 MB")

1

### Missing value analysis function

In [3]:
# --- Missing Value Analysis ---
def analyze_missing_values(file_path: str, chunk_size: int = 10000) -> None:
    """Log the number of missing (NA) values per column of a CSV file.

    The file is read in chunks of ``chunk_size`` rows. Per-column NA counts
    are accumulated across chunks, and the columns that have at least one
    missing value are written to the log together with the number of rows
    that were read.

    Args:
        file_path: Path of the CSV file to analyse.
        chunk_size: Number of rows read per chunk.

    Returns:
        None. Results and problems are reported through ``logger``; any
        ``Exception`` raised while reading or aggregating is caught and
        logged instead of being propagated.
    """
    # isfile (rather than exists) so that a directory path is reported as a
    # missing file instead of failing later inside pandas.
    if not os.path.isfile(file_path):
        logger.error(f"Error: File not found at the specified path: {file_path}")
        logger.error(
            "Please update the 'file_path' variable with the correct location of your dataset."
        )
        return

    try:
        logger.info(f"Starting missing value analysis for {file_path}...")
        # Running per-column NA totals; stays None until a chunk has been read.
        total_missing_counts = None
        # Number of rows read so far (not the number of individual cells).
        total = 0

        # The 'with' statement ensures the underlying file handle is closed.
        with pd.read_csv(
            file_path, chunksize=chunk_size, low_memory=False
        ) as chunk_iterator:
            for chunk in tqdm(chunk_iterator, desc="Processing chunks"):
                na_counts = chunk.isnull().sum()
                total += chunk.shape[0]
                # Keep a running sum instead of one Series per chunk, so the
                # memory used does not grow with the number of chunks.
                if total_missing_counts is None:
                    total_missing_counts = na_counts
                else:
                    total_missing_counts = total_missing_counts.add(
                        na_counts, fill_value=0
                    )

        # --- Aggregation and Display ---
        # A header-only file may yield a single chunk with zero rows, so the
        # row count is checked as well.
        if total_missing_counts is None or total == 0:
            logger.warning("File is empty or no data was processed.")
            return

        # Keep only the columns that have missing values, as integer counts.
        missing_columns = total_missing_counts[total_missing_counts > 0].astype("int64")

        if missing_columns.empty:
            # Avoid logging the raw "Series([], )" repr of an empty result.
            logger.info("Total Missing Values per Column:\nNo missing values found.")
        else:
            logger.info(
                "Total Missing Values per Column:\n" + missing_columns.to_string()
            )
        logger.info(f"Total processed Values: {total}")

        # Optional: Save the missing value summary to a file in the 'data' directory
        # output_path = "data/missing_values_summary.csv"
        # missing_columns.to_csv(output_path)
        # logger.info(f"Missing value summary saved to {output_path}")

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"An error occurred while processing the file: {e}")

In [4]:
# Run the missing-value analysis on this CSV with the default chunk size.
file_path: str = "data/final_dataset_with_diffs.csv"
analyze_missing_values(file_path=file_path)

[32m2025-08-19 11:55:34.319[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/final_dataset_with_diffs.csv...[0m
Processing chunks: 13it [09:04, 41.91s/it]
[32m2025-08-19 12:04:39.260[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
diff    482[0m
[32m2025-08-19 12:04:39.279[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m39[0m - [1mTotal processed Values: 125825[0m


In [5]:
# Run the missing-value analysis on the "CLEANED" full-diffs CSV with the
# default chunk size.
file_path: str = "data/final_dataset_with_full_diffs_CLEANED.csv"
analyze_missing_values(file_path=file_path)

[32m2025-08-19 12:04:39.776[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/final_dataset_with_full_diffs_CLEANED.csv...[0m
Processing chunks: 13it [10:00, 46.20s/it]
[32m2025-08-19 12:14:40.489[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
diff    482[0m
[32m2025-08-19 12:14:40.502[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m39[0m - [1mTotal processed Values: 125825[0m


In [6]:
# Run the missing-value analysis on the embedding dataset CSV with the
# default chunk size.
file_path: str = "data/final_embedding_dataset.csv"
analyze_missing_values(file_path=file_path)

[32m2025-08-19 12:14:40.907[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/final_embedding_dataset.csv...[0m
Processing chunks: 13it [01:37,  7.52s/it]
[32m2025-08-19 12:16:18.677[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
commit_hash                                100246
author_email                               100246
commit_date                                100246
lines_added                                100246
lines_deleted                              100246
files_changed                              100246
num_modified_subsystems                    100246
num_modified_dirs                          100246
entropy                                    100246
previous_total_size                        100246
author_total_commits                       100246
time_since_last_commit                     100246
recen

In [7]:
# Run the missing-value analysis on the small-diffs CSV with the default
# chunk size.
file_path: str = "data/small_diffs.csv"
analyze_missing_values(file_path=file_path)

[32m2025-08-19 12:22:09.620[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/small_diffs.csv...[0m
Processing chunks: 13it [04:00, 18.54s/it]
[32m2025-08-19 12:26:10.646[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
diff    482[0m
[32m2025-08-19 12:26:10.652[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m39[0m - [1mTotal processed Values: 125346[0m


In [11]:
# Run the missing-value analysis on this CSV, reading only 10 rows per chunk
# instead of the default.
# NOTE(review): presumably the rows in this file are very large — confirm.
file_path: str = "data/headerlarge.csv"
analyze_missing_values(file_path=file_path, chunk_size=10)

[32m2025-08-19 12:43:31.379[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/headerlarge.csv...[0m
Processing chunks: 48it [03:43,  4.66s/it]
[32m2025-08-19 12:47:15.356[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
Series([], )[0m
[32m2025-08-19 12:47:15.363[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m39[0m - [1mTotal processed Values: 478[0m


In [12]:
# Run the missing-value analysis on the large-diffs CSV, reading only 10 rows
# per chunk instead of the default.
# NOTE(review): presumably the rows in this file are very large — confirm.
file_path: str = "data/large_diffs.csv"
analyze_missing_values(file_path=file_path, chunk_size=10)

[32m2025-08-19 12:47:54.645[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m5[0m - [1mStarting missing value analysis for data/large_diffs.csv...[0m
Processing chunks: 48it [03:35,  4.49s/it]
[32m2025-08-19 12:51:30.653[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m36[0m - [1mTotal Missing Values per Column:
Series([], )[0m
[32m2025-08-19 12:51:30.655[0m | [1mINFO    [0m | [36m__main__[0m:[36manalyze_missing_values[0m:[36m39[0m - [1mTotal processed Values: 478[0m
