# Data Cleanup & Additional Metrics

This notebook contains all the steps necessary to associate the classes and methods found in the previously collected UND data with their files, and to identify the files that contain bugs.

## 1. Processing UND Data

First, we will remove all the class and method entries from each CSV, keeping only the file entries, and add a 'Bug' column initialized to 0.

In [52]:
import os
import glob
import csv
import pandas as pd
from pathlib import Path
from collections import defaultdict

To simplify directory changes, we'll initialize two variables containing the path of this repository and the path of your clone of the Apache Hive repository.

In [None]:
# Root of this project's repository; later cells resolve the UND_hive_* data
# folders and Hive_Modified_Files.csv relative to it.
# NOTE(review): machine-specific absolute path — adjust to your own checkout before running.
project_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/")
# Local clone of the Apache Hive repository (also machine-specific).
hive_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive/")

In [17]:
def process_csv(file_path):
    """
    Reduce a raw UND export to its file-level rows.

    Reads the CSV at `file_path`, keeps only the rows whose 'Kind' is 'File',
    removes the 'Kind' and 'Entity_Uniquename' columns, adds a 'Bug' column
    set to 0, renames 'Name' to 'FileName', and returns the frame with
    'Bug' and 'FileName' as the two leading columns.
    """
    raw = pd.read_csv(file_path)

    # Keep file entities only, then discard the identification columns.
    files_only = raw.loc[raw['Kind'] == 'File'].drop(columns=['Kind', 'Entity_Uniquename'])

    # Every file starts out flagged as bug-free.
    files_only.insert(0, 'Bug', 0)
    files_only = files_only.rename(columns={'Name': 'FileName'})

    leading = ['Bug', 'FileName']
    trailing = [name for name in files_only.columns if name not in leading]
    return files_only[leading + trailing]

# Resolve the input/output folders relative to the directory the notebook runs from.
current_repo = os.getcwd()
# glob returns matches in arbitrary order; sorting makes the processing order
# (and the printed list below) identical on every run.
input_files = sorted(glob.glob(os.path.join(current_repo, 'UND_hive_data', '*.csv')))
output_dir = os.path.join(current_repo, 'UND_hive_processed_data')

os.makedirs(output_dir, exist_ok=True)
output_files = []

for file_path in input_files:
    processed_df = process_csv(file_path)

    # Replace only the trailing extension: str.replace('.csv', ...) would also
    # rewrite a '.csv' that happens to appear in the middle of the file name.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_file = os.path.join(output_dir, f"{stem}_processed.csv")
    processed_df.to_csv(output_file, index=False)
    output_files.append(output_file)

print("Processing complete. Files saved:", output_files)

Processing complete. Files saved: ['/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_2.3.3_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_3.0.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_4.0.1_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_2.3.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_2.3.10_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_3.1.2_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_3.1.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_2.3.4_processed.csv', '/home/nicolas-richa

## 2. Identify Files with Bugs

For each of the affected files identified in the previous notebook, we will set the 'Bug' column to '1'.

In [18]:
# Switch the working directory to the project repository so the relative file
# names used below (input CSV, Unfound_Files.csv) resolve inside it.
os.chdir(project_repo)
current_repo = os.getcwd()
# Input CSV; each data row is read below as (bug id, version, ';'-separated file list).
input_file = 'Hive_Modified_Files.csv'

# Parallel lists filled while parsing: one entry per (bug, version, affected file).
bug_ids = []
versions = []
file_names = []

def normalize_path(path):
    """Return `path` collapsed by os.path.normpath and converted to lower case."""
    collapsed = os.path.normpath(path)
    return collapsed.lower()

# Expand every bug row of the input CSV into one record per affected file.
with open(input_file, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    headers = next(reader, None)  # consume the header line; it is not parsed as data
    for row in reader:
        # Only rows with at least three fields are usable.
        if len(row) >= 3:
            bug_id, version = row[0].strip(), row[1].strip()
            for affected_file in (part.strip() for part in row[2].split(';')):
                if not affected_file:
                    # Skip empty fragments (e.g. produced by a trailing ';').
                    continue
                bug_ids.append(bug_id)
                versions.append(version)
                file_names.append(affected_file)

output_df = pd.DataFrame({'Bug ID': bug_ids, 'Version': versions, 'File': file_names})

print("Parsed data from input file:")
print(output_df)

# Entries that could not be matched, kept as parallel lists and exported at the end.
unfound_bug_ids = []
unfound_versions = []
unfound_file_names = []

found_files = 0
unfound_files = 0

# Each processed CSV is parsed once and kept here (path -> list of rows) instead
# of being re-read and re-written for every single affected file.
rows_by_path = {}
# Paths whose cached rows were changed and therefore need to be written back.
modified_paths = set()


def _loosely_matches(target, candidate):
    """
    Return True when the normalized target path and candidate file name match
    under the substring heuristics used by this notebook.

    An empty needle is never accepted: '' is a substring of every string, so a
    name or path starting with '.' (empty stem) would otherwise match anything.
    NOTE(review): the heuristic is still substring-based and can flag the wrong
    file when names overlap; only the first matching row is flagged.
    """
    def contains(needle, haystack):
        return bool(needle) and needle in haystack

    return any((
        contains(target, candidate),
        contains(candidate, target),
        contains(target.replace('/', '\\'), candidate),
        contains(candidate.replace('/', '\\'), target),
        contains(target.split('.')[0], candidate),
        contains(candidate.split('.')[0], target),
    ))


for index, row in output_df.iterrows():
    version = row['Version']
    target_file = row['File']
    filename = f"{current_repo}/UND_hive_processed_data/UND_{version}_processed.csv"

    if not os.path.isfile(filename):
        print(f"File not found: {filename}")
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(version)
        unfound_file_names.append(target_file)
        unfound_files += 1
        continue

    try:
        if filename not in rows_by_path:
            with open(filename, 'r', newline='', encoding='utf-8') as file:
                rows_by_path[filename] = list(csv.reader(file))
        rows = rows_by_path[filename]
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error processing file {filename}: {e}")
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(version)
        unfound_file_names.append(target_file)
        # Count the failure so the totals below agree with Unfound_Files.csv.
        unfound_files += 1
        continue

    found = False
    target_file_lower = normalize_path(target_file)

    for i, columns in enumerate(rows):
        # Row 0 is the CSV header written by to_csv; matching it would overwrite
        # the 'Bug' column title with '1'. Rows without a file name are skipped
        # because an empty name normalizes to '.', which matches any path.
        if i == 0 or len(columns) < 2 or not columns[1].strip():
            continue

        if _loosely_matches(target_file_lower, normalize_path(columns[1])):
            columns[0] = '1'  # mutates the cached row in place
            found = True
            found_files += 1
            modified_paths.add(filename)
            print(f"Match found for Bug {row['Bug ID']} in {row['Version']}")
            break

    if not found:
        unfound_files += 1
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(version)
        unfound_file_names.append(target_file)
        print(f"Match not found for Bug {row['Bug ID']} in {row['Version']}")

# Write every modified CSV back exactly once.
for filename in sorted(modified_paths):
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerows(rows_by_path[filename])
    except OSError as e:
        print(f"Error processing file {filename}: {e}")

unfound_df = pd.DataFrame({
    'Bug ID': unfound_bug_ids,
    'Version': unfound_versions,
    'File': unfound_file_names
})

unfound_csv_path = 'Unfound_Files.csv'
unfound_df.to_csv(unfound_csv_path, index=False)
print(f"Unfound files saved to {unfound_csv_path}")

print(f"Total files found: {found_files}")
print(f"Total files not found: {unfound_files}")
total_files = found_files + unfound_files
if total_files:
    print(f"Percentage of files found: {found_files/total_files * 100}%")
else:
    # Nothing was parsed from the input file; avoid dividing by zero.
    print("Percentage of files found: n/a (no files were processed)")

Parsed data from input file:
          Bug ID Version                                               File
0     HIVE-22165   2.1.0  service/src/java/org/apache/hive/service/cli/s...
1     HIVE-21009   2.1.0  common/src/java/org/apache/hadoop/hive/conf/Hi...
2     HIVE-21009   2.1.0  service/src/java/org/apache/hive/service/auth/...
3     HIVE-21009   2.1.0  service/src/test/org/apache/hive/service/auth/...
4     HIVE-20771   3.1.0  serde/src/java/org/apache/hadoop/hive/serde2/l...
...          ...     ...                                                ...
4495  HIVE-21614   2.3.4  standalone-metastore/metastore-server/src/main...
4496  HIVE-21614   2.3.4  standalone-metastore/metastore-server/src/main...
4497  HIVE-21508   2.3.4  standalone-metastore/metastore-common/src/main...
4498  HIVE-16839   2.3.4  standalone-metastore/metastore-server/src/main...
4499  HIVE-16839   2.3.4  standalone-metastore/metastore-server/src/test...

[4500 rows x 3 columns]
Match found for Bug HIVE-22165 in 

## 3. Add Classes and Methods to Processed Files

As mentioned before, within the UND data collected, only files have been identified as containing a bug. UND also provides variables for classes and methods, which we will need to associate with their respective files.

### 3.1 Classes
We want to come up with values for the following fields for a given file, from the  methods and classes listed in the raw UND data:

- Classes:
   - CountClassBase
   - CountClassCoupled
   - CountClassDerived
   - MaxInheritanceTree

- Methods:
   - CountInputMin
   - CountInputMean
   - CountInputMax
   - CountOutputMin
   - CountOutputMean
   - CountOutputMax
   - CountPathMin
   - CountPathMean
   - CountPathMax
   - MaxNestingMin
   - MaxNestingMean
   - MaxNestingMax

To get the first three class metrics, we'll simply sum the CountClassBase, CountClassDerived and CountClassCoupled values of all the classes linked to a given file. For the last class metric, we'll take the maximum value of MaxInheritanceTree over the classes linked to the file. The code below additionally averages PercentLackOfCohesion over those classes.
For the metrics related to methods, we'll select only the "method" entries in the original UND dataset and create new columns for each of the variables above. For each set of variables, we'll simply take the minimal, average and maximal values over the methods of the file.

In [41]:
def get_version_from_filename(processed_file):
    """
    Pull the version string out of a processed file name.

    The expected shape is 'UND_<version>_processed.csv'. Returns None when the
    name does not end with '_processed.csv', and prints a message and returns
    None when the 'UND_' marker is missing.
    """
    suffix = '_processed.csv'
    if not processed_file.endswith(suffix):
        return None

    # Text before the first suffix occurrence, then cut at the 'UND_' marker.
    stem = processed_file.split(suffix)[0]
    pieces = stem.split('UND_')
    if len(pieces) < 2:
        print(f"Filename {processed_file} does not match the expected format. Skipping.")
        return None
    return pieces[1]

def load_dataframe(file_path, description):
    """
    Read the CSV at `file_path` with pandas.

    Returns the DataFrame, or None after printing a message (which includes
    `description`) when reading raises any exception.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error reading {description} '{file_path}': {err}. Skipping.")
        return None
    else:
        return frame

def identify_filename_columns(processed_df, data_df):
    """
    Work out which column holds the file name in each frame.

    Returns a pair: the second column of `processed_df` (None when it has fewer
    than two columns) and 'File' when `data_df` has such a column (otherwise
    None, after printing a message).
    """
    processed_columns = processed_df.columns
    filename_col_processed = None
    if len(processed_columns) > 1:
        filename_col_processed = processed_columns[1]

    if 'File' in data_df.columns:
        filename_col_data = 'File'
    else:
        filename_col_data = None
        print("No matching filename column found in data file. Skipping processed file.")

    return filename_col_processed, filename_col_data

def aggregate_method_metrics(method_entries, filename_col):
    """
    Summarise method-level metrics per file.

    Groups `method_entries` by `filename_col` and computes min, mean and max of
    CountInput, CountOutput, CountPath and MaxNesting. The result has one row
    per file and flat column names such as 'CountInputMin'. Returns None (after
    printing a message) when a metric column is missing or aggregation fails.
    """
    method_required_metrics = ['CountInput', 'CountOutput', 'CountPath', 'MaxNesting']
    missing_metrics = []
    for metric in method_required_metrics:
        if metric not in method_entries.columns:
            missing_metrics.append(metric)

    if missing_metrics:
        print(f"Missing method metric columns {missing_metrics}. Skipping method metrics.")
        return None

    try:
        statistics = ['min', 'mean', 'max']
        spec = {metric: list(statistics) for metric in method_required_metrics}
        aggregated = method_entries.groupby(filename_col).agg(spec).reset_index()

        # Collapse the two-level header: ('CountInput', 'min') -> 'CountInputMin';
        # the grouping column has an empty second level and keeps its name.
        flat_names = []
        for base, statistic in aggregated.columns:
            flat_names.append(f"{base}{statistic.capitalize()}" if statistic else base)
        aggregated.columns = flat_names
        return aggregated
    except Exception as e:
        print(f"Error aggregating method metrics: {e}. Skipping method metrics.")
        return None

def aggregate_class_metrics(class_entries, filename_col):
    """
    Summarise class-level metrics per file.

    Groups `class_entries` by `filename_col`, summing CountClassBase,
    CountClassCoupled and CountClassDerived, taking the max of
    MaxInheritanceTree and the mean of PercentLackOfCohesion. Returns None
    (after printing a message) when a metric column is missing or aggregation
    fails.
    """
    # Metric name -> reduction applied across the classes of one file.
    reducers = {
        'CountClassBase': 'sum',
        'CountClassCoupled': 'sum',
        'CountClassDerived': 'sum',
        'MaxInheritanceTree': 'max',
        'PercentLackOfCohesion': 'mean',
    }
    available = class_entries.columns
    missing_metrics = [name for name in reducers if name not in available]

    if missing_metrics:
        print(f"Missing class metric columns {missing_metrics}. Skipping class metrics.")
        return None

    try:
        per_file = class_entries.groupby(filename_col).agg(reducers)
        return per_file.reset_index()
    except Exception as e:
        print(f"Error aggregating class metrics: {e}. Skipping class metrics.")
        return None

In [37]:
def merge_and_fill_metrics(processed_df, aggregated_df, left_on, right_on, metric_columns):
    """
    Left-merge aggregated metrics into the processed frame and fill the gaps.

    Parameters:
        processed_df: frame with one row per file; it is not modified.
        aggregated_df: per-file aggregated metrics, or None.
        left_on: key column in `processed_df`.
        right_on: key column in `aggregated_df`.
        metric_columns: names that must exist in the result; NaN values in
            them are replaced by 0 and absent ones are created filled with 0.

    Returns:
        A new merged DataFrame, or `processed_df` itself (unchanged) when
        `aggregated_df` is None.

    Columns present in both frames are taken from `aggregated_df`, so they end
    up after the remaining processed columns. The join key `left_on` is never
    discarded, which keeps the merge valid when both frames share the key name.
    """
    if aggregated_df is None:
        return processed_df

    # Columns the aggregated frame will supply; drop the processed-side copies
    # without touching the caller's frame (no inplace mutation).
    aggregated_names = set(aggregated_df.columns)
    overlapping = [
        name for name in processed_df.columns
        if name in aggregated_names and name != left_on
    ]

    merged = processed_df.drop(columns=overlapping).merge(
        aggregated_df,
        how='left',
        left_on=left_on,
        right_on=right_on
    )

    # Files without any matching entry get 0 rather than NaN.
    for metric in metric_columns:
        if metric in merged.columns:
            merged[metric] = merged[metric].fillna(0)
        else:
            merged[metric] = 0
    return merged

def reorder_columns(processed_df, original_columns):
    """
    Return `processed_df` with `original_columns` first, in the given order,
    followed by every other column in its current order.
    """
    appended = []
    for name in processed_df.columns:
        if name not in original_columns:
            appended.append(name)
    ordering = original_columns + appended
    return processed_df[ordering]

def process_file(processed_file, processed_data_dir, data_dir, output_dir):
    """
    Enrich one processed file with method and class metrics from its raw export.

    Parameters:
        processed_file: file name of the form 'UND_<version>_processed.csv'.
        processed_data_dir: directory containing `processed_file`.
        data_dir: directory containing the raw export 'UND_<version>.csv'.
        output_dir: directory the enriched CSV is written to, under the same
            file name as `processed_file`.

    Returns None. The file is skipped, with a printed message where applicable,
    when the version cannot be derived, the raw export is missing, either CSV
    cannot be read, or a filename column cannot be identified in either frame.
    """
    version = get_version_from_filename(processed_file)
    if not version:
        return

    data_file = f'UND_{version}.csv'
    data_file_path = os.path.join(data_dir, data_file)

    if not os.path.isfile(data_file_path):
        print(f"Data file {data_file} not found in {data_dir}. Skipping {processed_file}.")
        return

    processed_df = load_dataframe(os.path.join(processed_data_dir, processed_file), "processed file")
    if processed_df is None:
        return

    data_df = load_dataframe(data_file_path, "data file")
    if data_df is None:
        return

    filename_col_processed, filename_col_data = identify_filename_columns(processed_df, data_df)
    if not filename_col_data:
        return
    if filename_col_processed is None:
        # Without a key column on the processed side the merges below cannot run.
        print(f"No filename column found in processed file {processed_file}. Skipping.")
        return

    original_columns = list(processed_df.columns)
    # assumes the first column of the raw export holds the entity kind — TODO confirm
    kind_col = data_df.columns[0]

    # Process Method Metrics: every entity whose kind mentions 'method'.
    method_entries = data_df[data_df[kind_col].str.contains('method', case=False, na=False)]
    method_aggregated = aggregate_method_metrics(method_entries, filename_col_data)
    method_metric_columns = [
        'CountInputMin', 'CountInputMean', 'CountInputMax',
        'CountOutputMin', 'CountOutputMean', 'CountOutputMax',
        'CountPathMin', 'CountPathMean', 'CountPathMax',
        'MaxNestingMin', 'MaxNestingMean', 'MaxNestingMax'
    ]
    processed_df = merge_and_fill_metrics(
        processed_df, method_aggregated, filename_col_processed, filename_col_data, method_metric_columns
    )

    # Process Class Metrics: every entity whose kind mentions 'class'.
    class_entries = data_df[data_df[kind_col].str.contains('class', case=False, na=False)]
    class_aggregated = aggregate_class_metrics(class_entries, filename_col_data)
    class_metric_columns = [
        'CountClassBase', 'CountClassCoupled', 'CountClassDerived',
        'MaxInheritanceTree', 'PercentLackOfCohesion'
    ]
    processed_df = merge_and_fill_metrics(
        processed_df, class_aggregated, filename_col_processed, filename_col_data, class_metric_columns
    )

    # The merge key from the raw export is only a join helper; remove it from
    # the result and from the column order used below.
    if filename_col_data in processed_df.columns:
        processed_df = processed_df.drop(columns=[filename_col_data])
    original_columns = [name for name in original_columns if name != filename_col_data]

    # Original columns first, new metric columns after them.
    processed_df = reorder_columns(processed_df, original_columns)

    output_file_path = os.path.join(output_dir, processed_file)
    try:
        processed_df.to_csv(output_file_path, index=False)
        print(f"Successfully updated {processed_file} and saved to {output_dir}")
    except Exception as e:
        print(f"Error saving updated file {output_file_path}: {e}.")

In [42]:
def process_files(processed_data_dir, data_dir, output_dir):
    """
    Run process_file on every '*_processed.csv' entry of `processed_data_dir`.

    Creates `output_dir` when it does not exist yet; entries with any other
    name are reported and skipped.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for entry in os.listdir(processed_data_dir):
        if not entry.endswith('_processed.csv'):
            print(f"File {entry} does not match '_processed.csv' pattern. Skipping.")
            continue
        process_file(entry, processed_data_dir, data_dir, output_dir)

def main():
    """
    Entry point of the merge step: derive the processed, raw and output
    directories from `project_repo` and hand them to process_files.
    """
    subfolders = ('UND_hive_processed_data', 'UND_hive_data', 'UND_hive_updated_data')
    processed_data_directory, data_directory, output_directory = (
        os.path.join(project_repo, folder) for folder in subfolders
    )

    process_files(
        str(processed_data_directory),
        str(data_directory),
        str(output_directory),
    )

if __name__ == "__main__":
    main()

Successfully updated UND_3.1.2_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_4.0.0_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_2.3.7_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_2.3.0_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_2.3.2_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_2.3.5_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_updated_data
Successfully updated UND_2.3.4_processed.csv and saved to /home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML

We can confirm the effectiveness of the above code by generating a summary of the new columns in all files

In [None]:
def get_new_metric_columns():
    """
    List the metric columns added by the merge step: the twelve method metrics
    (Min/Mean/Max of four base metrics) followed by the five class metrics.
    Keep this in sync with the processing code.
    """
    method_bases = ('CountInput', 'CountOutput', 'CountPath', 'MaxNesting')
    suffixes = ('Min', 'Mean', 'Max')
    method_metrics = [base + suffix for base in method_bases for suffix in suffixes]

    class_metrics = [
        'CountClassBase',
        'CountClassCoupled',
        'CountClassDerived',
        'MaxInheritanceTree',
        'PercentLackOfCohesion',
    ]

    return [*method_metrics, *class_metrics]

def initialize_stats(columns):
    """
    Build the running-range table: one {'min': None, 'max': None} entry per
    column name.
    """
    stats = {}
    for name in columns:
        stats[name] = {'min': None, 'max': None}
    return stats

def update_stats(df, columns, stats):
    """
    Widen the running min/max in `stats` with the values found in `df`.

    Only columns present in `df` are considered, and null extremes are ignored.
    `stats` is updated in place; nothing is returned.
    """
    for col in columns:
        if col not in df.columns:
            continue

        lowest = df[col].min()
        highest = df[col].max()

        if pd.notnull(lowest) and (stats[col]['min'] is None or lowest < stats[col]['min']):
            stats[col]['min'] = lowest
        if pd.notnull(highest) and (stats[col]['max'] is None or highest > stats[col]['max']):
            stats[col]['max'] = highest

def generate_summary(stats):
    """
    Turn the stats table into a DataFrame with one row per column and the
    fields 'Column', 'Min' and 'Max'.
    """
    records = [
        {'Column': name, 'Min': bounds['min'], 'Max': bounds['max']}
        for name, bounds in stats.items()
    ]
    return pd.DataFrame(records)

def analyze_new_columns(output_dir):
    """
    Compute the overall value range of the new metric columns.

    Scans every '*_processed.csv' file directly inside `output_dir`, folds its
    min/max into a running table and returns the summary DataFrame. Returns
    None (after printing a message) when the directory does not exist or holds
    no matching file; unreadable files are reported and skipped.
    """
    output_path = Path(output_dir)
    if not output_path.exists():
        print(f"Output directory {output_dir} does not exist.")
        return

    new_columns = get_new_metric_columns()
    stats = initialize_stats(new_columns)

    processed_files = []
    for candidate in output_path.iterdir():
        if candidate.is_file() and candidate.name.endswith('_processed.csv'):
            processed_files.append(candidate)

    if len(processed_files) == 0:
        print(f"No processed CSV files found in {output_dir}.")
        return

    for file in processed_files:
        try:
            update_stats(pd.read_csv(file), new_columns, stats)
        except Exception as e:
            print(f"Error reading {file}: {e}. Skipping.")

    return generate_summary(stats)

def save_summary(summary_df, project_repo, summary_filename='metrics_summary.csv'):
    """
    Write `summary_df` as CSV (without the index) to `summary_filename` inside
    `project_repo`, printing either the destination or the error encountered.
    """
    destination = os.path.join(project_repo, summary_filename)
    try:
        summary_df.to_csv(destination, index=False)
        print(f"Summary report saved to {destination}")
    except Exception as error:
        print(f"Error saving summary report: {error}")

def main():
    """
    Entry point of the verification step: summarise the new metric columns of
    the updated data under `project_repo`, print the summary and save it.
    NOTE(review): this definition replaces the earlier `main` of the merge step
    once the cell has run.
    """
    # Directory holding the files written by the merge step; adjust if needed.
    output_directory = os.path.join(project_repo, 'UND_hive_updated_data')

    summary = analyze_new_columns(str(Path(output_directory)))
    if summary is None:
        # Nothing to report: the directory is missing or contains no processed file.
        return

    print("Summary of New Metric Columns:")
    print(summary)

    # Persist the summary next to the project data.
    save_summary(summary, str(Path(project_repo)))

if __name__ == "__main__":
    main()

Summary of New Metric Columns:
                   Column           Min          Max
0           CountInputMin  0.000000e+00        190.0
1          CountInputMean  0.000000e+00        196.0
2           CountInputMax  0.000000e+00        648.0
3          CountOutputMin  0.000000e+00         73.0
4         CountOutputMean  0.000000e+00         74.0
5          CountOutputMax  0.000000e+00        235.0
6            CountPathMin -2.080375e+09    1363402.0
7           CountPathMean -1.428172e+08  200000004.4
8            CountPathMax  0.000000e+00  999999999.0
9           MaxNestingMin  0.000000e+00          8.0
10         MaxNestingMean  0.000000e+00          8.0
11          MaxNestingMax  0.000000e+00         11.0
12         CountClassBase  0.000000e+00       5394.0
13      CountClassCoupled  0.000000e+00      31496.0
14      CountClassDerived  0.000000e+00        173.0
15     MaxInheritanceTree  0.000000e+00          9.0
16  PercentLackOfCohesion  0.000000e+00        100.0
Summary report 