# Data Cleanup

This notebook contains all the steps necessary to associate classes and methods in UND data collected before and to identify files with bugs. 

## 1. Processing UND Data

First, we will get rid of the all the classes and methods in each entry into the CSV and a 'Bug' column initialized at 0.

In [1]:
import os
import glob
import csv
import pandas as pd

In [2]:
def process_csv(file_path):
    df = pd.read_csv(file_path)
    
    df = df[df['Kind'] == 'File']

    df = df.drop(columns=['Kind','Entity_Uniquename'])  
    df.insert(0, 'Bug', 0)  
    
    df = df.rename(columns={'Name': 'FileName'})
    
    columns_order = ['Bug', 'FileName'] + [col for col in df.columns if col not in ['Bug', 'FileName']]
    df = df[columns_order]
    
    return df

current_repo = os.getcwd()
input_files = glob.glob(os.path.join(current_repo, 'UND_hive_data', '*.csv'))
output_dir = os.path.join(current_repo, 'UND_hive_processed_data')

os.makedirs(output_dir, exist_ok=True)
output_files = []

for file_path in input_files:
    processed_df = process_csv(file_path)

    base_name = os.path.basename(file_path) 
    output_file = os.path.join(output_dir, base_name.replace('.csv', '_processed.csv'))
    processed_df.to_csv(output_file, index=False)
    output_files.append(output_file)

print("Processing complete. Files saved:", output_files)

Processing complete. Files saved: ['/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.8_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.2.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-4.0.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.3_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-3.1.3_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.7_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.10_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.

## 2. Identify Files with Bugs

For each of the affected files identified in the previous notebook, we will initialize at '1' the bug column 

In [3]:
os.chdir("/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model")
current_repo = os.getcwd()
input_file = 'Hive_Modified_Files.csv'

bug_ids = []
versions = []
file_names = []

def normalize_path(path):
    return os.path.normpath(path).lower()

with open(input_file, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    headers = next(reader, None)
    for row in reader:
        if len(row) < 3:
            continue
        bug_id = row[0].strip()
        version = row[1].strip()
        affected_files = row[2].split(';')
        for affected_file in affected_files:
            affected_file = affected_file.strip()
            if affected_file:  # Ensure the file name is not empty
                bug_ids.append(bug_id)
                versions.append(version)
                file_names.append(affected_file)

output_df = pd.DataFrame({
    'Bug ID': bug_ids,
    'Version': versions,
    'File': file_names
})

print("Parsed data from input file:")
print(output_df)

unfound_bug_ids = []
unfound_versions = []
unfound_file_names = []

found_files = 0
unfound_files = 0

for index, row in output_df.iterrows():
    version = row['Version']
    target_file = row['File']
    filename = f"{current_repo}/UND_hive_processed_data/UND_hive-{version}_processed.csv"

    if not os.path.isfile(filename):
        print(f"File not found: {filename}")
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(version)
        unfound_file_names.append(target_file)
        unfound_files += 1
        continue

    try:
        with open(filename, 'r', newline='', encoding='utf-8') as file:
            rows = list(csv.reader(file))

        found = False
        target_file_lower = normalize_path(target_file)

        for i, columns in enumerate(rows):
            if len(columns) < 2:
                continue
            file_name_in_row_lower = normalize_path(columns[1])

            matches = [
                target_file_lower in file_name_in_row_lower,
                file_name_in_row_lower in target_file_lower,
                target_file_lower.replace('/', '\\') in file_name_in_row_lower,
                file_name_in_row_lower.replace('/', '\\') in target_file_lower,
                target_file_lower.split('.')[0] in file_name_in_row_lower,
                file_name_in_row_lower.split('.')[0] in target_file_lower
            ]

            if any(matches):
                columns[0] = '1'  
                rows[i] = columns
                found = True
                found_files += 1
                print(f"Match found for Bug {row['Bug ID']} in {row['Version']}")
                break

        if not found:
            unfound_files += 1
            unfound_bug_ids.append(row['Bug ID'])
            unfound_versions.append(version)
            unfound_file_names.append(target_file)
            print(f"Match not found for Bug {row['Bug ID']} in {row['Version']}")
        else:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerows(rows)

    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(version)
        unfound_file_names.append(target_file)

unfound_df = pd.DataFrame({
    'Bug ID': unfound_bug_ids,
    'Version': unfound_versions,
    'File': unfound_file_names
})

unfound_csv_path = 'Unfound_Files.csv'
unfound_df.to_csv(unfound_csv_path, index=False)
print(f"Unfound files saved to {unfound_csv_path}")

print(f"Total files found: {found_files}")
print(f"Total files not found: {unfound_files}")


Parsed data from input file:
          Bug ID Version                                               File
0     HIVE-22165   2.1.0  service/src/java/org/apache/hive/service/cli/s...
1     HIVE-21009   2.1.0  common/src/java/org/apache/hadoop/hive/conf/Hi...
2     HIVE-21009   2.1.0  service/src/java/org/apache/hive/service/auth/...
3     HIVE-21009   2.1.0  service/src/test/org/apache/hive/service/auth/...
4     HIVE-20771   3.1.0  serde/src/java/org/apache/hadoop/hive/serde2/l...
...          ...     ...                                                ...
4495  HIVE-21614   2.3.4  standalone-metastore/metastore-server/src/main...
4496  HIVE-21614   2.3.4  standalone-metastore/metastore-server/src/main...
4497  HIVE-21508   2.3.4  standalone-metastore/metastore-common/src/main...
4498  HIVE-16839   2.3.4  standalone-metastore/metastore-server/src/main...
4499  HIVE-16839   2.3.4  standalone-metastore/metastore-server/src/test...

[4500 rows x 3 columns]
Match found for Bug HIVE-22165 in 

In [None]:
print(f"Percentage of files found: {found_files/(unfound_files+found_files) * 100}%")

Percentage of files found: 89.5111111111111%


These results are poor and are likely due to errors during UND data collection. In spite of this, our results will not be affected as we can safely assume that all files containning bugs have been identified in the processed data.

## 3. Add Classes and Methods to Processed Files

As mentionned before, within the UND data collected, only files have been identified as containning a bug. UND also provides variables for classes and methods, which we will need to associate to their respective file.  

### 3.1 Classes
We want to come up with values for the following fields for a given file, from the  
- CountClassBase
- CountClassCoupled
- CountClassDerived
- MaxInheritanceTree

To get the first three metrics, we'll simply add the ClassBaseCount, CountClassDerived and CountClassCoupled metrics from each of the classes linked to a given file. For the last metric, we'll select maximum value of MaxInheritanceTree for each class linked to the file.

In [None]:
def aggregate_class_metrics(processed_data_dir, data_dir, output_dir):

    for processed_file in os.listdir(processed_data_dir):
        if not processed_file.endswith('_processed.csv'):
            continue        
        try:
            version = processed_file.split('_processed.csv')[0].split('UND_hive-')[1]
        except IndexError:
            print(f"Filename {processed_file} does not match the expected format. Skipping.")
            continue

        data_file = f'UND_hive-{version}.csv'
        data_file_path = os.path.join(data_dir, data_file)

        if not os.path.isfile(data_file_path):
            print(f"Data file {data_file} not found in {data_dir}. Skipping {processed_file}.")
            continue
        try:
            processed_df = pd.read_csv(os.path.join(processed_data_dir, processed_file))
        except Exception as e:
            print(f"Error reading {processed_file}: {e}. Skipping.")
            continue
        try:
            data_df = pd.read_csv(data_file_path)
        except Exception as e:
            print(f"Error reading {data_file}: {e}. Skipping.")
            continue
        kind_col = data_df.columns[0]

        class_entries = data_df[data_df[kind_col].str.contains('class', case=False, na=False)]

        filename_col_processed = processed_df.columns[1]

        possible_filename_cols = ['FilePath', 'FileName', 'File', 'filename']
        filename_col_data = None
        for col in possible_filename_cols:
            if col in data_df.columns:
                filename_col_data = col
                break
        if not filename_col_data:
            print(f"No matching filename column found in {data_file}. Skipping {processed_file}.")
            continue

        required_metrics = ['CountClassBase', 'CountClassCoupled', 'CountClassDerived', 'MaxInheritanceTree']
        missing_metrics = [metric for metric in required_metrics if metric not in data_df.columns]
        if missing_metrics:
            print(f"Missing metric columns {missing_metrics} in {data_file}. Skipping {processed_file}.")
            continue

        aggregated_metrics = class_entries.groupby(filename_col_data).agg({
            'CountClassBase': 'sum',
            'CountClassCoupled': 'sum',
            'CountClassDerived': 'sum',
            'MaxInheritanceTree': 'max'
        }).reset_index()

        processed_df = processed_df.merge(
            aggregated_metrics,
            how='left',
            left_on=filename_col_processed,
            right_on=filename_col_data,
            suffixes=('', '_agg')
        )

        for metric in required_metrics:
            if metric in processed_df.columns:
                processed_df[metric] = processed_df[metric].fillna(0)
            else:
                processed_df[metric] = 0

        if filename_col_data in processed_df.columns:
            processed_df.drop(columns=[filename_col_data], inplace=True)

        output_file_path = os.path.join(output_dir, processed_file)
        try:
            processed_df.to_csv(output_file_path, index=False)
            print(f"Successfully updated {processed_file} and saved to {output_dir}.")
        except Exception as e:
            print(f"Error saving updated file {output_file_path}: {e}.")
            continue

if __name__ == "__main__":
    # Define your directories here
    processed_data_directory = 'UND_hive_processed_data'
    data_directory = 'UND_hive_data'
    output_directory = 'UND_hive_updated_data'
    
    aggregate_class_metrics(processed_data_directory, data_directory, output_directory)

SyntaxError: incomplete input (2564707754.py, line 88)

### 3.2 Methods
We want to come up with values for the following fields: 
- CountInputMin
- CountInputMean
- CountInputMax
- CountOutputMin
- CountOutputMean
- CountOutputMax
- CountPathMin
- CountPathMean
- CountPathMax
- MaxNestingMin
- MaxNestingMean
- MaxNestingMax