# Data Cleanup

This notebook contains all the steps necessary to associate classes and methods in UND data collected before and to identify files with bugs. 

## 1. Processing UND Data

First, we will get rid of the all the classes and methods in each entry into the CSV and a 'Bug' column initialized at 0.

In [1]:
import os
import glob
import csv
import sys
import pandas as pd

In [2]:
def process_csv(file_path):
    df = pd.read_csv(file_path)
    
    df = df[df['Kind'] == 'File']

    df = df.drop(columns=['Kind','Entity_Uniquename'])  
    df.insert(0, 'Bug', 0)  
    
    df = df.rename(columns={'Name': 'FileName'})
    
    columns_order = ['Bug', 'FileName'] + [col for col in df.columns if col not in ['Bug', 'FileName']]
    df = df[columns_order]
    
    return df

current_repo = os.getcwd()
input_files = glob.glob(os.path.join(current_repo, 'UND_hive_data', '*.csv'))
output_dir = os.path.join(current_repo, 'UND_hive_processed_data')

os.makedirs(output_dir, exist_ok=True)
output_files = []

for file_path in input_files:
    processed_df = process_csv(file_path)

    base_name = os.path.basename(file_path) 
    output_file = os.path.join(output_dir, base_name.replace('.csv', '_processed.csv'))
    processed_df.to_csv(output_file, index=False)
    output_files.append(output_file)

print("Processing complete. Files saved:", output_files)

Processing complete. Files saved: ['/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.8_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.2.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-4.0.0_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.3_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-3.1.3_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.7_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.3.10_processed.csv', '/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/UND_hive_processed_data/UND_hive-2.

## 2. Identify Files with Bugs

For each of the affected files identified in the previous notebook, we will initialize at '1' the bug column 

In [21]:
csv.field_size_limit(sys.maxsize)

input_file = 'Hive_Affected_Files.csv'  

bug_ids = []
versions = []
file_names = []

with open(input_file, 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        bug_id = row[0]
        version = row[1]

        affected_files = row[2].split(';')

        for file in affected_files:
            bug_ids.append(bug_id)
            versions.append(version)
            file_names.append(file)

output_df = pd.DataFrame({
    'Bug ID': bug_ids,
    'Version': versions,
    'File': file_names
})

print(output_df)

unfound_bug_ids = []
unfound_versions = []
unfound_file_names = []

for index, row in output_df.iloc[1:].iterrows():
    filename = f"{current_repo}/UND_hive_processed_data/UND_hive-{row['Version']}_processed.csv"
    target_file = row['File'] 

    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        rows = list(reader)

    found = False

for i, columns in enumerate(rows):
    if len(columns) > 1:
        file_name_in_row = columns[1].strip().removeprefix("/home/nicolas-richard/Desktop/.Apache_Hive/")

        file_name_in_row_lower = file_name_in_row.lower()
        target_file_lower = target_file.lower()

        if (target_file_lower in file_name_in_row_lower or 
            file_name_in_row_lower in target_file_lower or
            target_file_lower.replace('/', '\\') in file_name_in_row_lower or
            file_name_in_row_lower.replace('/', '\\') in target_file_lower or
            target_file_lower.split('.')[0] in file_name_in_row_lower or
            file_name_in_row_lower.split('.')[0] in target_file_lower):
            
            columns[0] = '1'
            rows[i] = columns
            found = True
            break

    if not found:
        unfound_bug_ids.append(row['Bug ID'])
        unfound_versions.append(row['Version'])
        unfound_file_names.append(row['File'])
    else:
        with open(filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(rows)

percentageBug_files_found = 100 - ((len(unfound_bug_ids)/(len(bug_ids))) * 100)

print(f"Percentage of files found: {percentageBug_files_found}%")

          Bug ID  Version                                               File
0         Bug_ID  Version                                   Affected_File(s)
1     HIVE-19247    2.2.0  ql/src/java/org/apache/hadoop/hive/ql/optimize...
2     HIVE-19085    2.3.2  storage-api/src/java/org/apache/hadoop/hive/co...
3     HIVE-18611    2.3.2  ql/src/java/org/apache/hadoop/hive/ql/optimize...
4     HIVE-18611    2.3.2  ql/src/java/org/apache/hadoop/hive/ql/udf/gene...
...          ...      ...                                                ...
2651   HIVE-7239    2.1.0  ql/src/test/org/apache/hadoop/hive/ql/index/Mo...
2652   HIVE-7239    2.1.0  ql/src/test/org/apache/hadoop/hive/ql/index/Mo...
2653   HIVE-7239    2.1.0  ql/src/test/org/apache/hadoop/hive/ql/index/Sp...
2654   HIVE-7239    2.1.0  ql/src/test/org/apache/hadoop/hive/ql/index/Te...
2655   HIVE-7239    2.1.0  ql/src/test/org/apache/hadoop/hive/ql/index/Te...

[2656 rows x 3 columns]
Percentage of files found: -76.12951807228916%


Around 13% of affected files were not found, all versions considered, with possible reasons including file path changes or reorganization, files being deleted or moved, different naming conventions across versions, case sensitivity issues in filenames, file permissions or access issues, files being renamed during updates, incorrect path mappings, files being archived or compressed, files being merged or split into other files, and documentation/reference errors.