In [1]:
pip install -r requirements.txt

Collecting pandas (from -r requirements.txt (line 1))
  Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting scikit-learn (from -r requirements.txt (line 2))
  Downloading scikit_learn-1.5.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting networkx (from -r requirements.txt (line 3))
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting matplotlib (from -r requirements.txt (line 4))
  Downloading matplotlib-3.9.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting numpy>=1.22.4 (from pandas->-r requirements.txt (line 1))
  Downloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas->-r requirements.txt (line 1))
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->-r requirements.txt (line 1))
  Do

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# Function to read and vectorize the log file
def vectorize_log_file(file_path, vectorizer=None, fit=True):
    all_sentence = []
    with open(file_path, 'r') as file:
        log_content = file.read().strip().split('\n')
        all_sentence.append(extract_all_word_from_log(log_content))
    if not log_content:
        raise ValueError(f"The log file {file_path} is empty or contains only whitespace.")
    if fit:
        log_vector = vectorizer.fit_transform(all_sentence)
        
    else:
        log_vector = vectorizer.transform(all_sentence)
    return log_vector.toarray()[0]

def extract_all_word_from_log(log_context):
    words = ""
    for line in log_context:
        words += line.split('|')[-1].strip()

    return words

# Function to parse execution trace
def parse_execution_trace(file_path):
    return pd.read_csv(file_path)

def path_to_method_and_line_no(path):
    line_number = path.split('@')[1]
    method_name = '.'.join(path.split('.')[:-1])
    return method_name, line_number

# Function to vectorize execution trace for each method
def vectorize_execution_trace(trace_df, unique_methods):
    execution_vectors = {}
    total_lines_of_code = {
    'src/test/java/functions/CalculatorTest.java': 338,
    'src/main/java/functions/Calculator.java': 192,
    'src/main/java/functions/distributions/Calculator2.java': 27,
    'src/main/java/functions/distributions/Calculator3.java': 25,
    }

    for method in unique_methods:
        coverage = {path: [0] * lines for path, lines in total_lines_of_code.items()}
        method_trace = trace_df[trace_df['signature'] == method]

        method = method.replace('src/test/java/functions/CalculatorTest.java;CalculatorTest.', '')
        method = method.replace('#', '')
        for _, row in trace_df.iterrows():
            if str(row['line_no']) in map(str, method_trace['line_no'].values):
                coverage[row['signature'].split(';')[0]][row['line_no']-1] = 1
        print(coverage.values())
        execution_vectors[method] = [x for y in coverage.values() for x in y]
    return execution_vectors

In [4]:
trace_file = 'trace_data.csv'
log_directory = 'data/log'

# Load the single execution trace for training
trace_df = parse_execution_trace(trace_file)
trace_df['line_no'] = trace_df['path'].apply(lambda x: int(x.split('@')[1]))
unique_methods = trace_df['signature'].unique()
# unique_methods = np.delete(unique_methods, 0)

# Vectorize the execution trace for each method
execution_vectors = vectorize_execution_trace(trace_df, unique_methods)

# Prepare to store the log vectors and corresponding execution vectors
log_vectors = []
execution_vectors_for_model = []
method_names = []  # To keep track of method names for testing

vectorizer = CountVectorizer()

# Load and vectorize the log files
for method in unique_methods:
    method = method.replace('src/test/java/functions/CalculatorTest.java;CalculatorTest.', '')
    method = method.replace('#', '')
    log_file_path = os.path.join(log_directory, f"log_{method}.log")
    log_file_path = log_file_path.replace('"', '')

    if os.path.exists(log_file_path):
        try:
            log_vector = vectorize_log_file(log_file_path, vectorizer, fit=True)
            log_vectors.append(log_vector)
            execution_vectors_for_model.append(execution_vectors[method])
            method_names.append(method)
        except ValueError as e:
            print(e)
    else:
        print(f"Log file for method {method} not found.")

# Ensure log_vectors and execution_vectors_for_model are not empty
if not log_vectors or not execution_vectors_for_model:
    raise ValueError("Log vectors or execution vectors for model are empty. Check your data.")

max_log_length = max(len(v) for v in log_vectors)
max_exec_length = max(len(v) for v in execution_vectors_for_model)

# Pad the vectors with zeros to make them the same length
padded_log_vectors = np.array([np.pad(v, (0, max_log_length - len(v)), 'constant') for v in log_vectors])
padded_execution_vectors = np.array([np.pad(v, (0, max_exec_length - len(v)), 'constant') for v in execution_vectors_for_model])

# Combine all log vectors and execution vectors into single datasets
X = np.array(padded_log_vectors)
y = np.array(padded_execution_vectors)

# Ensure that y is of type integer
y = y.astype(int)

# Split the data into training (90%) and testing (10%) sets
X_train, X_test, y_train, y_test, method_train, method_test = train_test_split(X, y, method_names, test_size=0.2, random_state=42)

# Print shapes for debugging
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Train the Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

dict_values([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
padded_execution_vectors

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
# Validation Phase
# Predict faulty lines for the test set
y_pred = model.predict(X_test)

# Function to evaluate the predictions
def evaluate_predictions(method_test, y_test, y_pred):
    for i in range(len(y_test)):
        actual_coverage = np.where(y_test[i] == 1)[0]
        predicted_coverage = np.where(y_pred[i] == 1)[0]
        print(f"Method: {method_test[i]}")
        print(f"  Actual Coverage: {actual_coverage}")
        print(f"  Predicted Coverage: {predicted_coverage}")

# Evaluate the predictions
evaluate_predictions(method_test, y_test, y_pred)

Method: testIsPrime_N007
  Actual Coverage: [ 13  16  18  19  20  21  22 140 141 145 146 151 152]
  Predicted Coverage: [ 13  16  18  19  20  21  22 140 141 145 146 151 152]
Method: testFailure
  Actual Coverage: [ 3  4  5  6  7 13 16 18 19 20 21 22]
  Predicted Coverage: [ 3  4  5  6  7 13 16 18 19 20 21 22]
Method: testFactorial_N001
  Actual Coverage: [ 13  16  18  19  20  21  22 106 107 111 112 113 115 116]
  Predicted Coverage: [ 13  16  18  19  20  21  22 140 141 145 146 151 152]
Method: testLayer_N001
  Actual Coverage: [  6  11  12  13  16  18  19  20  21  22 170 171 172 173]
  Predicted Coverage: [ 3  4  5  6  7 13 16 18 19 20 21 22]
Method: testDivided_N002
  Actual Coverage: [ 13  16  18  19  20  21  22  72  73 157 158 166]
  Predicted Coverage: [ 13  16  18  19  20  21  22 157 158]
Method: testTimes_N004
  Actual Coverage: [ 13  16  18  19  20  21  22  64  65 157 158 166]
  Predicted Coverage: [ 13  16  18  19  20  21  22 157 158]
Method: testIsPrime_N002
  Actual Coverage: