In [1]:
pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Function to read and vectorize the log file
def vectorize_log_file(file_path, vectorizer=None, fit=True):
    with open(file_path, 'r') as file:
        log_content = file.read()
    if fit:
        log_vector = vectorizer.fit_transform([log_content])
    else:
        log_vector = vectorizer.transform([log_content])
    return log_vector.toarray()[0]

# Function to parse execution trace
def parse_execution_trace(file_path):
    with open(file_path, 'r') as file:
        trace_lines = file.readlines()
    trace_data = []
    for line in trace_lines:
        parts = line.strip().split(',')
        if len(parts) == 6:
            trace_data.append(parts)
    return pd.DataFrame(trace_data, columns=['signature', 'test_trace_id', 'line_no', 'registry_id', 'path', 'invoked_order'])

# Function to vectorize execution trace for each method
def vectorize_execution_trace(trace_df, unique_methods):
    execution_vectors = {}
    for method in unique_methods:
        method_trace = trace_df[trace_df['signature'] == method]
        method = method.replace('src/test/java/functions/CalculatorTest.java;CalculatorTest.', '')
        method = method.replace('#', '')
        method_vector = [1 if str(row['line_no']) in method_trace['line_no'].values else 0 for _, row in trace_df.iterrows()]
        execution_vectors[method] = method_vector
    return execution_vectors

In [15]:
trace_file = 'trace_data.csv'
log_directory = 'data/log'

# Load the single execution trace for training
trace_df = parse_execution_trace(trace_file)
unique_methods = trace_df['signature'].unique()
unique_methods = np.delete(unique_methods, 0)

# Vectorize the execution trace for each method
execution_vectors = vectorize_execution_trace(trace_df, unique_methods)

# Prepare to store the log vectors and corresponding execution vectors
log_vectors = []
execution_vectors_for_model = []

# Vectorizer for log files
vectorizer = TfidfVectorizer()

# Load and vectorize the log files
for method in unique_methods:
    method = method.replace('src/test/java/functions/CalculatorTest.java;CalculatorTest.', '')
    method = method.replace('#', '')
    log_file_path = os.path.join(log_directory, f"log_{method}.log")
    log_file_path = log_file_path.replace('"', '')

    if os.path.exists(log_file_path):
        log_vector = vectorize_log_file(log_file_path, vectorizer, fit=True)
        log_vectors.append(log_vector)
        execution_vectors_for_model.append(execution_vectors[method])
    else:
        print(f"Log file for method {method} not found.")

# Ensure log_vectors and execution_vectors_for_model are not empty
if not log_vectors or not execution_vectors_for_model:
    raise ValueError("Log vectors or execution vectors for model are empty. Check your data.")

max_log_length = max(len(v) for v in log_vectors)
max_exec_length = max(len(v) for v in execution_vectors_for_model)

# Pad the vectors with zeros to make them the same length
padded_log_vectors = np.array([np.pad(v, (0, max_log_length - len(v)), 'constant') for v in log_vectors])
padded_execution_vectors = np.array([np.pad(v, (0, max_exec_length - len(v)), 'constant') for v in execution_vectors_for_model])

# Combine all log vectors and execution vectors into single datasets
X = np.array(padded_log_vectors)
y = np.array(padded_execution_vectors)

# Ensure that y is of type integer
y = y.astype(int)

# Split the data into training (90%) and testing (10%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Print shapes for debugging
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Train the Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)


Log file for method "testAdd_N002" not found.
Log file for method "testDivided_E001" not found.
Log file for method "src/test/java/functions/distributions/Calculator2Test.java;Calculator2Test.testPower_N001" not found.
Log file for method "src/test/java/functions/distributions/Calculator2Test.java;Calculator2Test.testPower_N002" not found.
X_train shape: (5, 28)
y_train shape: (5, 124)
X_test shape: (2, 28)
y_test shape: (2, 124)


In [16]:

y_pred = model.predict(X_test)

# Function to evaluate the predictions
def evaluate_predictions(y_test, y_pred):
    for i in range(len(y_test)):
        actual_faulty_lines = np.where(y_test[i] == 1)[0]
        predicted_faulty_lines = np.where(y_pred[i] == 1)[0]
        print(f"Test sample {i}:")
        print(f"  Actual faulty lines: {actual_faulty_lines}")
        print(f"  Predicted faulty lines: {predicted_faulty_lines}")

# Evaluate the predictions
evaluate_predictions(y_test, y_pred)


Test sample 0:
  Actual faulty lines: [  1   2   3   4   5   6   7   8   9  10  92  93  94  95  96  97  98  99
 100 101 102]
  Predicted faulty lines: []
Test sample 1:
  Actual faulty lines: [34 35 36 37 38 39 40 41 42 43 44]
  Predicted faulty lines: []


In [17]:
X_test

array([[0.28221626, 0.28221626, 0.28221626, 0.09407209, 0.28221626,
        0.09407209, 0.28221626, 0.09407209, 0.09407209, 0.28221626,
        0.09407209, 0.09407209, 0.09407209, 0.37628835, 0.09407209,
        0.09407209, 0.28221626, 0.09407209, 0.28221626, 0.18814417,
        0.28221626, 0.09407209, 0.09407209, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.26311741, 0.52623481, 0.0877058 , 0.26311741, 0.26311741,
        0.0877058 , 0.26311741, 0.0877058 , 0.0877058 , 0.0877058 ,
        0.0877058 , 0.0877058 , 0.0877058 , 0.0877058 , 0.35082321,
        0.0877058 , 0.0877058 , 0.0877058 , 0.26311741, 0.0877058 ,
        0.26311741, 0.0877058 , 0.26311741, 0.0877058 , 0.        ,
        0.        , 0.        , 0.        ]])

In [13]:
X_train

array([[0.26311741, 0.52623481, 0.0877058 , 0.26311741, 0.26311741,
        0.0877058 , 0.26311741, 0.0877058 , 0.0877058 , 0.0877058 ,
        0.0877058 , 0.0877058 , 0.0877058 , 0.0877058 , 0.35082321,
        0.0877058 , 0.0877058 , 0.0877058 , 0.26311741, 0.0877058 ,
        0.26311741, 0.0877058 , 0.26311741, 0.0877058 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.26311741, 0.26311741, 0.52623481, 0.0877058 , 0.26311741,
        0.0877058 , 0.26311741, 0.0877058 , 0.0877058 , 0.0877058 ,
        0.0877058 , 0.0877058 , 0.0877058 , 0.0877058 , 0.35082321,
        0.0877058 , 0.0877058 , 0.26311741, 0.0877058 , 0.26311741,
        0.0877058 , 0.26311741, 0.0877058 , 0.0877058 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.27472113, 0.27472113, 0.27472113, 0.13736056, 0.27472113,
        0.27472113, 0.13736056, 0.27472113, 0.13736056, 0.13736056,
        0.13736056, 0.41208169, 0.13736056, 0.27472113, 0.13736056,
        0.27472113, 0.1373