# CodeNet Error Classification Example

In [1]:
import codenet

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Show every column when displaying DataFrames. The fully qualified key is required:
# the bare 'max_columns' pattern is ambiguous in newer pandas releases (it also matches
# 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)

# NOTE(review): sets a module-level attribute on the project-local `codenet` module;
# presumably a parallelism / worker-count setting — confirm against codenet's source.
codenet.P = 4

In [2]:
# Locations of the artifacts produced by the preprocessing stage.
input_path = "../input/generated/"
data_path = f"{input_path}data/"

generate_labels_path = f"{input_path}generate_labels.csv"

Here we load the labels generated in the preprocessing stage of the project and keep only the Python submissions. The error classes were obtained by running the example files on the sample input. For a baseline classification model we only need two things: the file contents (i.e., the tokenized version of the source code) and the `error_class` label. The goal is to create a pipeline that predicts the error label from the list of tokens.

In [3]:
# Read the generated labels and keep only the rows whose language is Python,
# as a single chain so the unfiltered frame is never bound to a name.
generate_labels_df = (
    pd.read_csv(generate_labels_path)
    .loc[lambda df: df['language'] == 'Python']
)
generate_labels_df.head()

Unnamed: 0,tag,i1,i2,j1,j2,problem_id,original_id,changed_id,language,extension,original_language,original_status,output,error,returncode,error_class,error_class_extra
4,insert,9,9,9,13,p02628,s000778835,s833669381,Python,py,Python (3.8.2),Runtime Error,,"Traceback (most recent call last):\n File ""/h...",1,ValueError,ValueError: invalid literal for int() with bas...
5,replace,136,137,136,137,p02406,s000873533,s357405506,Python,py,Python,Time Limit Exceeded,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: Missing parentheses in call to 'p...
7,replace,35,37,35,36,p02681,s001183728,s212071531,Python,py,Python (3.8.2),Runtime Error,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: invalid syntax
9,replace,86,87,86,87,p02314,s001192878,s967652203,Python,py,Python,Runtime Error,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: Missing parentheses in call to 'p...
11,replace,38,39,38,39,p03694,s001344748,s267165124,Python,py,Python (3.8.2),Runtime Error,,"Traceback (most recent call last):\n File ""/h...",1,NameError,NameError: name 'b' is not defined


In [4]:
# Build the model inputs from the filtered labels frame via the project-local helper.
# NOTE(review): presumably X holds the per-submission token lists and y the error_class
# labels (per the notebook text) — confirm against codenet.classification_X_y.
X, y = codenet.classification_X_y(generate_labels_df)

Processing p02615 s999341335: 100%|█| 3851/38


In [5]:
# Hold out the remainder after an 80% training share, stratified on the labels,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

Here we test a simple model, multinomial naive Bayes. To run the algorithm we first have to vectorize the tokens from the dataset into a matrix format, so we use a count vectorizer followed by a TF-IDF transformer.

In [6]:
def identity_analyzer(tokens):
    """Return the sample unchanged.

    Passed as the CountVectorizer analyzer so that no additional tokenization is
    applied; each sample is assumed to already be a sequence of tokens.
    """
    return tokens


# Baseline: token counts -> TF-IDF weighting -> multinomial naive Bayes.
# A named function is used instead of a lambda so the pipeline can be pickled.
model = Pipeline([
    ('vect', CountVectorizer(analyzer=identity_analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Use the same settings for every per-class metric: support-weighted averaging, and
# score undefined per-class values as 0 without emitting a warning. Previously only
# precision passed zero_division, so recall and F1 handled that case differently.
weighted_kwargs = dict(average='weighted', zero_division=0)

# Note: support-weighted recall is mathematically equal to accuracy.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, **weighted_kwargs)}")
print(f"Recall: {recall_score(y_test, y_pred, **weighted_kwargs)}")
print(f"F1: {f1_score(y_test, y_pred, **weighted_kwargs)}")

Accuracy: 0.49513303049967555
Precision: 0.2755448125475472
Recall: 0.49513303049967555
F1: 0.3320691113248573
