In [1]:
import torch
import pickle
import codenet

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from scipy import sparse

# Show every column when displaying DataFrames. The fully-qualified option
# name is required: the short form 'max_columns' is treated as a pattern and
# matches more than one option on recent pandas versions, which raises
# OptionError.
pd.set_option('display.max_columns', None)

# Override the module-level setting `P` of the project-local `codenet` module.
# NOTE(review): the meaning of `P` is not visible in this notebook — confirm
# against the codenet module before changing it.
codenet.P = 1

In [2]:
# Read the error-pairs table from the CSV path exposed by the codenet module.
error_pairs_df = pd.read_csv(filepath_or_buffer=codenet.error_pairs_v2_path)

# Preview the first five rows as the cell output.
error_pairs_df.head(n=5)

Unnamed: 0,original_id,changed_id,original_status,problem_id,language,filename_ext,tag,i1,i2,j1,j2,output,error,returncode,error_class,error_class_extra
0,s000016565,s604436209,Runtime Error,p03106,Python,py,replace,22,23,22,24,,"Traceback (most recent call last):\n File ""<s...",1.0,AttributeError,AttributeError: 'str' object has no attribute ...
1,s000016565,s604436209,Runtime Error,p03106,Python,py,insert,49,49,52,53,,"File ""<string>"", line 4\n if a%i==0 and b...",1.0,SyntaxError,SyntaxError: unmatched ')'
2,s000016565,s604436209,Runtime Error,p03106,Python,py,insert,62,62,66,67,,"File ""<string>"", line 4\n if (a%i==0 and ...",1.0,SyntaxError,SyntaxError: invalid syntax
3,s000023530,s834210063,Runtime Error,p02684,Python,py,insert,81,81,81,88,,"Traceback (most recent call last):\n File ""<s...",1.0,NameError,NameError: name 'flag' is not defined
4,s000023530,s834210063,Runtime Error,p02684,Python,py,insert,179,179,185,237,,"File ""<string>"", line 18\n if s:]\n ...",1.0,SyntaxError,SyntaxError: unmatched ']'


In [3]:
# Load the pickled (X, y) pair from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load 'test.pkl' when it comes from a trusted source.
with open('test.pkl', mode='rb') as pickle_file:
    X, y = pickle.load(pickle_file)

In [70]:
# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Count + TF-IDF features. The analyzer is the identity function, so every
# sample is passed to the vectorizer as-is instead of being tokenised by it.
feat_extraction = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer=lambda tokens: tokens)),
    ('tfidf', TfidfTransformer()),
])

# Fit the vocabulary / IDF weights on the training split only, then reuse
# them for the test split.
X_train = feat_extraction.fit_transform(X_train)
X_test = feat_extraction.transform(X_test)

# Flatten all per-sample target lists (train and test together) and keep the
# third field of every entry; the label encoder is fitted on those values.
labels = np.array([entry for entries in y for entry in entries])[:, 2]
label_enc = LabelEncoder().fit(labels)

# Model dimensions: number of extracted features and number of distinct labels.
in_feat = X_train.shape[1]
num_classes = len(np.unique(labels))

(in_feat, num_classes)

(80226, 30)

In [88]:
class Model(nn.Module):
    """Single linear layer mapping a feature vector to the prediction vector.

    The output width is ``1 + 2 + num_classes``.

    Args:
        in_feat: size of each input feature vector.
        num_classes: number of classes contributing to the output width.
    """

    def __init__(self, in_feat, num_classes):
        super().__init__()

        self.in_feat = in_feat
        self.num_classes = num_classes
        # 1 + 2 leading outputs followed by one output per class.
        self.out_feat = 1 + 2 + num_classes

        self.linear = nn.Linear(in_features=in_feat, out_features=self.out_feat)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

class Loss(nn.Module):
    """Combined objectness / position / class loss for a single sample.

    The first row of ``output`` is read as
    ``[objectness, line, column, class_logit_0, ..., class_logit_{C-1}]``:
    column 0 is regressed towards 0 or 1, columns 1-2 are regressed towards
    the target position, and columns 3 onwards are the class logits.

    ``target`` is either empty (nothing to detect) or a 2-D tensor whose first
    row is ``[line, column, class_index]``. Only the first target row is used.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.mce = nn.CrossEntropyLoss()

    def forward(self, output, target):
        """Return the scalar loss for one ``(output, target)`` pair.

        Args:
            output: tensor of shape ``(1, 3 + num_classes)`` (only row 0 is read).
            target: empty tensor, or tensor with at least one row of
                ``[line, column, class_index]``.

        Returns:
            A 0-dim tensor: the objectness MSE alone when ``target`` is empty,
            otherwise objectness MSE + position MSE + class cross-entropy.
        """
        objectness = output[0, 0]

        # Build the constant targets from the output itself so that they share
        # its dtype and device.
        if len(target) == 0:
            return self.mse(objectness, torch.zeros_like(objectness))

        obj_loss = self.mse(objectness, torch.ones_like(objectness))
        pos_loss = self.mse(output[0, 1:3], target[0, 0:2])
        # Class logits start at column 3 (after objectness and the two
        # position outputs). Slicing from column 2 would feed the column
        # regression value in as logit 0 and shift every class index by one.
        mce_loss = self.mce(output[0:1, 3:], target[0:1, 2].long())

        return obj_loss + pos_loss + mce_loss
    
def target2tensor(target):
    """Convert one sample's raw target list into a float tensor.

    An empty target is turned into an empty float tensor. Otherwise the
    entries are stacked into a 2-D array, the third column is replaced by the
    integer codes produced by the module-level ``label_enc``, and the whole
    array is converted to float32.
    """
    if target:
        encoded = np.array(target)
        # Swap the raw labels in column 2 for their encoded class indices.
        encoded[:, 2] = label_enc.transform(encoded[:, 2])
        return torch.tensor(encoded.astype(np.float32)).float()

    return torch.tensor(target).float()

In [89]:
# Instantiate the network and its loss for the dimensions computed above.
model = Model(in_feat, num_classes)
loss_fn = Loss()

# Adam with default hyper-parameters, restricted to trainable parameters.
solver = torch.optim.Adam(
    [param for param in model.parameters() if param.requires_grad]
)

In [90]:
# Pair every row of the training feature matrix with its raw target list.
dataloader = [pair for pair in zip(X_train, y_train)]

# One pass over the training data, one sample per optimisation step.
for epoch in range(1):
    with tqdm(dataloader, position=0, total=len(dataloader)) as loop:
        for i, (x, target) in enumerate(loop):
            # Densify the sparse feature row and encode the target.
            x = torch.tensor(x.toarray()).float()
            target = target2tensor(target)

            predictions = model(x)
            loss = loss_fn(predictions, target)

            # Clear old gradients, back-propagate, update the weights.
            model.zero_grad()
            loss.backward()
            solver.step()

            # Show the latest loss next to the progress bar.
            loop.set_postfix(loss=loss.item(), epoch=epoch)

 19%|▏| 14749/78032 [09:50<42:13, 24.98it/s, 


KeyboardInterrupt: 

In [None]:
# Pair every row of the test feature matrix with its raw target list.
dataloader = [pair for pair in zip(X_test, y_test)]

def get_prediction(predictions):
    """Decode the first row of a raw model output.

    Returns an empty list when the first output value is below 0.5.
    Otherwise returns a one-element list ``[(label, line, column)]`` where
    ``line`` and ``column`` are outputs 1 and 2 truncated to integers and
    ``label`` is the index of the largest value among outputs 3 onwards.
    """
    row = predictions[0]

    if row[0] < 0.5:
        return []

    line, column = row[1:3].long().tolist()
    label = row[3:].argmax().item()

    return [(label, line, column)]
    
# Decoded prediction for every test sample, in dataloader order.
result = []

# Inference only: no gradients are needed, so skip autograd bookkeeping.
# The ground-truth target is deliberately not converted to a tensor here:
# it is not used during prediction, and the raw targets contain string
# labels that torch.tensor cannot convert.
with torch.no_grad(), tqdm(dataloader, position=0, total=len(dataloader)) as loop:
    for x, _ in loop:
        # Densify the sparse feature row before feeding it to the model.
        x = torch.tensor(x.toarray()).float()

        predictions = get_prediction(model(x))

        result.append(predictions)

In [None]:
# Map every label known to the encoder to its integer code once, so the
# comprehensions below use cheap dictionary lookups.
class_index = {str(name): idx for idx, name in enumerate(label_enc.classes_)}

# Predictions are stored as (label, line, column) with an already-encoded
# label; samples with no prediction count as "Accepted".
a_pred = [a[0][0] if a else class_index["Accepted"] for a in result]

# Ground-truth entries keep the raw label in their third field (index 2), so
# it has to be read from there and encoded before it can be compared with the
# encoded predictions; samples with no target count as "Accepted".
a_test = [class_index[str(a[0][2])] if a else class_index["Accepted"] for a in y_test]

In [None]:
# Weighted-average classification metrics over the encoded labels.
# zero_division=0 is passed to every metric that can hit an empty class so
# that all three behave consistently and none emits an undefined-metric warning.
print(f"Accuracy: {accuracy_score(a_test, a_pred)}")
print(f"Precision: {precision_score(a_test, a_pred, average='weighted', zero_division=0)}")
print(f"Recall: {recall_score(a_test, a_pred, average='weighted', zero_division=0)}")
print(f"F1: {f1_score(a_test, a_pred, average='weighted', zero_division=0)}")