In [1]:
import torch
import pandas  as pd

from lime.lime_text import LimeTextExplainer

import os
import pickle
import eli5

from transformers import T5ForConditionalGeneration, RobertaTokenizer,RobertaModel
import torch
import pandas as pd
import torch.nn.functional as F
import torch.nn as nn

import numpy as np
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler, MaskingTextSamplers


In [2]:
class Bert_lstm(nn.Module):
    """CodeBERT encoder followed by an LSTM and a linear classification head.

    The per-token hidden states returned by ``microsoft/codebert-base`` are
    run through a multi-layer (optionally bidirectional) LSTM. The final
    hidden state of the top LSTM layer goes through dropout and a linear
    layer, yielding ``output_size`` raw scores per example (no softmax or
    sigmoid is applied here).

    Args:
        hidden_dim: Hidden size of the LSTM, per direction.
        output_size: Number of scores produced for each example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        drop_prob: Dropout probability applied before the linear layer.
    """

    def __init__(self, hidden_dim, output_size, n_layers, bidirectional=True, drop_prob=0.5):
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        # The encoder is fine-tuned together with the LSTM and the head:
        # every encoder weight is explicitly kept trainable.
        self.bert = RobertaModel.from_pretrained("microsoft/codebert-base")
        for param in self.bert.parameters():
            param.requires_grad = True

        self.dropout = nn.Dropout(drop_prob)

        # input_size must equal the width of the encoder's token states.
        self.lstm = nn.LSTM(input_size=768,
                            hidden_size=self.hidden_dim,
                            num_layers=self.n_layers,
                            batch_first=True,
                            bidirectional=bool(bidirectional))

        # A bidirectional LSTM contributes one final state per direction,
        # so the head sees twice as many features.
        fc_in_features = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_features, output_size)

    def forward(self, x, hidden=None):
        """Compute raw class scores for a batch of token-id sequences.

        Args:
            x: Tensor of token ids handed directly to the encoder; the first
                dimension is the batch.
            hidden: Optional ``(h_0, c_0)`` tuple for the LSTM, e.g. the value
                returned by :meth:`init_hidden`. When ``None`` the LSTM starts
                from an all-zero state.

        Returns:
            Tensor of raw scores with one row per example and
            ``output_size`` columns.
        """
        # NOTE(review): only the token ids are given to the encoder, so no
        # attention mask is applied and padding positions are processed like
        # real tokens - confirm this matches how the checkpoint was trained.
        token_states = self.bert(x)[0].float()

        _, (h_n, _) = self.lstm(token_states, hidden)

        if self.bidirectional:
            # The last two entries of h_n belong to the top layer:
            # forward direction first, backward direction second.
            summary = torch.cat([h_n[-2], h_n[-1]], dim=-1)
        else:
            summary = h_n[-1]

        return self.fc(self.dropout(summary))

    def init_hidden(self, batch_size):
        """Create an all-zero initial ``(h_0, c_0)`` pair for the LSTM.

        The tensors are allocated on the same device as the model's
        parameters, so they always match the module they are fed into
        regardless of whether CUDA happens to be available on the machine.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple ``(h_0, c_0)`` of float32 tensors, each of shape
            ``(n_layers * num_directions, batch_size, hidden_dim)``.
        """
        reference = next(self.parameters())
        num_directions = 2 if self.bidirectional else 1
        shape = (self.n_layers * num_directions, batch_size, self.hidden_dim)
        return (reference.new_zeros(shape, dtype=torch.float),
                reference.new_zeros(shape, dtype=torch.float))



In [14]:
def predict_example(test_comment_list):
    """Return softmax class probabilities for a batch of texts.

    The function rebuilds the ``Bert_lstm`` network, restores its weights
    from ``../models/codebert_lstm``, tokenizes the given texts with the
    CodeBERT tokenizer and runs a single forward pass in evaluation mode.
    It is used in this notebook as the prediction callable handed to the
    explainer.

    Args:
        test_comment_list: Text(s) to classify; passed unchanged to the
            tokenizer (no punctuation stripping or other preprocessing is
            performed here).

    Returns:
        ``numpy.ndarray`` with one row per input text and one column per
        class; each row is a softmax distribution.
    """
    # NOTE(review): the network, checkpoint and tokenizer are reloaded on
    # every call; consider loading them once outside if this becomes slow.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Architecture settings; they have to match the saved checkpoint or
    # load_state_dict will reject it.
    hidden_dim = 384
    output_size = 2
    n_layers = 2
    bidirectional = True
    net = Bert_lstm(hidden_dim, output_size, n_layers, bidirectional)

    save_path = "../models/codebert_lstm"
    # map_location remaps stored tensors onto the device in use, so a
    # checkpoint written from a GPU still loads on a CPU-only machine.
    net.load_state_dict(torch.load(save_path, map_location=device))
    net.to(device)
    net.eval()

    # Convert the texts to token ids, padded to the longest text in the
    # batch and truncated to at most 512 tokens.
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    encoded = tokenizer(test_comment_list,
                        padding=True,
                        truncation=True,
                        max_length=512,
                        return_tensors='pt')
    inputs = encoded['input_ids'].to(device)

    # Keep the initial LSTM state on the same device as the model and inputs.
    hidden = tuple(state.to(device) for state in net.init_hidden(inputs.size(0)))

    with torch.no_grad():
        logits = net(inputs, hidden)
        probabilities = torch.nn.Softmax(dim=1)(logits)
    return probabilities.cpu().detach().numpy()


In [4]:
# Human-readable names of the two classes, in the same order as the
# target names given to the explainer further down.
label_names = ["negative", "positive"]

In [38]:
# A sampler that is constructed by hand carries its own random state, so the
# seed is given to both objects; otherwise the perturbed samples (and thus
# the explanation) would differ from run to run.
# n_samples is kept small because every sample costs one model forward pass.
explainer = TextExplainer(
    n_samples=50,
    random_state=2022,
    sampler=MaskingTextSampler(random_state=2022),
)

In [23]:
# Read the test split and take two consecutive rows (2*k and 2*k + 1);
# their 'new_code' fields are the texts explained below.
test = pd.read_csv("../data/clean_data/test.csv")
k = 11
example0, example1 = (test.iloc[2 * k + offset]['new_code'] for offset in range(2))


In [39]:
# Fit the local surrogate around the first example, then render the
# explanation as the cell's output.
explainer.fit(doc=example0, predict_proba=predict_example)
explainer.show_prediction(target_names=['negative', 'positive'])



Contribution?,Feature
17.983,Highlighted in text (sum)
-0.178,<BIAS>


In [40]:
# Fit the local surrogate around the second example, then render the
# explanation as the cell's output.
explainer.fit(doc=example1, predict_proba=predict_example)
explainer.show_prediction(target_names=['negative', 'positive'])



Contribution?,Feature
0.651,Highlighted in text (sum)
0.14,<BIAS>
