In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import kenlm
from tqdm import tqdm
import fastText
import pandas as pd
from bleu import *
import os
import numpy as np  # explicit import: `np` is used below and must not depend on the star import above
import torch

In [2]:
#bert classifier

from tqdm import trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Load the fine-tuned two-class BERT sequence classifier from the local
# checkpoint directory, plus the lower-casing 'bert-base-uncased' tokenizer.
model_cls = BertForSequenceClassification.from_pretrained("./bert_classifier/amazon", num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# The classifier is pinned to the GPU and switched to evaluation mode.
# NOTE(review): 'cuda' is hard-coded here, so this cell needs a CUDA device.
model_cls.to('cuda')
model_cls.eval()

# Padded input length used by evaluate_dev_set (includes [CLS] and [SEP]).
max_seq_len=70
# Softmax over the last (class) dimension of the classifier output.
sm = torch.nn.Softmax(dim=-1)

def evaluate_dev_set(input_sentences, labels, bs=32):
    """
    Classify every sentence with the BERT classifier and score it against ``labels``.

    Each sentence is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` / ``[SEP]``, right-padded with zeros to ``max_seq_len`` and run
    through ``model_cls`` in batches of at most ``bs`` sentences.

    Args:
        input_sentences: sequence of raw sentence strings.
        labels: expected class index for each sentence (same length as
            ``input_sentences``).
        bs: batch size used for the forward passes; must be positive.

    Returns:
        A tuple ``(accuracy, pred_lt)`` where ``accuracy`` is the fraction of
        predictions equal to the corresponding label (a float) and ``pred_lt``
        is the list of predicted class indices, one per sentence.
        For empty input the result is ``(0.0, [])``.

    Raises:
        ValueError: if ``bs`` is not positive or the two inputs differ in length.
    """
    if bs <= 0:
        raise ValueError("bs must be a positive integer, got {!r}".format(bs))

    n_sentences = len(input_sentences)
    if n_sentences != len(labels):
        raise ValueError(
            "input_sentences and labels must have the same length ({} != {})".format(
                n_sentences, len(labels)
            )
        )
    if n_sentences == 0:
        # Nothing to classify; avoid feeding an empty batch to the model and
        # dividing by zero below.
        return 0.0, []

    ids = []
    segment_ids = []
    input_masks = []
    for sen in input_sentences:
        text_tokens = tokenizer.tokenize(sen)
        # Truncation rule kept exactly as before (at most max_seq_len - 3 word
        # pieces once the limit is hit) so the classifier sees the same inputs.
        if len(text_tokens) >= max_seq_len - 2:
            text_tokens = text_tokens[:max_seq_len - 3]
        tokens = ["[CLS]"] + text_tokens + ["[SEP]"]
        temp_ids = tokenizer.convert_tokens_to_ids(tokens)
        n_real = len(temp_ids)
        padding = [0] * (max_seq_len - n_real)

        ids.append(temp_ids + padding)
        # Attention mask: 1 for real tokens, 0 for padding.
        input_masks.append([1] * n_real + padding)
        # Single-segment input: every position belongs to segment 0.
        segment_ids.append([0] * max_seq_len)

    # Keep the full dataset on the CPU and move one batch at a time to
    # whatever device the classifier lives on.
    ids = torch.tensor(ids)
    segment_ids = torch.tensor(segment_ids)
    input_masks = torch.tensor(input_masks)
    model_device = next(model_cls.parameters()).device

    pred_lt = []
    # Stepping by ``bs`` never yields an empty trailing batch, including when
    # the number of sentences is an exact multiple of the batch size.
    for start in trange(0, n_sentences, bs):
        stop = start + bs
        temp_ids = ids[start:stop].to(model_device)
        temp_segment_ids = segment_ids[start:stop].to(model_device)
        temp_input_masks = input_masks[start:stop].to(model_device)

        with torch.no_grad():
            preds = sm(model_cls(temp_ids, temp_segment_ids, temp_input_masks))

        # Errors are no longer swallowed: a failed batch would otherwise leave
        # pred_lt shorter than labels and silently corrupt the accuracy.
        pred_lt.extend(torch.argmax(preds, dim=-1).tolist())

    accuracy = float(np.mean(np.array(pred_lt) == np.array(labels)))

    return accuracy, pred_lt

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [6]:
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import logging
logging.basicConfig(level=logging.INFO)

# Start from the stock 'gpt2' tokenizer/model, then overwrite the weights with
# the locally fine-tuned language-model checkpoint.
lm_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
lm_model = GPT2LMHeadModel.from_pretrained('gpt2')
path = os.path.join(os.getcwd(), "GPT2/amazon_language_model_1.bin")
# map_location makes the checkpoint loadable on the selected device even when
# it was saved from a GPU and this host only has a CPU.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
lm_model_state_dict = torch.load(path, map_location=device)
lm_model.load_state_dict(lm_model_state_dict)
lm_model.to(device)
lm_model.eval()

# Per-token cross entropy; positions labelled -1 (padding) contribute 0 and
# reduction='none' keeps one loss value per position.
lm_loss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='none')


def calculate_ppl_gpt2(sentence_batch, bs=16):
    """
    Compute a GPT-2 perplexity score for every sentence in ``sentence_batch``.

    Sentences are encoded with the module-level ``lm_tokenizer``, right-padded
    with token id 0 to the longest sentence, and scored by ``lm_model`` in
    batches of at most ``bs``. For each sentence the next-token cross-entropy
    losses are summed and divided by the sentence's token count, and the
    exponential of that value is returned.

    Note: the divisor is the full token count (not the number of predicted
    tokens, which is one less); this is kept unchanged so scores stay
    comparable with previously computed results.

    Args:
        sentence_batch: sequence of sentence strings.
        bs: batch size for the forward passes; must be positive.

    Returns:
        A list of floats, one perplexity per input sentence, in input order.
        An empty input yields ``[]``. Sentences that encode to fewer than two
        tokens have no next-token target and score ``exp(0) == 1.0``.

    Raises:
        ValueError: if ``bs`` is not positive.
    """
    if bs <= 0:
        raise ValueError("bs must be a positive integer, got {!r}".format(bs))

    n_batch = len(sentence_batch)
    if n_batch == 0:
        return []

    # Tokenize the sentences.
    tokenized_ids = [lm_tokenizer.encode(sentence) for sentence in sentence_batch]
    sen_lengths = [len(tokens) for tokens in tokenized_ids]
    max_sen_length = max(sen_lengths)
    if max_sen_length == 0:
        # No sentence produced any token: nothing to feed to the model.
        return [1.0] * n_batch

    input_ids = np.zeros(shape=(n_batch, max_sen_length), dtype=np.int64)
    # -1 marks positions the loss must ignore (see lm_loss's ignore_index).
    lm_labels = np.full(shape=(n_batch, max_sen_length), fill_value=-1, dtype=np.int64)

    for row, tokens in enumerate(tokenized_ids):
        input_ids[row, :len(tokens)] = tokens
        # The target at position t is the token at t + 1; guard sentences with
        # fewer than two tokens, which have no target at all.
        if len(tokens) > 1:
            lm_labels[row, :len(tokens) - 1] = tokens[1:]

    input_ids = torch.tensor(input_ids)
    lm_labels = torch.tensor(lm_labels)

    ppl = []
    # Stepping by ``bs`` never produces an empty trailing batch.
    for start in range(0, n_batch, bs):
        stop = min(start + bs, n_batch)

        # Use the same device as the language model instead of a hard-coded 'cuda'.
        temp_input_ids = input_ids[start:stop].to(device)
        temp_lm_labels = lm_labels[start:stop].to(device)
        # Clamp so a zero-token sentence cannot cause a division by zero.
        temp_sen_lengths = torch.tensor(
            sen_lengths[start:stop], dtype=torch.float32, device=device
        ).clamp(min=1)

        with torch.no_grad():
            lm_pred = lm_model(temp_input_ids)
            # lm_pred[0] is treated as the logits; flattened to (positions, vocab).
            logits = lm_pred[0]
            loss_val = lm_loss(logits.view(-1, logits.size(-1)), temp_lm_labels.view(-1))

        normalized_loss = loss_val.view(stop - start, -1).sum(dim=-1) / temp_sen_lengths
        ppl.extend(torch.exp(normalized_loss).tolist())

    return ppl


INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ubuntu/.pytorch_pretrained_bert/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/ubuntu/.pytorch_pretrained_bert/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at /home/ubuntu/.pytorch_pretrained_bert/4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt

In [7]:
# fastText classifier: loaded from a local binary model file; used below via
# classifier_model.predict(...) to label each sentence.
classifier_model = fastText.load_model('fasttextmodel/amazon_model.bin')

# KenLM language model: loaded from a local ARPA file; used below via
# kenlm_lm.perplexity(...) to score each sentence.
kenlm_lm = kenlm.Model('kenlmmodel/amazon.arpa')


In [13]:
# Score every column of amazon_all_model_prediction_1.csv against expected
# class label 0.
df = pd.read_csv('amazon_all_model_prediction_1.csv', header = None)
label = 0
label_str = '__label__0'

# Row 0 holds the column names (see the 'model' column built later); every
# following row holds one sentence per column.
list_sentences = df[1:len(df)].values.tolist()

# First column -> source sentences, last column -> human references.
# Missing cells are replaced by a single space.
list_sentences_source = [" " if pd.isnull(row[0]) else row[0] for row in list_sentences]
list_sentences_human = [" " if pd.isnull(row[-1]) else row[-1] for row in list_sentences]

matrics1 = []
n_columns = len(list_sentences[0])
for col in tqdm(range(n_columns)):
    # All sentences of the current column, blanks substituted for missing cells.
    sentences = [" " if pd.isnull(row[col]) else row[col] for row in list_sentences]
    total_sentences = len(sentences)

    fasttext_labels = classifier_model.predict(sentences)

    # BLEU against the source column and against the human-reference column.
    bleu_s = get_bleu(list_sentences_source, sentences)
    bleu_r = get_bleu(list_sentences_human, sentences)

    # Number of sentences whose top fastText label equals the expected label.
    fasttext_c = sum(
        1 for k in range(total_sentences) if fasttext_labels[0][k][0] == label_str
    )
    # Summed KenLM perplexity; averaged when the row is assembled below.
    kenlm_ppl = sum(kenlm_lm.perplexity(sentence) for sentence in sentences)

    labels_list = [label] * total_sentences
    bert_accuracy, pred_label_list = evaluate_dev_set(sentences, labels_list)

    ppl_list_gpt2 = calculate_ppl_gpt2(sentences)
    gpt2_ppl = sum(ppl_list_gpt2)

    matrics1.append([
        bleu_s,
        bleu_r,
        fasttext_c / total_sentences,
        kenlm_ppl / total_sentences,
        bert_accuracy,
        gpt2_ppl / len(ppl_list_gpt2),
    ])
    




  0%|          | 0/9 [00:00<?, ?it/s][A[A[A



  0%|          | 0/16 [00:00<?, ?it/s][A[A[A[A



  6%|▋         | 1/16 [00:00<00:05,  2.76it/s][A[A[A[A



 12%|█▎        | 2/16 [00:00<00:04,  2.97it/s][A[A[A[A



 19%|█▉        | 3/16 [00:00<00:04,  3.10it/s][A[A[A[A



 25%|██▌       | 4/16 [00:01<00:03,  3.17it/s][A[A[A[A



 31%|███▏      | 5/16 [00:01<00:03,  3.21it/s][A[A[A[A



 38%|███▊      | 6/16 [00:01<00:03,  3.25it/s][A[A[A[A



 44%|████▍     | 7/16 [00:02<00:02,  3.29it/s][A[A[A[A



 50%|█████     | 8/16 [00:02<00:02,  3.32it/s][A[A[A[A



 56%|█████▋    | 9/16 [00:02<00:02,  3.34it/s][A[A[A[A



 62%|██████▎   | 10/16 [00:02<00:01,  3.35it/s][A[A[A[A



 69%|██████▉   | 11/16 [00:03<00:01,  3.37it/s][A[A[A[A



 75%|███████▌  | 12/16 [00:03<00:01,  3.38it/s][A[A[A[A



 81%|████████▏ | 13/16 [00:03<00:00,  3.39it/s][A[A[A[A



 88%|████████▊ | 14/16 [00:04<00:00,  3.40it/s][A[A[A[A



 94%|█████████▍| 15/

In [14]:
# Score every column of amazon_all_model_prediction_0.csv against expected
# class label 1.
df = pd.read_csv('amazon_all_model_prediction_0.csv', header = None)
label = 1
label_str = '__label__1'

# Row 0 holds the column names (see the 'model' column built later); every
# following row holds one sentence per column.
list_sentences = df[1:len(df)].values.tolist()

# First column -> source sentences, last column -> human references.
# Missing cells are replaced by a single space.
list_sentences_source = [" " if pd.isnull(row[0]) else row[0] for row in list_sentences]
list_sentences_human = [" " if pd.isnull(row[-1]) else row[-1] for row in list_sentences]

matrics0 = []
n_columns = len(list_sentences[0])
for col in tqdm(range(n_columns)):
    # All sentences of the current column, blanks substituted for missing cells.
    sentences = [" " if pd.isnull(row[col]) else row[col] for row in list_sentences]
    total_sentences = len(sentences)

    fasttext_labels = classifier_model.predict(sentences)

    # BLEU against the source column and against the human-reference column.
    bleu_s = get_bleu(list_sentences_source, sentences)
    bleu_r = get_bleu(list_sentences_human, sentences)

    # Number of sentences whose top fastText label equals the expected label.
    fasttext_c = sum(
        1 for k in range(total_sentences) if fasttext_labels[0][k][0] == label_str
    )
    # Summed KenLM perplexity; averaged when the row is assembled below.
    kenlm_ppl = sum(kenlm_lm.perplexity(sentence) for sentence in sentences)

    labels_list = [label] * total_sentences
    bert_accuracy, pred_label_list = evaluate_dev_set(sentences, labels_list)

    ppl_list_gpt2 = calculate_ppl_gpt2(sentences)
    gpt2_ppl = sum(ppl_list_gpt2)

    matrics0.append([
        bleu_s,
        bleu_r,
        fasttext_c / total_sentences,
        kenlm_ppl / total_sentences,
        bert_accuracy,
        gpt2_ppl / len(ppl_list_gpt2),
    ])
    
    




  0%|          | 0/9 [00:00<?, ?it/s][A[A[A



  0%|          | 0/16 [00:00<?, ?it/s][A[A[A[A



  6%|▋         | 1/16 [00:00<00:05,  2.77it/s][A[A[A[A



 12%|█▎        | 2/16 [00:00<00:04,  2.99it/s][A[A[A[A



 19%|█▉        | 3/16 [00:00<00:04,  3.12it/s][A[A[A[A



 25%|██▌       | 4/16 [00:01<00:03,  3.18it/s][A[A[A[A



 31%|███▏      | 5/16 [00:01<00:03,  3.23it/s][A[A[A[A



 38%|███▊      | 6/16 [00:01<00:03,  3.26it/s][A[A[A[A



 44%|████▍     | 7/16 [00:02<00:02,  3.29it/s][A[A[A[A



 50%|█████     | 8/16 [00:02<00:02,  3.32it/s][A[A[A[A



 56%|█████▋    | 9/16 [00:02<00:02,  3.34it/s][A[A[A[A



 62%|██████▎   | 10/16 [00:02<00:01,  3.36it/s][A[A[A[A



 69%|██████▉   | 11/16 [00:03<00:01,  3.37it/s][A[A[A[A



 75%|███████▌  | 12/16 [00:03<00:01,  3.38it/s][A[A[A[A



 81%|████████▏ | 13/16 [00:03<00:00,  3.39it/s][A[A[A[A



 88%|████████▊ | 14/16 [00:04<00:00,  3.40it/s][A[A[A[A



 94%|█████████▍| 15/

In [15]:
# Print one metrics row per line. A plain loop is used so the cell does not
# build (and display) a throwaway list of None values.
for row in matrics0:
    print(row)

[100.0, 67.73886758565439, 0.208, 119.45335778348579, 0.15, 34.96666891670227]
[14.976605614659091, 14.845597114754561, 0.908, 19.629643258587993, 0.77, 28.809080837249756]
[16.23939249839318, 15.330526739499476, 0.526, 84.87614110764278, 0.464, 131.30051573753357]
[16.36063461191676, 15.671692117300404, 0.756, 84.01531375202994, 0.71, 129.24480063438415]
[16.17150946519909, 14.016477517569381, 0.542, 33773.05888294231, 0.474, 56.648111888885495]
[15.7302883326841, 13.869651456684421, 0.592, 76.56198512211884, 0.526, 40.375796036243436]
[72.93740220165328, 55.51956117732476, 0.63, 195.51868619591448, 0.658, 56.72542850255966]
[70.95090040766817, 50.624013029408566, 0.608, 407.8507847781964, 0.64, 165.67828907108307]
[67.85479371420534, 100.0, 0.424, 2607.4768312100746, 0.452, 76.81063127493859]


[None, None, None, None, None, None, None, None, None]

In [16]:
# Print one metrics row per line. A plain loop is used so the cell does not
# build (and display) a throwaway list of None values.
for row in matrics1:
    print(row)

[100.0, 73.07703444909669, 0.2, 311.31211800210355, 0.138, 30.924947624206542]
[15.506475383131862, 14.616047532987405, 0.754, 21.250957704697296, 0.666, 31.39602951860428]
[17.225977804686043, 15.999586392807455, 0.384, 89.60899829181518, 0.298, 128.25660062837602]
[16.641866845761655, 15.725102417340914, 0.68, 76.83515055272728, 0.63, 115.7281846280098]
[16.138771562049822, 14.869266476333278, 0.47, 33800.62883955456, 0.45, 53.42682637453079]
[16.377687723501467, 15.030079763084302, 0.426, 128.17468398820753, 0.516, 43.9817015132904]
[74.19292935400931, 58.9965825823184, 0.57, 283.8150743920332, 0.6, 53.614639067173]
[70.99148690208703, 54.99056521087671, 0.544, 533.6119836489512, 0.572, 176.37393500709533]
[73.05091989442911, 100.0, 0.428, 37938.65602114074, 0.612, 77.09565449380875]


[None, None, None, None, None, None, None, None, None]

In [17]:
# Element-wise average of the two metric tables (same row/column layout).
matricsavg = np.add(np.array(matrics0), np.array(matrics1)) / 2

In [18]:
# One result frame per metric table; all three share the same column layout,
# in the order the values were appended to each metrics row.
metric_columns = ['BLEU_source','BLEU_human','fasttext_classifier','klm_ppl', 'BERT_classifier', 'gpt2_ppl']
df_res0 = pd.DataFrame(matrics0, columns=metric_columns)
df_res1 = pd.DataFrame(matrics1, columns=metric_columns)
df_resavg = pd.DataFrame(matricsavg, columns=metric_columns)

In [19]:
# Row 0 of the most recently loaded prediction file holds the column names;
# they become the 'model' column of every result frame.
# NOTE: `df` is whatever the last evaluation cell loaded, and `insert` raises
# if this cell is executed twice on the same frames.
models_list = df.iloc[:1].values.tolist()
for result_frame in (df_res0, df_res1, df_resavg):
    result_frame.insert(loc=0, column='model', value=models_list[0])

In [20]:
# Display the metrics table built from matrics0 (the label-1 run over
# amazon_all_model_prediction_0.csv).
df_res0

Unnamed: 0,model,BLEU_source,BLEU_human,fasttext_classifier,klm_ppl,BERT_classifier,gpt2_ppl
0,Source,100.0,67.738868,0.208,119.453358,0.15,34.966669
1,CROSSALIGNED,14.976606,14.845597,0.908,19.629643,0.77,28.809081
2,STYLEEMBEDDING,16.239392,15.330527,0.526,84.876141,0.464,131.300516
3,MULTIDECODER,16.360635,15.671692,0.756,84.015314,0.71,129.244801
4,DELETEONLY,16.171509,14.016478,0.542,33773.058883,0.474,56.648112
5,DELETEANDRETRIEVE,15.730288,13.869651,0.592,76.561985,0.526,40.375796
6,BERT_DEL,72.937402,55.519561,0.63,195.518686,0.658,56.725429
7,BERT_RET_TFIDF,70.9509,50.624013,0.608,407.850785,0.64,165.678289
8,HUMAN,67.854794,100.0,0.424,2607.476831,0.452,76.810631


In [26]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
os.makedirs('matrics/amazon', exist_ok=True)
df_res0.to_csv('matrics/amazon/matrics_amazon_all_model_prediction_0.csv')

In [27]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
os.makedirs('matrics/amazon', exist_ok=True)
df_res1.to_csv('matrics/amazon/matrics_amazon_all_model_prediction_1.csv')

In [25]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
os.makedirs('matrics/amazon', exist_ok=True)
df_resavg.to_csv('matrics/amazon/matrics_amazon_all_model_prediction_avg.csv')