In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import kenlm
from tqdm import tqdm
import fastText
import pandas as pd
from bleu import *
import torch, os

In [2]:
#bert classifier

from tqdm import trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer

model_cls = BertForSequenceClassification.from_pretrained("./bert_classifier/imagecaption", num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model_cls.to('cuda')
model_cls.eval()

max_seq_len=70
sm = torch.nn.Softmax(dim=-1)

def evaluate_dev_set(input_sentences, labels, bs=32):
    """
    To evaluate whole dataset and return accuracy
    """
    ids = []
    segment_ids = []
    input_masks = []
    pred_lt = []
    for sen in input_sentences:
        text_tokens = tokenizer.tokenize(sen)
        if len(text_tokens) >= max_seq_len - 2:
            text_tokens = text_tokens[:max_seq_len - 3]
        tokens = ["[CLS]"] + text_tokens + ["[SEP]"]
        temp_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(temp_ids)
        segment_id = [0] * len(temp_ids)
        padding = [0] * (max_seq_len - len(temp_ids))

        temp_ids += padding
        input_mask += padding
        segment_id += padding
        
        ids.append(temp_ids)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
    
    ids = torch.tensor(ids).to('cuda')
    segment_ids = torch.tensor(segment_ids).to('cuda')
    input_masks = torch.tensor(input_masks).to('cuda')
    
    steps = len(ids) // bs
    
    for i in trange(steps+1):
        if i == steps:
            temp_ids = ids[i * bs : len(ids)]
            temp_segment_ids = segment_ids[i * bs: len(ids)]
            temp_input_masks = input_masks[i * bs: len(ids)]
        else:
            temp_ids = ids[i * bs : i * bs + bs]
            temp_segment_ids = segment_ids[i * bs: i * bs + bs]
            temp_input_masks = input_masks[i * bs: i * bs + bs]
        
        with torch.no_grad():
            preds = sm(model_cls(temp_ids, temp_segment_ids, temp_input_masks))
        
        #preds = preds.view(-1,bs)
        try:
            args = torch.argmax(preds, dim=-1)
            pred_lt.extend(args.tolist())
        except RuntimeError:
            pass
    accuracy = sum(np.array(pred_lt) == np.array(labels)) / len(labels)
    
    return accuracy, pred_lt

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [20]:
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import logging
logging.basicConfig(level=logging.INFO)

lm_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
lm_model = GPT2LMHeadModel.from_pretrained('gpt2')
path = os.path.join(os.getcwd(), "GPT2/imagecaption_language_model_2.bin")
lm_model_state_dict = torch.load(path)
lm_model.load_state_dict(lm_model_state_dict)
lm_model.to(device)
lm_model.eval()

lm_loss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='none')


def calculate_ppl_gpt2(sentence_batch, bs=16):
    # tokenize the sentences
    tokenized_ids = [None for i in range(len(sentence_batch))]
    ppl = [None for i in range(len(sentence_batch))]
    
    for i in range(len(sentence_batch)):
        tokenized_ids[i] = lm_tokenizer.encode(sentence_batch[i])
        
    sen_lengths = [len(x) for x in tokenized_ids]
    max_sen_length = max(sen_lengths)
    
    n_batch = len(sentence_batch)
    input_ids = np.zeros( shape=(n_batch, max_sen_length), dtype=np.int64)
    lm_labels = np.full(shape=(n_batch, max_sen_length), fill_value=-1)
    
    for i, tokens in enumerate(tokenized_ids):
        input_ids[i, :len(tokens)] = tokens
        lm_labels[i, :len(tokens)-1] = tokens[1:] 
    
    input_ids = torch.tensor(input_ids)#.to(device)
    lm_labels = torch.tensor(lm_labels)#.to(device)
    
    steps = n_batch // bs
    
    for i in range(steps+1):
        
        if i == steps:
            temp_input_ids = input_ids[i * bs : n_batch]
            temp_lm_labels = lm_labels[i * bs : n_batch]
            temp_sen_lengths = sen_lengths[i * bs : n_batch]
        else:
            temp_input_ids = input_ids[i * bs : i * bs + bs]
            temp_lm_labels = lm_labels[i * bs : i * bs + bs]
            temp_sen_lengths = sen_lengths[i * bs : i * bs + bs]
            
        temp_input_ids = temp_input_ids.to('cuda')
        temp_lm_labels = temp_lm_labels.to('cuda')
            
        with torch.no_grad():
            lm_pred = lm_model(temp_input_ids)
            
        loss_val = lm_loss(lm_pred[0].view(-1, lm_pred[0].size(-1)), temp_lm_labels.view(-1))
        normalized_loss = loss_val.view(len(temp_input_ids),-1).sum(dim= -1) / torch.tensor(temp_sen_lengths, dtype=torch.float32).to(device)
        tmp_ppl = torch.exp(normalized_loss)
        ppl[i * bs: i * bs + len(temp_input_ids)] = tmp_ppl.tolist()
    
    return  ppl


INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ubuntu/.pytorch_pretrained_bert/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/ubuntu/.pytorch_pretrained_bert/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at /home/ubuntu/.pytorch_pretrained_bert/4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt

In [4]:
#fasttext classifier
classifier_model = fastText.load_model('fasttextmodel/imagecaption_model.bin')

#kenlm lm
kenlm_lm = kenlm.Model('kenlmmodel/imagecaption.arpa')


In [21]:
df = pd.read_csv('imagecaption_all_model_prediction_1.csv', header = None)
label = 0
label_str = '__label__0'

list_sentences = df[1:len(df)].values.tolist()

list_sentences_source = []
list_sentences_human = []
for list_sentance in list_sentences:
    list_sentences_source.append(list_sentance[0])
    list_sentences_human.append(list_sentance[-1])

matrics1 = []
for i in tqdm(range(0, len(list_sentences[0]))):
    bleu_s = 0
    bleu_r = 0
    fasttext_c = 0
    kenlm_ppl = 0
    gpt2_ppl = 0
    
    sentences = []
    for j in range(0, len(list_sentences)):
        if(pd.isnull(list_sentences[j][i])):
            continue
        sentences.append(list_sentences[j][i])
        
    fasttext_labels = classifier_model.predict(sentences)
    
    total_sentences = len(sentences)

    bleu_s = get_bleu(list_sentences_source, sentences)
    bleu_r = get_bleu(list_sentences_human, sentences)

    for _, sentence in enumerate(sentences):
        if(fasttext_labels[0][_][0]==label_str):
            fasttext_c += 1
        kenlm_ppl += kenlm_lm.perplexity(sentence)
        
    labels_list = [label] * len(sentences)

    bert_accuracy, pred_label_list = evaluate_dev_set(sentences, labels_list)
    ppl_list_gpt2 = calculate_ppl_gpt2(sentences)
    
    for j in range(0, len(ppl_list_gpt2)):
        gpt2_ppl += ppl_list_gpt2[j]

    matrics1.append([bleu_s , bleu_r , fasttext_c/total_sentences , kenlm_ppl/total_sentences, bert_accuracy, gpt2_ppl/len(ppl_list_gpt2)])
    
    

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:03,  2.73it/s][A
 20%|██        | 2/10 [00:00<00:02,  2.94it/s][A
 30%|███       | 3/10 [00:00<00:02,  3.10it/s][A
 40%|████      | 4/10 [00:01<00:01,  3.19it/s][A
 50%|█████     | 5/10 [00:01<00:01,  3.23it/s][A
 60%|██████    | 6/10 [00:01<00:01,  3.26it/s][A
 70%|███████   | 7/10 [00:02<00:00,  3.29it/s][A
 80%|████████  | 8/10 [00:02<00:00,  3.32it/s][A
 90%|█████████ | 9/10 [00:02<00:00,  3.35it/s][A
100%|██████████| 10/10 [00:02<00:00,  3.56it/s][A
 11%|█         | 1/9 [00:04<00:33,  4.18s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:02,  3.81it/s][A
 20%|██        | 2/10 [00:00<00:02,  3.71it/s][A
 30%|███       | 3/10 [00:00<00:01,  3.65it/s][A
 40%|████      | 4/10 [00:01<00:01,  3.63it/s][A
 50%|█████     | 5/10 [00:01<00:01,  3.62it/s][A
 60%|██████    | 6/10 [00:01<00:01,  3.60it/s][A
 70%|███████   | 7/10 [00:01<00

In [11]:
# for i in tqdm(range(0, len(list_sentences[0]))):
#     gpt2_ppl = 0
    
#     sentences = []
#     for j in range(0, len(list_sentences)):
#         if(pd.isnull(list_sentences[j][i])):
#             continue
#         sentences.append(list_sentences[j][i])
    
#     ppl_list_gpt2 = calculate_ppl_gpt2(sentences)
#     for j in range(0, len(ppl_list_gpt2)):
#         gpt2_ppl += ppl_list_gpt2[j]

#     matrics1[i].append(gpt2_ppl/len(ppl_list_gpt2))

100%|██████████| 9/9 [00:14<00:00,  1.66s/it]


In [22]:
df = pd.read_csv('imagecaption_all_model_prediction_0.csv', header = None)
label = 1
label_str = '__label__1'

list_sentences = df[1:len(df)].values.tolist()

list_sentences_source = []
list_sentences_human = []
for list_sentance in list_sentences:
    list_sentences_source.append(list_sentance[0])
    list_sentences_human.append(list_sentance[-1])

matrics0 = []
for i in tqdm(range(0, len(list_sentences[0]))):
    bleu_s = 0
    bleu_r = 0
    fasttext_c = 0
    kenlm_ppl = 0
    gpt2_ppl = 0

    sentences = []
    for j in range(0, len(list_sentences)):
        if(pd.isnull(list_sentences[j][i])):
            continue
        sentences.append(list_sentences[j][i])
        
    fasttext_labels = classifier_model.predict(sentences)
    
    total_sentences = len(sentences)
    
    bleu_s = get_bleu(list_sentences_source, sentences)
    bleu_r = get_bleu(list_sentences_human, sentences)
    
    for _, sentence in enumerate(sentences):
        if(fasttext_labels[0][_][0]==label_str):
            fasttext_c += 1
        kenlm_ppl += kenlm_lm.perplexity(sentence)
        
    labels_list = [label] * len(sentences)
    bert_accuracy, pred_label_list = evaluate_dev_set(sentences, labels_list)
    
    ppl_list_gpt2 = calculate_ppl_gpt2(sentences)

    for j in range(0, len(ppl_list_gpt2)):
        gpt2_ppl += ppl_list_gpt2[j]
        
    matrics0.append([bleu_s , bleu_r , fasttext_c/total_sentences , kenlm_ppl/total_sentences, bert_accuracy, gpt2_ppl/len(ppl_list_gpt2)])
    
    

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:03,  2.88it/s][A
 20%|██        | 2/10 [00:00<00:02,  3.09it/s][A
 30%|███       | 3/10 [00:00<00:02,  3.19it/s][A
 40%|████      | 4/10 [00:01<00:01,  3.25it/s][A
 50%|█████     | 5/10 [00:01<00:01,  3.29it/s][A
 60%|██████    | 6/10 [00:01<00:01,  3.33it/s][A
 70%|███████   | 7/10 [00:02<00:00,  3.35it/s][A
 80%|████████  | 8/10 [00:02<00:00,  3.37it/s][A
 90%|█████████ | 9/10 [00:02<00:00,  3.39it/s][A
100%|██████████| 10/10 [00:02<00:00,  3.60it/s][A
 11%|█         | 1/9 [00:04<00:33,  4.15s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:02,  3.78it/s][A
 20%|██        | 2/10 [00:00<00:02,  3.70it/s][A
 30%|███       | 3/10 [00:00<00:01,  3.63it/s][A
 40%|████      | 4/10 [00:01<00:01,  3.60it/s][A
 50%|█████     | 5/10 [00:01<00:01,  3.58it/s][A
 60%|██████    | 6/10 [00:01<00:01,  3.58it/s][A
 70%|███████   | 7/10 [00:01<00

In [23]:
[print(i) for i in matrics0]

[100.0, 39.87944345358151, 0.98, 71.11467744760239, 0.6933333333333334, 34.39835715134939]
[24.92056428895913, 21.76467691499278, 1.0, 7.267386331689432, 1.0, 7.927897548675537]
[53.76071199348269, 29.713073271739376, 0.9966666666666667, 51.69677062697555, 0.59, 74.064298842748]
[49.09943533601552, 26.90944662837605, 0.9966666666666667, 30.49334370915424, 0.7366666666666667, 41.53330832163493]
[59.311072999300876, 36.10505325823188, 0.9966666666666667, 48.23631147057629, 0.8733333333333333, 51.96432491461436]
[47.074873410530884, 38.836287631862426, 1.0, 23.240615692689445, 0.9866666666666667, 27.62262170950572]
[69.90959012446687, 42.80293604104987, 0.8966666666666666, 204.70766625865213, 0.69, 26.737085501352947]
[12.89936397051252, 19.5490483856569, 0.9728813559322034, 659.302723232918, 0.7864406779661017, 43.01603642116159]
[36.256820971076934, 100.0, 0.9933333333333333, 109.83743714546452, 0.82, 36.76217499256134]


[None, None, None, None, None, None, None, None, None]

In [24]:
[print(i) for i in matrics1]

[100.0, 39.59805219279706, 0.02, 71.11467744760239, 0.30666666666666664, 34.39835715134939]
[23.236296228468905, 23.10680525628537, 0.016666666666666666, 10.131880712102145, 0.93, 12.273937996228536]
[53.77309695241419, 29.392399961802912, 0.023333333333333334, 55.34128283008179, 0.49666666666666665, 86.45355588595072]
[48.23012622194243, 27.05896019128694, 0.03, 28.247175534833158, 0.5833333333333334, 39.369440463383995]
[58.8079940029184, 36.499275332307356, 0.15333333333333332, 42.9083329745895, 0.7733333333333333, 53.05547393957774]
[51.03982230583542, 37.52242985249541, 0.35, 23.519130799891176, 0.9133333333333333, 29.930170119603474]
[66.64320101223511, 43.321603338624456, 0.22333333333333333, 101.63720073179675, 0.72, 30.990958212216697]
[13.346641728109384, 18.38065693490144, 0.07317073170731707, 811.8717184392814, 0.5923344947735192, 46.99428256255825]
[36.635239934223485, 100.0, 0.11666666666666667, 139.35899225692074, 0.79, 46.08650080680847]


[None, None, None, None, None, None, None, None, None]

In [25]:
matricsavg = (np.array(matrics0)+np.array(matrics1))/2

In [26]:
df_res0 = pd.DataFrame(matrics0, columns=['BLEU_source','BLEU_human','fasttext_classifier','klm_ppl', 'BERT_classifier', 'gpt2_ppl'])
df_res1 = pd.DataFrame(matrics1, columns=['BLEU_source','BLEU_human','fasttext_classifier','klm_ppl', 'BERT_classifier', 'gpt2_ppl'])
df_resavg = pd.DataFrame(matricsavg, columns=['BLEU_source','BLEU_human','fasttext_classifier','klm_ppl', 'BERT_classifier', 'gpt2_ppl'])

In [27]:
models_list = df[0:1].values.tolist()
#df_res.insert(loc=0, column='GLEU_score', value=gleu_list)
df_res0.insert(loc=0, column='model', value=models_list[0])
df_res1.insert(loc=0, column='model', value=models_list[0])
df_resavg.insert(loc=0, column='model', value=models_list[0])

In [28]:
df_resavg

Unnamed: 0,model,BLEU_source,BLEU_human,fasttext_classifier,klm_ppl,BERT_classifier,gpt2_ppl
0,Source,100.0,39.738748,0.5,71.114677,0.5,34.398357
1,CROSSALIGNED,24.07843,22.435741,0.508333,8.699634,0.965,10.100918
2,STYLEEMBEDDING,53.766904,29.552737,0.51,53.519027,0.543333,80.258927
3,MULTIDECODER,48.664781,26.984203,0.513333,29.37026,0.66,40.451374
4,DELETEONLY,59.059534,36.302164,0.575,45.572322,0.823333,52.509899
5,DELETEANDRETRIEVE,49.057348,38.179359,0.675,23.379873,0.95,28.776396
6,BERT_DEL,68.276396,43.06227,0.56,153.172433,0.705,28.864022
7,BERT_RET_TFIDF,13.123003,18.964853,0.523026,735.587221,0.689388,45.005159
8,HUMAN,36.44603,100.0,0.555,124.598215,0.805,41.424338


In [29]:
df_res0.to_csv('matrics/imagecaption/matrics_imagecaption_all_model_prediction_0.csv')

In [30]:
df_res1.to_csv('matrics/imagecaption/matrics_imagecaption_all_model_prediction_1.csv')

In [31]:
df_resavg.to_csv('matrics/imagecaption/matrics_imagecaption_all_model_prediction_avg.csv')