# Demo for ACTS Assignments
#### Rodrigo Alejandro Chavez Mulsa


In [1]:
%load_ext autoreload
%autoreload 2
from modules.Classifier import Classifier
from pytorch_lightning import LightningModule

from pytorch_lightning import Trainer
from modules.AverageEmbeddings import AverageEmbeddings
from modules.Classifier import Classifier
import torch
from torchtext.legacy.datasets.nli import SNLI
from torchtext.legacy.data import Field
from torchtext.legacy import data
import torchtext
from torch.utils.data import DataLoader

If you need the rest of the code to run it, it is available at: 
https://github.com/Noixas/ACTS-SNLI-Transfer

### Functions

In [2]:
def get_checkpoint_path(model_name='awe'):
    """Get the path to the model checkpoints. Available: [awe, lstm, bilstm, bilstm-max]
    """
    return'trained_models/'+model_name+'/gold/'+model_name+'.ckpt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [205]:
def get_prediction(_premise,_hypothesis,model,print_premise = True, print_pred = True):
    premise  = _premise.split(' ')
    hypothesis  = _hypothesis.split(' ')

    prem = TEXT.process([premise])
    hyp = TEXT.process([hypothesis])

    prem = [a.to(device) for a in prem] #Send to device
    hyp = [a.to(device) for a in hyp] 

    pred = model.demo_inference(prem,hyp)
    if print_premise:
        print("Premise:",_premise,'\nHypothesis:',_hypothesis)
    if print_pred:
        print("Predicted label -->",pred[1],'<-- with confidence:',pred[2])
    return pred[-1].detach().cpu().numpy()

In [148]:
def get_all_models_prediction(_premise,_hypothesis,models_list):
    print("Premise:",_premise,'\nHypothesis:',_hypothesis)
    for model_name in models_list:        
        print("Model:",model_name)
        model = load_model(model_name)
        probs = get_prediction(_premise,_hypothesis,model,print_premise = False)
        print("Probabilities",probs)

In [4]:
def load_model(model_name):
    checkpath = get_checkpoint_path(model_name)
    model = Classifier()
    pretrained_model = model.load_from_checkpoint(checkpath,model_name=model_name,disable_nonlinear=True,emb_vec=TEXT.vocab.vectors).to(device)
    return pretrained_model

## Load models

In [5]:
glove = torchtext.vocab.GloVe(name='840B', dim=300)

In [193]:
TEXT = Field(lower=True, include_lengths=True, batch_first=True,tokenize='spacy',tokenizer_language="en_core_web_sm")
LABEL = Field(sequential=False)

train, dev, test = SNLI.splits(TEXT, LABEL, root= './data')
TEXT.build_vocab(train, vectors=glove)
LABEL.build_vocab(train, specials_first=False)

In [7]:
awe_model = load_model('awe')
lstm_model = load_model('lstm')
bilstm_model = load_model('bilstm')
bilstm_max_model = load_model('bilstm-max')

# Demo HERE
Comment out the model to test and change the premise or hypothesis.

In [238]:
#Twice the same cells in case we want to input some sentences but want to keep the basic example.
#Entailment
premise = "Two woman are embracing while holding to go packages"
hypothesis = "Two woman are holding packages"
#Contradiction
premise="A man is typing on a machine used for stenography."
hypothesis="The man is not operating a stenograph."

In [297]:
# active_model = awe_model
# active_model = lstm_model
# active_model = bilstm_model
active_model = bilstm_max_model

#### Change input here!

In [296]:
premise = "A man is walking a dog"
hypothesis = "No cat is outside"

In [298]:
get_prediction(premise,hypothesis,active_model)

Premise: A man is walking a dog 
Hypothesis: No cat is outside
Predicted label --> contradiction <-- with confidence: 0.9996337890625


array([7.8099544e-07, 9.9963379e-01, 3.6542348e-04], dtype=float32)

## Scores 
In the following table we can see the accuracy scores for the NLI dev and test set along with the micro and macro scores of the transfer task which were measured by aggregating the accuracy scores of the following transfer tasks:
 
`transfer_tasks=['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC','MRPC', 'SICKEntailment']`

| Model      | NLI-dev | NLI-test | Transf-micro | Transf-macro |
|------------|---------|----------|--------------|--------------|
| AWE        | 0.6173  | 0.6283   | 82.573       | 79.129       |
| LSTM       | 0.791   | 0.7834   | 79.894       | 78.337       |
| BILSTM     | 0.7935  | 0.7948   | 83.36        | 82.185       |
| BILSTM-MAX | 0.834   | 0.8333   | 87.075       | 84.95        |

For the transfer tasks I used the suggested parameters to reproduce the results from the authors which are the following:    
`params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}`    
`params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,'tenacity': 5, 'epoch_size': 4}`

## Error Analysis

### Tokenization
First would like to compare an example where the difference between the tokenization of `isn't` and `is not` give use different results:

In [279]:
premise="A man is typing on a machine used for stenography."
print('Prediction with "is not" gives good results')
print('Expected: contradiction\n')

print('----------------------------------------------')
hypothesis="The man is not operating a stenograph."
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

print('----------------------------------------------')
print('Prediction with "is not" gives good results')
print('Expected: contradiction\n')
hypothesis="The man is'nt operating a stenograph."
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

Prediction with "is not" gives good results
Expected: contradiction

----------------------------------------------
Premise: A man is typing on a machine used for stenography. 
Hypothesis: The man is not operating a stenograph.
Model: awe
Predicted label --> entailment <-- with confidence: 0.4668668806552887
Probabilities [0.46686688 0.23888522 0.29424793]
Model: lstm
Predicted label --> entailment <-- with confidence: 0.7181154489517212
Probabilities [0.71811545 0.22792126 0.0539633 ]
Model: bilstm
Predicted label --> contradiction <-- with confidence: 0.7327508330345154
Probabilities [0.20750299 0.73275083 0.0597462 ]
Model: bilstm-max
Predicted label --> contradiction <-- with confidence: 0.8825724720954895
Probabilities [0.06302392 0.8825725  0.05440364]
----------------------------------------------
Prediction with "is not" gives good results
Expected: contradiction

Premise: A man is typing on a machine used for stenography. 
Hypothesis: The man is'nt operating a stenograph.
Mode

Based on the previous scores we can see how depending on the tokenization of certain abbreviatures could bring completely different predictions. Spacy tokenizer transforms "isn't" to "is","n't" and the glove vectors have a different vector for "n't" than "not". My hypothesis is that the amount of samples that use "n't" is less than the ones that use "not" so the models dont have as much data to learn with those examples.

In [277]:
print("ID of n't in glove:",glove.stoi["n't"])
print("ID of not in glove:",glove.stoi["not"])

ID of n't in glove: 40
ID of not in glove: 35


#### Examples from paper

In the analysis of the paper "A large annotated corpus for learning natural language inference", they mention an example with: "A young girl throws sand toward the ocean" as premise and "A girl can’t stand the ocean" as hypothesis and mention that all the models wrongly predict it as entailment, we can see in the cell bellow that is not the case for our models, they predict contradiction yet the expected label is neutral.

In [254]:
premise = "A young girl throws sand toward the ocean"
hypothesis = "A girl can’t stand the ocean"
#neutral
print('Expected: Neutral')
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

Expected: Neutral
Premise: A young girl throws sand toward the ocean 
Hypothesis: A girl can’t stand the ocean
Model: awe
Predicted label --> neutral <-- with confidence: 0.5283992290496826
Probabilities [0.02873656 0.44286418 0.5283992 ]
Model: lstm
Predicted label --> contradiction <-- with confidence: 0.4698620140552521
Probabilities [0.43704367 0.469862   0.09309435]
Model: bilstm
Predicted label --> contradiction <-- with confidence: 0.49269798398017883
Probabilities [0.4405183  0.49269798 0.06678371]
Model: bilstm-max
Predicted label --> contradiction <-- with confidence: 0.9215252995491028
Probabilities [0.01286405 0.9215253  0.0656106 ]


The lstm based models missclassifies the example as contradiction while the AWE Correclty classifies it as Neutral. We must pay attention to the probabilities where the lstm and bilstm have almost the same value for this example while the max pooling confidently predicts it a contradiction. The AWE correctly classifies the example but it does not mean it actually understand the context.

### Contradiction misinterpreted as entailment
Nevertheless the previous example does mean that our models outperform the models from the paper, we can see a similar example below where the models wrongly predict the example as entailment probably due to beach and ocean being associated but the models fail to understand the contradiction between fully clothed and naked. But the bilstm max pooling confidently predict it as contradiction.

In [255]:
premise = "Three men stand on the beach, fully clothed."
hypothesis = "Three naked men in the ocean."
#Contradiction
print('Expected: Contradiction')
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

Premise: Three men stand on the beach, fully clothed. 
Hypothesis: Three naked men in the ocean.
Model: awe
Predicted label --> entailment <-- with confidence: 0.9874842762947083
Probabilities [9.87484276e-01 6.79708493e-04 1.18359765e-02]
Model: lstm
Predicted label --> entailment <-- with confidence: 0.48467692732810974
Probabilities [0.48467693 0.18359077 0.3317323 ]
Model: bilstm
Predicted label --> entailment <-- with confidence: 0.741700291633606
Probabilities [0.7417003  0.11770273 0.14059706]
Model: bilstm-max
Predicted label --> contradiction <-- with confidence: 0.9837123155593872
Probabilities [0.0012838  0.9837123  0.01500385]


### Other examples

In [259]:
premise = "The kid is inside"
hypothesis = "The kid is outside"

print('Expected: Contradiction')
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

Expected: Contradiction
Premise: The kid is inside 
Hypothesis: The kid is outside
Model: awe
Predicted label --> entailment <-- with confidence: 0.999660849571228
Probabilities [9.9966085e-01 6.5033839e-07 3.3850095e-04]
Model: lstm
Predicted label --> contradiction <-- with confidence: 0.693578839302063
Probabilities [0.17099066 0.69357884 0.13543046]
Model: bilstm
Predicted label --> contradiction <-- with confidence: 0.7297667860984802
Probabilities [0.0951359  0.7297668  0.17509724]
Model: bilstm-max
Predicted label --> contradiction <-- with confidence: 0.9805310368537903
Probabilities [0.00177579 0.98053104 0.01769319]


In [258]:
premise = "Older lady checking out her goods at the check-out counter."
hypothesis = "A woman is sitting on her bed."
#Contradiction
print('Expected: Contradiction')
get_all_models_prediction(premise,hypothesis,['awe','lstm','bilstm','bilstm-max'])

Expected: Contradiction
Premise: Older lady checking out her goods at the check-out counter. 
Hypothesis: A woman is sitting on her bed.
Model: awe
Predicted label --> neutral <-- with confidence: 0.43991172313690186
Probabilities [0.22904624 0.33104205 0.43991172]
Model: lstm
Predicted label --> entailment <-- with confidence: 0.8786997199058533
Probabilities [0.8786997  0.00556939 0.11573087]
Model: bilstm
Predicted label --> entailment <-- with confidence: 0.8536057472229004
Probabilities [0.85360575 0.02180326 0.12459102]
Model: bilstm-max
Predicted label --> contradiction <-- with confidence: 0.410432368516922
Probabilities [0.40006053 0.41043237 0.18950711]


### Dev data sanalysis

In [217]:
LABEL.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f7e316b5ca0>>,
            {'entailment': 0,
             'contradiction': 1,
             'neutral': 2,
             '<unk>': 3,
             
             [torchtext.legacy.data.batch.Batch of size 64 from SNLI]
             	[.premise]:('[torch.cuda.LongTensor of size 64x27 (GPU 0)]', '[torch.cuda.LongTensor of size 64 (GPU 0)]')
             	[.hypothesis]:('[torch.cuda.LongTensor of size 64x22 (GPU 0)]', '[torch.cuda.LongTensor of size 64 (GPU 0)]')
             	[.label]:[torch.cuda.LongTensor of size 64 (GPU 0)]: 3})

In [287]:
def get_acc_dev(model,name):
    correct = 0
    total = len(dev.examples)
    correct_class = [0,0,0]
    amount_class = [0,0,0]
    from tqdm import tqdm
    for i, ex in enumerate(tqdm(dev.examples)):
        # print("\nExpected Label",ex.label)
        pr = get_prediction(' '.join(ex.premise),' '.join(ex.hypothesis),model,False,print_pred=False)
        prediction_n = np.argmax(pr)
        correct_id_labe = LABEL.vocab.stoi[ex.label]
        correct += 1 if correct_id_labe == prediction_n else 0
        correct_class[correct_id_labe] += 1 if correct_id_labe == prediction_n else 0
        amount_class[correct_id_labe] += 1 

    acc = correct/total
    acc_class = [correct_class[0]/amount_class[0], correct_class[1]/amount_class[1],correct_class[2]/amount_class[2]]
    print("\n ",name,"Dev Accuracy:",acc)
    labels = ['entailment', 'contradiction', 'neutral']
    print(labels)
    print("Acc per class:",acc_class)
    return acc,acc_class

In [295]:
acc_awe  =get_acc_dev(awe_model,'AWE')
acc_lstm  =get_acc_dev(lstm_model,'LSTM')
acc_bilstm  =get_acc_dev(bilstm_model,'BILSTM')
acc_bilstm_max  =get_acc_dev(bilstm_max_model,'BILSTM-MAX')

100%|██████████| 9842/9842 [00:04<00:00, 2346.03it/s]
  0%|          | 14/9842 [00:00<01:13, 134.15it/s]
  AWE Dev Accuracy: 0.6040438935175777
['entailment', 'contradiction', 'neutral']
Acc per class: [0.4373685791528988, 0.7535082367297132, 0.6241112828438949]
100%|██████████| 9842/9842 [00:56<00:00, 173.62it/s]
  0%|          | 8/9842 [00:00<02:08, 76.71it/s]
  LSTM Dev Accuracy: 0.7958748221906117
['entailment', 'contradiction', 'neutral']
Acc per class: [0.8437969360168218, 0.8081147040878585, 0.7341576506955177]
100%|██████████| 9842/9842 [01:44<00:00, 93.79it/s]
  0%|          | 8/9842 [00:00<02:13, 73.47it/s]
  BILSTM Dev Accuracy: 0.7897785003048161
['entailment', 'contradiction', 'neutral']
Acc per class: [0.8146590567738059, 0.801098230628432, 0.7527047913446677]
100%|██████████| 9842/9842 [01:49<00:00, 89.61it/s]
  BILSTM-MAX Dev Accuracy: 0.8292013818329608
['entailment', 'contradiction', 'neutral']
Acc per class: [0.878642234905377, 0.7928615009151921, 0.8151468315301391]

In [None]:
#Output in case notebook is cleaned
# 100%|██████████| 9842/9842 [00:04<00:00, 2346.03it/s]
#   0%|          | 14/9842 [00:00<01:13, 134.15it/s]
#   AWE Dev Accuracy: 0.6040438935175777
# ['entailment', 'contradiction', 'neutral']
# Acc per class: [0.4373685791528988, 0.7535082367297132, 0.6241112828438949]
# 100%|██████████| 9842/9842 [00:56<00:00, 173.62it/s]
#   0%|          | 8/9842 [00:00<02:08, 76.71it/s]
#   LSTM Dev Accuracy: 0.7958748221906117
# ['entailment', 'contradiction', 'neutral']
# Acc per class: [0.8437969360168218, 0.8081147040878585, 0.7341576506955177]
# 100%|██████████| 9842/9842 [01:44<00:00, 93.79it/s]
#   0%|          | 8/9842 [00:00<02:13, 73.47it/s]
#   BILSTM Dev Accuracy: 0.7897785003048161
# ['entailment', 'contradiction', 'neutral']
# Acc per class: [0.8146590567738059, 0.801098230628432, 0.7527047913446677]
# 100%|██████████| 9842/9842 [01:49<00:00, 89.61it/s]
#   BILSTM-MAX Dev Accuracy: 0.8292013818329608
# ['entailment', 'contradiction', 'neutral']
# Acc per class: [0.878642234905377, 0.7928615009151921, 0.8151468315301391]

On the output of the previous cell we can see that AWE model is better in finding contradiction but this could be simply that the model is biased to it and having certain words like "not" implies contradiciton.   
In the LSTM based models the results are more balanced per class but neutral remains lower in the lstm and bilstm while for the max pooling model contradictions perform worse.

In [288]:
acc_awe  =get_acc_dev(awe_model,'AWE')

100%|██████████| 9842/9842 [00:04<00:00, 2271.92it/s]
  AWE Dev Accuracy: 0.6040438935175777
['entailment', 'contradiction', 'neutral']
Acc per class: [0.4373685791528988, 0.7535082367297132, 0.6241112828438949]



In [None]:
# LABEL.vocab.stoi