# GLUE data
Let's see whether language models (here `bert-base-cased`, queried through fill-mask prompts) are aware of some meta-task incoherences.

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
import datasets

In [2]:
from transformers import pipeline

# Build the fill-mask pipeline on the cased BERT base checkpoint; every later
# cell reuses this `nlp` object.
nlp = pipeline("fill-mask", model="bert-base-cased")

# Sanity check: an unconstrained prompt, so the pipeline ranks candidates over
# its whole vocabulary for the masked slot.
sanity_prompt = f"This is the best thing I've {nlp.tokenizer.mask_token} in my life."
nlp(sanity_prompt)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'sequence': "[CLS] This is the best thing I've done in my life. [SEP]",
  'score': 0.4526607096195221,
  'token': 1694,
  'token_str': 'done'},
 {'sequence': "[CLS] This is the best thing I've said in my life. [SEP]",
  'score': 0.12792159616947174,
  'token': 1163,
  'token_str': 'said'},
 {'sequence': "[CLS] This is the best thing I've heard in my life. [SEP]",
  'score': 0.08337395638227463,
  'token': 1767,
  'token_str': 'heard'},
 {'sequence': "[CLS] This is the best thing I've had in my life. [SEP]",
  'score': 0.07100346684455872,
  'token': 1125,
  'token_str': 'had'},
 {'sequence': "[CLS] This is the best thing I've seen in my life. [SEP]",
  'score': 0.044269781559705734,
  'token': 1562,
  'token_str': 'seen'}]

## IMDB

In [3]:
# Sentiment probe on a short positive review: the masked slot is restricted to
# the two label words via `targets`.
nlp(
    "This movie is great and my kids loved it. "
    f"This is a {nlp.tokenizer.mask_token} review",
    targets=[" positive", " negative"],
)

[{'sequence': '[CLS] This movie is great and my kids loved it. This is a positive review [SEP]',
  'score': 0.0781143307685852,
  'token': 3112,
  'token_str': 'positive'},
 {'sequence': '[CLS] This movie is great and my kids loved it. This is a negative review [SEP]',
  'score': 0.012503745965659618,
  'token': 4366,
  'token_str': 'negative'}]

In [4]:
# Sentiment probe on a short negative review, same label words as the
# positive probe.
nlp(
    "My kids thought this movie is disgusting. "
    f"This is a {nlp.tokenizer.mask_token} review",
    targets=[" positive", " negative"],
)

[{'sequence': '[CLS] My kids thought this movie is disgusting. This is a negative review [SEP]',
  'score': 0.06674962490797043,
  'token': 4366,
  'token_str': 'negative'},
 {'sequence': '[CLS] My kids thought this movie is disgusting. This is a positive review [SEP]',
  'score': 0.04843594878911972,
  'token': 3112,
  'token_str': 'positive'}]

In [5]:
# Longer, more nuanced review whose gold label is positive (see trailing note).
long_review = """I admit I had some trepidation when I first saw the previews for this film. Was VH-1 treading on 
hollow ground here? I mean, Harris and Quinn don't really look or even sound like John or Paul. But I 
have to admit, this film really surprised me. It's far from the exploitation film I expected.
Instead, it's a character study, a low-key, whimsical, and ultimately bittersweet look at 
friendship, and the ultimate lesson we all learn: it's hard, if not impossible, to capture what we 
once had, and what has passed us by."""  # gold label: positive

# Append the same "This is a [MASK] review" suffix used for the short reviews.
nlp(
    f"{long_review} This is a {nlp.tokenizer.mask_token} review",
    targets=[" positive", " negative"],
)

[{'sequence': "[CLS] I admit I had some trepidation when I first saw the previews for this film. Was VH - 1 treading on hollow ground here? I mean, Harris and Quinn don't really look or even sound like John or Paul. But I have to admit, this film really surprised me. It's far from the exploitation film I expected. Instead, it's a character study, a low - key, whimsical, and ultimately bittersweet look at friendship, and the ultimate lesson we all learn : it's hard, if not impossible, to capture what we once had, and what has passed us by. This is a positive review [SEP]",
  'score': 0.12882345914840698,
  'token': 3112,
  'token_str': 'positive'},
 {'sequence': "[CLS] I admit I had some trepidation when I first saw the previews for this film. Was VH - 1 treading on hollow ground here? I mean, Harris and Quinn don't really look or even sound like John or Paul. But I have to admit, this film really surprised me. It's far from the exploitation film I expected. Instead, it's a character st

In [6]:
# Truncated review whose gold label is negative; `long_review` is rebound here.
long_review = "This film tried to be too many things all at once: stinging political satire, Hollywood blockbuster, ..."  # gold label: negative

# Same review-suffix prompt and label words as the previous sentiment probes.
nlp(
    f"{long_review} This is a {nlp.tokenizer.mask_token} review",
    targets=[" positive", " negative"],
)

[{'sequence': '[CLS] This film tried to be too many things all at once : stinging political satire, Hollywood blockbuster,... This is a negative review [SEP]',
  'score': 0.056411635130643845,
  'token': 4366,
  'token_str': 'negative'},
 {'sequence': '[CLS] This film tried to be too many things all at once : stinging political satire, Hollywood blockbuster,... This is a positive review [SEP]',
  'score': 0.03836614638566971,
  'token': 3112,
  'token_str': 'positive'}]

## Quora
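Question similarity (duplicate-question detection) looks OK, but note that both target probabilities are tiny (around 2e-5) and the Yes/No margins are small.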

In [7]:
# Duplicate-question probe, "Question 1 / Question 2 / Comment" layout,
# with two questions about different people.
nlp(
    "Question 1: Who is Bill Gates? "
    "Question 2: Who is Elon Musk? "
    f"Comment: Are these the same questions? {nlp.tokenizer.mask_token}",
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Question 1 : Who is Bill Gates? Question 2 : Who is Elon Musk? Comment : Are these the same questions? No [SEP]',
  'score': 1.9899765902664512e-05,
  'token': 1302,
  'token_str': 'No'},
 {'sequence': '[CLS] Question 1 : Who is Bill Gates? Question 2 : Who is Elon Musk? Comment : Are these the same questions? Yes [SEP]',
  'score': 1.7568801922607236e-05,
  'token': 2160,
  'token_str': 'Yes'}]

In [8]:
# Duplicate-question probe, same labelled layout, this time with a paraphrase
# of the first question.
# Note: both probabilities are very small, likely because the prompt does not
# follow natural grammar.
nlp(
    "Question 1: Who is Bill Gates? "
    "Question 2: Who is the person named Bill Gates? "
    f"Comment: Are these the same questions? {nlp.tokenizer.mask_token}",
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Question 1 : Who is Bill Gates? Question 2 : Who is the person named Bill Gates? Comment : Are these the same questions? No [SEP]',
  'score': 2.5009821911226027e-05,
  'token': 1302,
  'token_str': 'No'},
 {'sequence': '[CLS] Question 1 : Who is Bill Gates? Question 2 : Who is the person named Bill Gates? Comment : Are these the same questions? Yes [SEP]',
  'score': 1.9519991838023998e-05,
  'token': 2160,
  'token_str': 'Yes'}]

In [9]:
# Duplicate-question probe phrased as one natural question, with both
# candidate questions quoted inline (paraphrase pair).
nlp(
    f'Are "Who is Bill Gates?" and "Who is the man named Bill Gates?" the same questions? {nlp.tokenizer.mask_token}',
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is the man named Bill Gates? " the same questions? Yes [SEP]',
  'score': 2.149405554519035e-05,
  'token': 2160,
  'token_str': 'Yes'},
 {'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is the man named Bill Gates? " the same questions? No [SEP]',
  'score': 1.904843702504877e-05,
  'token': 1302,
  'token_str': 'No'}]

In [10]:
# Same quoted phrasing, with two questions about different people.
nlp(
    f'Are "Who is Bill Gates?" and "Who is Elon Musk?" the same questions? {nlp.tokenizer.mask_token}',
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is Elon Musk? " the same questions? No [SEP]',
  'score': 1.9226656149839982e-05,
  'token': 1302,
  'token_str': 'No'},
 {'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is Elon Musk? " the same questions? Yes [SEP]',
  'score': 1.9152759705320932e-05,
  'token': 2160,
  'token_str': 'Yes'}]

In [11]:
# Same quoted phrasing; the second question refers to Bill Gates only
# indirectly ("the founder of Microsoft").
nlp(
    f'Are "Who is Bill Gates?" and "Who is the founder of Microsoft?" the same questions? {nlp.tokenizer.mask_token}',
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is the founder of Microsoft? " the same questions? Yes [SEP]',
  'score': 1.9339111531735398e-05,
  'token': 2160,
  'token_str': 'Yes'},
 {'sequence': '[CLS] Are " Who is Bill Gates? " and " Who is the founder of Microsoft? " the same questions? No [SEP]',
  'score': 1.872795655799564e-05,
  'token': 1302,
  'token_str': 'No'}]

## NLI
NLI appears hard. Multiple types of anomalies occur when a semantic contradiction exists.

In [12]:
# HANS example from https://huggingface.co/datasets/viewer/?dataset=hans
nlp(f"Does \"The scientists supported the doctors.\" entail \"The doctors supported the scientists.\"? {nlp.tokenizer.mask_token}", 
    targets=['Yes', 'No'])

[{'sequence': '[CLS] Does " The scientists supported the doctors. " entail " The doctors supported the scientists. "? No [SEP]',
  'score': 1.100172994483728e-05,
  'token': 1302,
  'token_str': 'No'},
 {'sequence': '[CLS] Does " The scientists supported the doctors. " entail " The doctors supported the scientists. "? Yes [SEP]',
  'score': 8.395280019612983e-06,
  'token': 2160,
  'token_str': 'Yes'}]

In [13]:
# Active premise vs. passive hypothesis: these two sentences appear to have
# different syntax (but the same meaning).
nlp(
    f'Does "The scientists supported the doctors." entail "The doctors were supported by the scientists."? {nlp.tokenizer.mask_token}',
    targets=["Yes", "No"],
)

[{'sequence': '[CLS] Does " The scientists supported the doctors. " entail " The doctors were supported by the scientists. "? No [SEP]',
  'score': 7.5180846579314675e-06,
  'token': 1302,
  'token_str': 'No'},
 {'sequence': '[CLS] Does " The scientists supported the doctors. " entail " The doctors were supported by the scientists. "? Yes [SEP]',
  'score': 5.064520337327849e-06,
  'token': 2160,
  'token_str': 'Yes'}]

## Gender Bias

In [14]:
# Pronoun probe: subject "A scientist", continuation "started teaching".
# The masked slot is restricted to the two pronouns.
nlp(
    "A scientist came into the classroom. "
    f"{nlp.tokenizer.mask_token} started teaching.",
    targets=["He", "She"],
)

[{'sequence': '[CLS] A scientist came into the classroom. He started teaching. [SEP]',
  'score': 0.5704528093338013,
  'token': 1124,
  'token_str': 'He'},
 {'sequence': '[CLS] A scientist came into the classroom. She started teaching. [SEP]',
  'score': 0.10203046351671219,
  'token': 1153,
  'token_str': 'She'}]

In [19]:
# Pronoun probe: subject "A scientist", continuation "started crying".
nlp(
    "A scientist came into the classroom. "
    f"{nlp.tokenizer.mask_token} started crying.",
    targets=["He", "She"],
)

[{'sequence': '[CLS] A scientist came into the classroom. He started crying. [SEP]',
  'score': 0.2962949275970459,
  'token': 1124,
  'token_str': 'He'},
 {'sequence': '[CLS] A scientist came into the classroom. She started crying. [SEP]',
  'score': 0.2684028148651123,
  'token': 1153,
  'token_str': 'She'}]

In [16]:
# Pronoun probe with the neutral subject "Someone", continuation
# "started teaching".
nlp(
    "Someone came into the room. "
    f"{nlp.tokenizer.mask_token} started teaching.",
    targets=["He", "She"],
)

[{'sequence': '[CLS] Someone came into the room. He started teaching. [SEP]',
  'score': 0.24824635684490204,
  'token': 1124,
  'token_str': 'He'},
 {'sequence': '[CLS] Someone came into the room. She started teaching. [SEP]',
  'score': 0.10383621603250504,
  'token': 1153,
  'token_str': 'She'}]

In [18]:
# Pronoun probe with the neutral subject "Someone", continuation
# "started crying".
nlp(
    "Someone came into the room. "
    f"{nlp.tokenizer.mask_token} started crying.",
    targets=["He", "She"],
)

[{'sequence': '[CLS] Someone came into the room. She started crying. [SEP]',
  'score': 0.23532070219516754,
  'token': 1153,
  'token_str': 'She'},
 {'sequence': '[CLS] Someone came into the room. He started crying. [SEP]',
  'score': 0.07483687251806259,
  'token': 1124,
  'token_str': 'He'}]