# CH07c Semantic similarity experiment with FLAIR

In [25]:
!pip install flair



In [26]:
import pandas as pd

In [27]:
# Sentence pairs that paraphrase each other.
# Stray tab / leading-space characters inside the literals were removed so the
# displayed table and the embedded text contain only the sentences themselves.
similar = [
    ("A black dog walking beside a pool.",
     "A black dog is walking along the side of a pool."),
    ("A blonde woman looks for medical supplies for work in a suitcase.",
     "The blond woman is searching for medical supplies in a suitcase."),
    ("A doubly decker red bus driving down the road.",
     "A red double decker bus driving down a street."),
    ("There is a black dog jumping into a swimming pool.",
     "A black dog is leaping into a swimming pool."),
    ("The man used a sword to slice a plastic bottle.",
     "A man sliced a plastic bottle with a sword."),
]
pd.DataFrame(similar, columns=["sen1", "sen2"])


Unnamed: 0,sen1,sen2
0,A black dog walking beside a pool.,A black dog is walking along the side of a pool.
1,A blonde woman looks for medical supplies for ...,The blond woman is searching for medical supp...
2,A doubly decker red bus driving down the road.,A red double decker bus driving down a street.
3,There is a black dog jumping into a swimming p...,A black dog is leaping into a swimming pool.
4,The man used a sword to slice a plastic bottle.\t,A man sliced a plastic bottle with a sword.


In [28]:
import pandas as pd
# Sentence pairs that describe unrelated scenes.
# The trailing space inside the first literal was removed so every sentence is
# stored without surrounding whitespace.
dissimilar = [
    ("A little girl and boy are reading books.",
     "An older child is playing with a doll while gazing out the window."),
    ("Two horses standing in a field with trees in the background.",
     "A black and white bird on a body of water with grass in the background."),
    ("Two people are walking by the ocean.",
     "Two men in fleeces and hats looking at the camera."),
    ("A cat is pouncing on a trampoline.",
     "A man is slicing a tomato."),
    ("A woman is riding on a horse.",
     "A man is turning over tables in anger."),
]
pd.DataFrame(dissimilar, columns=["sen1", "sen2"])

Unnamed: 0,sen1,sen2
0,A little girl and boy are reading books.,An older child is playing with a doll while ga...
1,Two horses standing in a field with trees in t...,A black and white bird on a body of water with...
2,Two people are walking by the ocean.,Two men in fleeces and hats looking at the cam...
3,A cat is pouncing on a trampoline.,A man is slicing a tomato.
4,A woman is riding on a horse.,A man is turning over tables in anger.


In [29]:
import torch, numpy as np
def sim(s1,s2):
  """Return the cosine similarity of two embedded sentences, rounded to 2 decimals.

  Args:
    s1, s2: objects exposing an `.embedding` attribute holding a 1-D torch
      tensor (in this notebook: sentences that were already passed through
      an embedding model's `embed`).

  Returns:
    The cosine similarity as a numpy float rounded to two decimals. Cosine
    similarity lies in [-1, 1] (not 0-1): vectors pointing in opposite
    directions yield negative scores.
  """
  # cosine_similarity compares along dim=1 by default, so add a batch
  # dimension to turn each (d,) vector into a (1, d) row.
  vec1 = s1.embedding.unsqueeze(0)
  vec2 = s2.embedding.unsqueeze(0)
  # Use a local name that does not shadow the function itself.
  score = torch.cosine_similarity(vec1, vec2).item()
  return np.round(score, 2)

def evaluate(embeddings, myPairList):
  """Score every sentence pair in `myPairList` with the given embedding model.

  Each raw string is wrapped in a `Sentence`, passed to `embeddings.embed`,
  and the two resulting sentences are compared with `sim`.

  Args:
    embeddings: object with an `embed(sentence)` method.
    myPairList: iterable of `(text1, text2)` string pairs.

  Returns:
    A tuple `(scores, mean_score)`: the per-pair similarity scores in input
    order, and their mean rounded to two decimals.
  """
  pair_scores = []
  for text_a, text_b in myPairList:
    sent_a = Sentence(text_a)
    sent_b = Sentence(text_b)
    # Embed first, then second sentence of the pair.
    for sent in (sent_a, sent_b):
      embeddings.embed(sent)
    pair_scores.append(sim(sent_a, sent_b))
  mean_score = np.round(np.mean(pair_scores), 2)
  return pair_scores, mean_score

## Document Pool Embedding

The Document Pool Embeddings apply a mean pooling operation over all words in a sentence: the average of all word embeddings is computed to obtain the sentence embedding.

In [30]:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
# Word-level embeddings identified by the name 'glove'.
glove_embedding = WordEmbeddings('glove')
# Document-level embedding built by pooling the GloVe word embeddings of a sentence.
glove_pool_embeddings = DocumentPoolEmbeddings([glove_embedding])

In [31]:
evaluate(glove_pool_embeddings, similar)

([0.97, 0.99, 0.97, 0.99, 0.98], 0.98)

In [32]:
evaluate(glove_pool_embeddings, dissimilar)

([0.94, 0.97, 0.94, 0.92, 0.93], 0.94)

## RNN-based Document Embeddings

In [33]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings
# RNN-based document embedding stacked on the GloVe word embeddings defined earlier.
# NOTE(review): no training step for this RNN is visible in the notebook, so its
# weights are presumably at their initial values and the scores below may differ
# between runs — confirm, and consider setting a random seed.
gru_embeddings = DocumentRNNEmbeddings([glove_embedding])

In [34]:
evaluate(gru_embeddings, similar)

([0.99, 1.0, 0.95, 1.0, 0.91], 0.97)

In [35]:
evaluate(gru_embeddings, dissimilar)

([0.87, 1.0, 0.92, 0.92, 0.88], 0.92)

## Transformer-based BERT Embeddings

In [36]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
# Document-level embeddings taken from the 'bert-base-uncased' checkpoint.
bert_embeddings = TransformerDocumentEmbeddings('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
evaluate(bert_embeddings, similar)

([0.85, 0.9, 0.96, 0.91, 0.89], 0.9)

In [38]:
evaluate(bert_embeddings, dissimilar)

([0.93, 0.94, 0.86, 0.93, 0.92], 0.92)

## SentenceBERT

In [39]:
!pip install sentence-transformers



In [40]:
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings
# Wrap the 'bert-base-nli-mean-tokens' sentence-transformers model as a flair
# document embedding so it can be passed to evaluate() like the other models.
sbert_embeddings = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')

In [41]:
evaluate(sbert_embeddings, similar)

([0.98, 0.95, 0.96, 0.99, 0.98], 0.97)

In [42]:
evaluate(sbert_embeddings, dissimilar)

([0.48, 0.41, 0.19, -0.05, 0.0], 0.21)

In [43]:
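# Tricky pairs: each pair uses exactly the same words in a different order, so the two sentences mean different things.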

In [44]:
# Each pair contains the same words in a different order.
tricky_pairs = [
    ("An elephant is bigger than a lion", "A lion is bigger than an elephant"),
    ("the cat sat on the mat", "the mat sat on the cat"),
]

In [45]:
evaluate(glove_pool_embeddings, tricky_pairs)

([1.0, 1.0], 1.0)

In [46]:
evaluate(gru_embeddings, tricky_pairs)

([0.91, 0.67], 0.79)

In [47]:
evaluate(bert_embeddings, tricky_pairs)

([1.0, 0.98], 0.99)

In [48]:
evaluate(sbert_embeddings, tricky_pairs)

([0.93, 0.97], 0.95)