In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove6b50dtxt/glove.6B.50d.txt
/kaggle/input/sick-sentence-similarity/SICK.txt


Here, I am gonna try out various methods to infer sentence similarities. The methods I am gonna try out are as follows:
1. Averaging Glove embeddings & finding cosine distance.
2. Using smooth inverse frequency on word embeddings.
3. Using Word Movers Distance method on embeddings.
4. Google Sentence Encoder
5. Cosine Similarity on BERT embeddings.

In [33]:
import re
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from sklearn.preprocessing import MinMaxScaler

# Preparing the Data
The data I'll be using for practice is SICK data (https://zenodo.org/record/2787612) that contains 10,000 English sentence pairs labelled with their semantic relatedness and entailment relation

In [16]:
original_df = pd.read_csv('../input/sick-sentence-similarity/SICK.txt', header = 0, error_bad_lines = False, sep = '\t')
display(original_df.head())
display(len(original_df))

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN
3,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRIAL
4,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.4,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN


9840

In [17]:
display(original_df.tail(10))

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
9830,9991,The young girl is blowing a bubble that is huge,There is no girl in pink twirling a ribbon,NEUTRAL,2.1,A_neutral_B,B_neutral_A,The young girl blows a huge bubble.,a girl in pink twirls a ribbon,FLICKR,FLICKR,TEST
9831,9992,A dog in a colored coat is running across the ...,The flute is being played by one man,NEUTRAL,1.0,A_neutral_B,B_neutral_A,A man is playing a flute.,a dog with a brindle-colored coat is running a...,SEMEVAL,FLICKR,TEST
9832,9993,A door is being opened by a man,A bald man in a band is playing guitar in the ...,NEUTRAL,1.1,A_neutral_B,B_neutral_A,a man is opening a door,a bald man in a band is playing guitar in the ...,SEMEVAL,FLICKR,TRAIN
9833,9994,A boy is happily playing the piano,A white bird is landing swiftly in the water,NEUTRAL,1.0,A_neutral_B,B_neutral_A,A boy is playing the piano.,a white birds lands swiftly in the water,SEMEVAL,FLICKR,TEST
9834,9995,"The girl, who is little, is combing her hair i...",Two people wearing helmets are driving over th...,NEUTRAL,1.0,A_neutral_B,B_neutral_A,The little girl is putting her hair into a pon...,two people wearing helmets ride over the yello...,SEMEVAL,FLICKR,TEST
9835,9996,A man is in a parking lot and is playing tenni...,The snowboarder is leaping fearlessly over whi...,NEUTRAL,1.0,A_neutral_B,B_neutral_A,A man is playing tennis with himself against a...,A snowboarder launches into the air over white...,FLICKR,FLICKR,TEST
9836,9997,Someone is boiling okra in a pot,The man is not playing the drums,NEUTRAL,1.0,A_neutral_B,B_neutral_A,someone is boiling okra in a pot,the man is playing the drums,SEMEVAL,SEMEVAL,TRAIN
9837,9998,The man is singing heartily and playing the gu...,A bicyclist is holding a bike over his head in...,NEUTRAL,1.0,A_neutral_B,B_neutral_A,the man sang and played his guitar,a bicyclist holds their bike over their head b...,SEMEVAL,FLICKR,TRAIN
9838,9999,A man in blue has a yellow ball in the mitt,A man is jumping rope outside,NEUTRAL,1.2,A_neutral_B,B_neutral_A,a man is jumping rope outside,a woman in blue has a yellow ball in her mitt,SEMEVAL,FLICKR,TRAIN
9839,10000,Three dogs are resting on a sidewalk,The woman with a knife is slicing a pepper,NEUTRAL,1.0,A_neutral_B,B_neutral_A,a woman with a knife is slicing a pepper,three dogs on a sidewalk,SEMEVAL,FLICKR,TRAIN


In [30]:
# Let's try to get the range of the relatedness score
original_df['relatedness_score'].describe()

count    9840.000000
mean        3.525636
std         1.017050
min         1.000000
25%         3.000000
50%         3.600000
75%         4.300000
max         5.000000
Name: relatedness_score, dtype: float64

# 1. Averaging Glove Embeddings and Cosine Distance

In [36]:
gloveFile = '../input/glove6b50dtxt/glove.6B.50d.txt'
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    with open(gloveFile, encoding="utf8" ) as f:
        content = f.readlines()
    model = {}
    for line in content:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model

model = loadGloveModel(gloveFile)

Loading Glove Model
Done. 400000  words loaded!


In [51]:
def preprocess(raw_text):
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    cleaned_words = list(set([w for w in words if w not in stopword_set]))
    
    # remove the words we do not have an embedding for
    preprocessed_words = list(set([w for w in cleaned_words if w in list(model.keys())]))

    return preprocessed_words

def cosine_distance_between_two_words(word1, word2):
    return (1- scipy.spatial.distance.cosine(model[word1], model[word2]))

def calculate_heat_matrix_for_two_sentences(s1,s2):
    s1 = preprocess(s1)
    s2 = preprocess(s2)
    result_list = [[cosine_distance_between_two_words(word1, word2) for word2 in s2] for word1 in s1]
    result_df = pd.DataFrame(result_list)
    result_df.columns = s2
    result_df.index = s1
    return result_df

def cosine_distance_on_sentences(s1, s2):
    vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    # print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')
    return cosine

def heat_map_matrix_between_two_sentences(s1,s2):
    df = calculate_heat_matrix_for_two_sentences(s1,s2)
    fig, ax = plt.subplots(figsize=(5,5)) 
    ax_blue = sns.heatmap(df, cmap="YlGnBu")
    # ax_red = sns.heatmap(df)
    print(cosine_distance_wordembedding_method(s1, s2))
    return ax_blue

def evaluate_cosine_distance_on_dataset(df):
    predicted_similarity = []
    cosine_distance = []
    for i in range(len(df)):
        #print(df.iloc[i, 1])
        #print(type(df.iloc[i, 1]))
        cosine_distance_between_sentences = cosine_distance_on_sentences(df.iloc[i,1], df.iloc[i, 2])
        cosine_distance.append(cosine_distance_between_sentences)
    scaler = MinMaxScaler()
    cosine_distance = np.array(cosine_distance)
    cosine_distance_scaled = scaler.fit_transform(cosine_distance.reshape(-1, 1))
    predicted_similarity = np.multiply(np.array(cosine_distance_scaled), 4) + 1     #rescaling the data into 1 to 5 range (just like the relatedness score)
    return predicted_similarity

In [52]:
# Now we test it on the given original_df dataset

pred_similarity = evaluate_cosine_distance_on_dataset(original_df)
print(pred_similarity)

[[1.05526665]
 [1.14133014]
 [1.15881694]
 ...
 [2.8044616 ]
 [1.9937419 ]
 [2.75121888]]


In [55]:
pred_similarity_df = pd.DataFrame(pred_similarity)
pred_similarity_df.describe()

# Look at the goddamn mess, seems like I did a terrible job at scaling. Will have to change that.

Unnamed: 0,0
count,9840.0
mean,1.434124
std,0.477848
min,1.0
25%,1.085031
50%,1.289991
75%,1.614767
max,5.0


In [None]:
# Now we evaluate the results by calculating the MSE of the predicted similarity against, the actual similarity

# 2. Using Smooth Inverse Frequency on Word Embeddings