This is a quick code starter to evaluate sentence similarity

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk import sent_tokenize
import random
from scipy import stats
from scipy.spatial import distance
import re
import spacy;
nlp = spacy.load('en_core_web_lg')

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Load the Evaluation Dataset

In [3]:
df = pd.read_csv("data/sts.csv",delimiter="\t",error_bad_lines=False).dropna().sample(frac=1).reset_index(drop=True)
df = df[df.Score >= 0];len(df)

FileNotFoundError: File b'data/sts.csv' does not exist

In [None]:
train_test_split = int(.7*len(df))
train = df[:train_test_split]
test = df[train_test_split:]
len(train),len(test)

In [None]:
test.head()

In [None]:
random.choice(test[["Sent1","Sent2","Score"]].values)

In [None]:
def mse(y_pred,y):
    return "Average Loss: {}".format(((y - y_pred) ** 2).mean(axis=None))

def corr(y_pred,y):
    pearson_correlation = stats.pearsonr(y_pred,y)
    return "Correlation To Human Scoring: {}".format(pearson_correlation[0])

# Test 1

Our initial benchmark can be assigning 3s to all the pairs of sentence. 

In [None]:
def similarity1(sent1,sent2):
    # returns random values around 3
    return 3 + (np.random.rand() - np.random.rand())/2

In [None]:
test["pred"] = test.apply(lambda row: similarity1(row["Sent1"],row["Sent2"]),axis=1)

In [None]:
test.head()

In [None]:
mse(test.pred.values,test.Score.values)

In [None]:
corr(test.pred.values,test.Score.values)

# Test 2 - Embeddings Averaging Sentence

In [None]:
%%time
sent1Embeddings = test["Sent1"].apply(lambda s: nlp(s).vector).values
sent2Embeddings = test["Sent2"].apply(lambda s: nlp(s).vector).values

In [None]:
cosine_distance = [distance.cosine(sent1Embeddings[i],sent2Embeddings[i]) for i in range(len(test))]

In [None]:
test["dist"] = cosine_distance
minv = test.dist.min()
maxv = test.dist.max()

In [None]:
test["pred"] = test.dist.apply(lambda val: 5 - 5*(val-minv)/(maxv-minv))

In [None]:
random.choice(test[["Score","pred"]].values)

In [None]:
mse(test.pred.values,test.Score.values),corr(test.pred.values,test.Score.values)

# Test 2 - Universal Sentence Encoding

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
embed = hub.Module(module_url)

In [None]:
%%time
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    sent1Embeddings = session.run(embed(test.Sent1.values))
    sent2Embeddings = session.run(embed(test.Sent2.values))
    
    print("Sent Embeddings Done!")

In [None]:
cosine_distance = [distance.cosine(sent1Embeddings[i],sent2Embeddings[i]) for i in range(len(test))]

In [None]:
test["dist"] = cosine_distance

In [None]:
minv = test.dist.min()
maxv = test.dist.max()

In [None]:
test["pred"] = test.dist.apply(lambda val: 5 - 5*(val-minv)/(maxv-minv))

In [None]:
random.choice(test[["Score","pred"]].values)

In [None]:
mse(test.pred.values,test.Score.values),corr(test.pred.values,test.Score.values)

# Test 3

POS and Dependency parsing + Theme embedding?

## Define Helpers

In [None]:
class ENFRTokenizer():
    def __init__(self):
        self.re_apos = re.compile(r"(\w)'s\b")         # make 's a separate word
        self.re_mw_punc = re.compile(r"(\w[’'])(\w)")  # other ' in a word creates 2 words
        self.re_punc = re.compile("([\"().,;:/_?!—])") # add spaces around punctuation
        self.re_mult_space = re.compile(r"  *")        # replace multiple spaces with just one

    def tokenize(self,sent):
        sent = self.re_apos.sub(r"\1 's", sent)
        sent = self.re_mw_punc.sub(r"\1 \2", sent)
        sent = self.re_punc.sub(r" \1 ", sent).replace('-', ' ')
        sent = self.re_mult_space.sub(' ', sent)
        return sent.lower().split()


In [None]:
tokenizer = ENFRTokenizer()

In [None]:
sent = random.choice(df.Sent1.values)
print(tokenizer.tokenize(sent))

In [None]:
doc = nlp(sent)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

**We care about:** 
- token.pos_
- token.dep_
- (token.tag_)*


In [None]:
idx_to_pos = []
idx_to_dep = []

dep_to_idx = {idx_to_dep[idx]:idx for idx in range(len(idx_to_dep))}
pos_to_idx = {idx_to_pos[idx]:idx for idx in range(len(idx_to_pos))}

In [None]:
train["Sent1POS"] = None
test["Sent1POS"] = None

train["Sent2POS"] = None
test["Sent2POS"] = None

train["Sent1DEP"] = None
test["Sent1DEP"] = None

train["Sent2DEP"] = None
test["Sent2DEP"] = None

Let's generate our POS and DEP sequences for our sentences.

In [None]:
def augment_df_with_tags(df):
    
    MAX_POS_LENGHT = 0
    MAX_DEP_LENGHT = 0
    
    for i,row in df.iterrows():
        doc = nlp(row["Sent1"])
        pos_list = []
        dep_list = []
        for token in doc:

            if token.pos_ not in pos_to_idx:
                pos_to_idx[token.pos_] = len(idx_to_pos)
                idx_to_pos.append(token.pos_)

            pos_list.append(pos_to_idx[token.pos_])

            if token.dep_ not in dep_to_idx:    
                dep_to_idx[token.dep_] = len(idx_to_dep)
                idx_to_dep.append(token.dep_)

            dep_list.append(dep_to_idx[token.dep_])

        if len(pos_list) > MAX_POS_LENGHT: MAX_POS_LENGHT = len(pos_list)
        if len(dep_list) > MAX_DEP_LENGHT: MAX_DEP_LENGHT = len(dep_list)

        df.at[i,"Sent1POS"] = pos_list
        df.at[i,"Sent1DEP"] = dep_list

        doc = nlp(row["Sent2"])
        pos_list = []
        dep_list = []

        for token in doc:
            if token.pos_ not in pos_to_idx:
                pos_to_idx[token.pos_] = len(idx_to_pos)
                idx_to_pos.append(token.pos_)

            pos_list.append(pos_to_idx[token.pos_])

            if token.dep_ not in dep_to_idx:    
                dep_to_idx[token.dep_] = len(idx_to_dep)
                idx_to_dep.append(token.dep_)

            dep_list.append(dep_to_idx[token.dep_])

        if len(pos_list) > MAX_POS_LENGHT: MAX_POS_LENGHT = len(pos_list)
        if len(dep_list) > MAX_DEP_LENGHT: MAX_DEP_LENGHT = len(dep_list)

        df.at[i,"Sent2POS"] = pos_list
        df.at[i,"Sent2DEP"] = dep_list   
        
    return MAX_POS_LENGHT,MAX_DEP_LENGHT
        

In [None]:
%%time
MAX_POS_LENGHT,MAX_DEP_LENGHT = augment_df_with_tags(train)

In [None]:
%time augment_df_with_tags(test)

In [86]:
MAX_POS_LENGHT,MAX_DEP_LENGHT

(212, 212)

## Let's Prepare Our Inputs

In [107]:
import torch
import torch.nn
from keras.utils import to_categorical
from keras.preprocessing import sequence

array([[1, 0],
       [1, 2]], dtype=int32)

In [105]:
len(idx_to_pos),len(idx_to_dep)

(16, 46)

In [137]:
MAX_LEN = 100

In [141]:
x1_pos = [[to_categorical(p,len(idx_to_pos)) for p in pos] for pos in train.Sent1POS]
x1_pos = sequence.pad_sequences(x1_pos,maxlen=MAX_LEN,padding='post')
x1_pos = x1_pos.reshape((x1_pos.shape[0],x1_pos.shape[1],x1_pos.shape[-1]))

x2_pos = [[to_categorical(p,len(idx_to_pos)) for p in pos] for pos in train.Sent2POS]
x2_pos = sequence.pad_sequences(x2_pos,maxlen=MAX_LEN,padding='post')
x2_pos = x2_pos.reshape((x2_pos.shape[0],x2_pos.shape[1],x2_pos.shape[-1]))

In [142]:
x1_pos.shape,x2_pos.shape

((10258, 100, 16), (10258, 100, 16))

In [140]:
x1_dep = [[to_categorical(d,len(idx_to_dep)) for d in dep] for dep in train.Sent1DEP]
x1_dep = sequence.pad_sequences(x1_dep,maxlen=MAX_LEN,padding='post')
x1_dep = x1_dep.reshape((x1_dep.shape[0],x1_dep.shape[1],x1_dep.shape[-1]))

x2_dep = [[to_categorical(d,len(idx_to_dep)) for d in dep] for dep in train.Sent2DEP]
x2_dep = sequence.pad_sequences(x2_dep,maxlen=MAX_LEN,padding='post')
x2_dep = x2_dep.reshape((x2_dep.shape[0],x2_dep.shape[1],x2_dep.shape[-1]))

In [143]:
x1_dep.shape,x2_dep.shape

((10258, 100, 46), (10258, 100, 46))

## Let's Define Our Network

In [4]:
import torch.nn as nn


class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        
        self.gru = nn.GRU(46, hidden_size=20, num_layers=1)

        self.fc1 = nn.Sequential(
            nn.Linear(50 * 4 * 4, 500),
            nn.ReLU(inplace=True),
            nn.Linear(500, 10),
            nn.Linear(10, 2))

    def forward_once(self, x):
        output = self.gru(x)
        print(output.size()[0])
        output = output.view(output.size()[0], -1)
        output = self.fc1(output)
        return output

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        return output1, output2

In [5]:
class ContrastiveLoss(torch.nn.Module):
    """
    Contrastive loss function.
    Based on:
    """

    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def check_type_forward(self, in_types):
        assert len(in_types) == 3

        x0_type, x1_type, y_type = in_types
        assert x0_type.size() == x1_type.shape
        assert x1_type.size()[0] == y_type.shape[0]
        assert x1_type.size()[0] > 0
        assert x0_type.dim() == 2
        assert x1_type.dim() == 2
        assert y_type.dim() == 1

    def forward(self, x0, x1, y):
        self.check_type_forward((x0, x1, y))

        # euclidian distance
        diff = x0 - x1
        dist_sq = torch.sum(torch.pow(diff, 2), 1)
        dist = torch.sqrt(dist_sq)

        mdist = self.margin - dist
        dist = torch.clamp(mdist, min=0.0)
        loss = y * dist_sq + (1 - y) * torch.pow(dist, 2)
        loss = torch.sum(loss) / 2.0 / x0.size()[0]
        return loss

NameError: name 'torch' is not defined

In [6]:
net = SiameseNetwork()

In [7]:
x1_dep[0].shape

NameError: name 'x1_dep' is not defined

In [8]:
net.forward_once(torch.tensor([x1_dep[0]]).float())

NameError: name 'torch' is not defined