In [1]:
from zipfile import ZipFile
import pandas as pd
import csv

path = '../data/ru_translated.zip'
# Open the archive and the CSV member with context managers so both handles
# are closed as soon as the frame has been read, instead of staying open for
# the lifetime of the kernel.
with ZipFile(path) as zf, zf.open('ru_translated.csv') as csv_member:
    translated = pd.read_csv(csv_member, index_col=0, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [4]:
import pandas as pd
import numpy as np
import string
import itertools
import spacy

def clean_text(sentence):
    """Return ``sentence`` with every ASCII punctuation character removed.

    Characters listed in ``string.punctuation`` are dropped; everything else
    (letters, digits, whitespace) is kept in its original order.
    """
    kept_chars = []
    for char in sentence:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def cosine_similarity_calc(vec_1,vec_2):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    vec_1, vec_2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(vec_1, vec_2) / (||vec_1|| * ||vec_2||)``. When either vector
        has zero norm the cosine is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    # A zero-norm input (e.g. an all-zero embedding, as a model may produce
    # for empty or unknown text) has no direction, so treat it as "no similarity".
    if norm_product == 0:
        return 0.0
    return np.dot(vec_1, vec_2) / norm_product

# Loaded spaCy pipelines keyed by model name, so repeated calls in the same
# kernel do not reload the model from disk.
_SPACY_MODELS = {}


def _load_embeddings(model_name):
    """Return the spaCy pipeline ``model_name``, loading it at most once."""
    if model_name not in _SPACY_MODELS:
        _SPACY_MODELS[model_name] = spacy.load(model_name)
    return _SPACY_MODELS[model_name]


def embeddings_similarity(sentences, model_name='en_core_web_lg'):
    """Compute pairwise cosine similarity between sentence embeddings.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to compare. Every unordered pair of distinct sentences is
        scored once.
    model_name : str, optional
        Name of the spaCy pipeline used to embed the sentences. Defaults to
        ``'en_core_web_lg'``, the model the notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``sentence_a``, ``sentence_b`` and ``similarity``; one row per
        unique pair. Empty (but with all three columns) when fewer than two
        distinct sentences are supplied.
    """
    # first we need to get data into | sentence_a | sentence_b | format
    pair_list = list(itertools.combinations(sentences, 2))
    sentence_pairs_df = pd.DataFrame({
        'sentence_a': [first for first, _ in pair_list],
        'sentence_b': [second for _, second in pair_list],
    })

    if not sentence_pairs_df.empty:
        # get unique combinations of sentence_a and sentence_b: sort each row
        # so (a, b) and (b, a) compare equal, then keep the first occurrence
        canonical_pairs = pd.DataFrame(
            np.sort(sentence_pairs_df[['sentence_a', 'sentence_b']].to_numpy(), axis=1),
            index=sentence_pairs_df.index
        )
        sentence_pairs_df = sentence_pairs_df.loc[~canonical_pairs.duplicated(keep='first')]

        # remove instances where sentence a == sentence b
        sentence_pairs_df = sentence_pairs_df[
            sentence_pairs_df['sentence_a'] != sentence_pairs_df['sentence_b']
        ]

    # work on an explicit copy so adding a column never writes into a slice
    sentence_pairs_df = sentence_pairs_df.copy()

    # nothing to score: return the expected schema without loading the model
    if sentence_pairs_df.empty:
        sentence_pairs_df['similarity'] = pd.Series(dtype=float)
        return sentence_pairs_df

    # load word embeddings (will use these to convert sentence to vectors)
    embeddings = _load_embeddings(model_name)

    # embed each distinct sentence exactly once, however many pairs it is in
    distinct_sentences = dict.fromkeys(
        itertools.chain(sentence_pairs_df['sentence_a'], sentence_pairs_df['sentence_b'])
    )
    sentence_vectors = {
        sentence: embeddings(clean_text(sentence)).vector
        for sentence in distinct_sentences
    }

    # now we are ready to calculate the similarity
    sentence_pairs_df['similarity'] = pd.Series(
        [
            cosine_similarity_calc(sentence_vectors[first], sentence_vectors[second])
            for first, second in zip(sentence_pairs_df['sentence_a'], sentence_pairs_df['sentence_b'])
        ],
        index=sentence_pairs_df.index,
        dtype=float,
    )

    return sentence_pairs_df

# Score a sample sentence against a short list of victory-themed words.
sentences = [
    'All glory to ukrain, russia will lose the war!',
    'Victory, win, victorious',
]
embeddings_similarity(sentences)

Unnamed: 0,sentence_a,sentence_b,similarity
0,"All glory to ukrain, russia will lose the war!","Victory, win, victorious",0.518986


In [5]:
# Score a sample sentence against a short list of defeat-themed words.
sentences = [
    'All glory to ukrain, russia will lose the war!',
    'Loosers, lose, defeat',
]
embeddings_similarity(sentences)

Unnamed: 0,sentence_a,sentence_b,similarity
0,"All glory to ukrain, russia will lose the war!","Loosers, lose, defeat",0.579797


In [8]:
# Score two sentences that talk about the same topic.
sentences = [
    "Ukrain will win the war! The people of ukrain are strong",
    "The Ukrainian people will resist the russian forces",
]
embeddings_similarity(sentences)

Unnamed: 0,sentence_a,sentence_b,similarity
0,Ukrain will win the war! The people of ukrain ...,The Ukrainian people will resist the russian f...,0.852232


In [10]:
# Score a war-related sentence against a deliberately unrelated one.
sentences = [
    "Ukrain will win the war! The people of ukrain are strong",
    "Something unrelated, my favorite animal is a dog",
]
embeddings_similarity(sentences)

Unnamed: 0,sentence_a,sentence_b,similarity
0,Ukrain will win the war! The people of ukrain ...,"Something unrelated, my favorite animal is a dog",0.631618
