In [21]:
import math
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

# Load the ru-en scores table.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer="/Users/cetiners/Desktop/Corpus/ru-en/scores.csv")

In [34]:
## WMD (WORD MOVER'S DISTANCE)

def word_movers_distance(ground_truth, proposed_translation, embedding):
    """Compute the Word Mover's Distance for each reference/translation pair.

    Parameters
    ----------
    ground_truth : iterable
        Reference sentences. Each item is either a string or an already
        tokenized sequence of tokens.
    proposed_translation : iterable
        Candidate translations, aligned position by position with
        ``ground_truth``.
    embedding : object
        Any object exposing ``wmdistance(tokens_a, tokens_b)``
        (presumably a gensim ``KeyedVectors`` model -- confirm with the caller).

    Returns
    -------
    dict
        Maps the 0-based position of each pair to whatever
        ``embedding.wmdistance`` returned for it. Pairing stops at the
        shorter of the two inputs.
    """

    def _tokens(sentence):
        # Iterating a plain string yields single characters, not words, so
        # strings are split on whitespace; pre-tokenized input is kept as is.
        if isinstance(sentence, str):
            return sentence.split()
        return list(sentence)

    # enumerate() supplies the running key; the previous hand-rolled counter
    # used `i =+ 1`, which assigns 1 instead of incrementing.
    return {
        position: embedding.wmdistance(_tokens(tru), _tokens(tra))
        for position, (tru, tra) in enumerate(zip(ground_truth, proposed_translation))
    }

In [12]:
## PURE PYTHON IMPLEMENTATION

def get_vectors(text):
    """Turn ``text`` into a bag-of-words ``Counter``.

    Tokens are the maximal runs of word characters found by the regex;
    matching is case-sensitive and punctuation is dropped.
    """
    return Counter(re.findall(r'\w+', text))

def get_cosine(vec1, vec2):
    """Cosine similarity between two sparse count vectors.

    ``vec1`` and ``vec2`` are mappings from token to count. The result is the
    dot product over the shared keys divided by the product of the two
    Euclidean norms, or ``0.0`` when that product is zero.
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    magnitude = norm1 * norm2

    # Guard against dividing by zero when either vector is empty / all zeros.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0
    
def get_similarities(truth, trans):
    """Score every reference/translation pair with the pure-Python cosine.

    ``truth`` and ``trans`` are read positionally through ``.iloc``; the
    number of pairs is taken from ``len(truth)``. Returns a dict mapping the
    row position to the cosine similarity of the two texts' word counts.
    """
    return {
        row: get_cosine(get_vectors(truth.iloc[row]), get_vectors(trans.iloc[row]))
        for row in range(len(truth))
    }


In [17]:
## SKLEARN IMPLEMENTATION

from sklearn.feature_extraction.text import TfidfVectorizer

# Shared TF-IDF vectorizer; get_cosine_sim re-fits it on every sentence pair.
vectorizer = TfidfVectorizer()

def get_cosine_sim(truth, trans):
    """Score every reference/translation pair using TF-IDF vectors.

    Parameters
    ----------
    truth, trans
        Positionally aligned collections of texts, read through ``.iloc``;
        the number of pairs is taken from ``len(truth)``.

    Returns
    -------
    dict
        Maps the row position to the dot product of the two TF-IDF rows
        (a cosine similarity provided the rows are unit-normalised, which is
        presumably the vectorizer's default -- confirm).
    """
    similarities = {}
    for i in range(len(truth)):
        # The vocabulary and IDF weights are fitted on just this pair.
        tfidf = vectorizer.fit_transform([truth.iloc[i], trans.iloc[i]])
        # `@` is a matrix product for both sparse matrices and sparse arrays
        # (`*` is element-wise on sparse arrays), and `.toarray()` is the
        # explicit spelling of the `.A` shorthand.
        similarities[i] = (tfidf @ tfidf.T).toarray()[0, 1]
    return similarities

In [None]:
# TF-IDF similarity for every row of the currently loaded scores table.
get_cosine_sim(truth=df["reference"], trans=df["translation"])

In [33]:
## SCIPY IMPLEMENTATION

def cosine_distance_countvectorizer_method(s1, s2):
    """Return the bag-of-words cosine similarity of two texts as a percentage.

    Despite the name, the value returned is a similarity, not a distance:
    ``(1 - cosine_distance) * 100`` rounded to two decimals.

    Parameters
    ----------
    s1, s2 : str
        The two texts to compare. Both are vectorized together so they share
        one vocabulary.

    Returns
    -------
    float
        Similarity in percent, or ``0.0`` when either text contributes no
        tokens to the shared vocabulary (its count vector is all zeros and
        the cosine is undefined). This matches ``get_cosine``, which also
        returns 0.0 for a zero denominator.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from scipy.spatial import distance

    # Densify once; row 0 belongs to s1 and row 1 to s2.
    counts = CountVectorizer().fit_transform([s1, s2]).toarray()
    counts_s1, counts_s2 = counts[0], counts[1]

    # A zero vector would make the cosine distance a 0/0 division (NaN).
    if not counts_s1.any() or not counts_s2.any():
        return 0.0

    cosine = distance.cosine(counts_s1, counts_s2)
    return round((1 - cosine) * 100, 2)

def get_similarities_cvm(truth, trans):
    """Score every reference/translation pair with the CountVectorizer method.

    ``truth`` and ``trans`` are read positionally through ``.iloc``; the
    number of pairs is taken from ``len(truth)``. Returns a dict mapping the
    row position to the value of ``cosine_distance_countvectorizer_method``
    for that pair.
    """
    return {
        row: cosine_distance_countvectorizer_method(truth.iloc[row], trans.iloc[row])
        for row in range(len(truth))
    }

In [36]:
## GET SIMILARITIES FOR ALL

# Root folder scanned for one sub-folder per entry, each expected to hold a scores.csv.
# NOTE(review): the first cell reads from ".../Desktop/Corpus/..." (capital C);
# confirm which spelling is right on case-sensitive filesystems.
corpus_root = "/Users/cetiners/Desktop/corpus"

# Outer key: folder name; value: {row position: similarity in percent}.
similarities_master = {}

for filename in os.listdir(corpus_root):
    # Skip hidden entries (names starting with a dot).
    if filename.startswith("."):
        continue
    print(f"{filename} klasörünü paketliyorum abi'")
    df = pd.read_csv(os.path.join(corpus_root, filename, "scores.csv"))
    similarities_master[filename] = get_similarities_cvm(df["reference"], df["translation"])

# One column per folder; folders with fewer rows are padded with NaN.
final_similarities = pd.DataFrame(similarities_master)

#final_similarities.to_excel("/Users/cetiners/Desktop/cosine_sim.xlsx")

en-fi klasörünü paketliyorum abi'
zh-en klasörünü paketliyorum abi'
cs-en klasörünü paketliyorum abi'
en-zh klasörünü paketliyorum abi'
de-en klasörünü paketliyorum abi'
ru-en klasörünü paketliyorum abi'


In [38]:
# Show the combined table: one column per corpus folder, one row per sentence position.
final_similarities

Unnamed: 0,en-fi,zh-en,cs-en,en-zh,de-en,ru-en
0,52.22,64.89,76.06,0.0,20.70,52.17
1,50.17,76.28,73.03,0.0,75.00,75.79
2,42.16,68.02,76.61,0.0,66.67,50.17
3,58.83,48.04,62.99,50.0,55.34,65.87
4,25.00,47.34,59.67,0.0,78.35,33.81
...,...,...,...,...,...,...
26414,,73.53,,,,
26415,,60.68,,,,
26416,,28.57,,,,
26417,,33.81,,,,
