In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import warnings
warnings.filterwarnings("ignore")
def tokenize(text):
    # Tokenize the text into words, removing punctuation and converting to lowercase
    return Counter(text.translate(str.maketrans('', '', string.punctuation)).lower().split())

def lexical_similarity(text1, text2):
    # Calculate lexical similarity based on common tokens
    tokens1 = tokenize(text1)
    tokens2 = tokenize(text2)
    common_tokens = tokens1 & tokens2
    total_tokens = sum(tokens1.values()) + sum(tokens2.values())
    return sum(common_tokens.values()) * 2 / total_tokens if total_tokens > 0 else 0

def batch_cosine_similarity(texts1, texts2, batch_size=100):
    similarities = []
    vectorizer = CountVectorizer()

    for i in range(0, len(texts1), batch_size):
        # Prepare batch data
        end = i + batch_size
        batch_texts1 = texts1[i:end]
        batch_texts2 = texts2[i:end]

        # Vectorize the batch texts
        vectorizer.fit(batch_texts1 + batch_texts2)
        vectors1 = vectorizer.transform(batch_texts1)
        vectors2 = vectorizer.transform(batch_texts2)

        # Calculate cosine similarity for each pair in the batch
        batch_similarities = [cosine_similarity(vectors1[j], vectors2[j])[0,0] 
                              for j in range(len(batch_texts1))]
        similarities.extend(batch_similarities)

    return np.mean(similarities)

### EN-FR

In [2]:
file_path_en = 'x-final/en/train.tsv'
file_path_fr = 'x-final/fr/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_fr = pd.read_csv(file_path_fr, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_fr, on="id", suffixes=('_en', '_fr'))

english_texts1 = merged_df['sentence1_en'].astype(str)
french_texts1 = merged_df['sentence1_fr'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)

similarities1 = [lexical_similarity(en, fr) for en, fr in zip(english_texts1, french_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, fr) for en, fr in zip(english_texts2, french_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, french_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, french_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


(0.2539846592405373, 0.24628111994507534)

### EN-DE

In [3]:
file_path_en = 'x-final/en/train.tsv'
file_path_de = 'x-final/de/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_de = pd.read_csv(file_path_de, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_de, on="id", suffixes=('_en', '_de'))

english_texts1 = merged_df['sentence1_en'].astype(str)
german_texts1 = merged_df['sentence1_de'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)


similarities1 = [lexical_similarity(en, de) for en, de in zip(english_texts1, german_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, de) for en, de in zip(english_texts2, german_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, german_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, german_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


(0.304132061518894, 0.3075392320920287)

### EN-ES

In [4]:
file_path_en = 'x-final/en/train.tsv'
file_path_es = 'x-final/es/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_es = pd.read_csv(file_path_es, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_es, on="id", suffixes=('_en', '_es'))

english_texts1 = merged_df['sentence1_en'].astype(str)
spanish_texts1 = merged_df['sentence1_es'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)

similarities1 = [lexical_similarity(en, es) for en, es in zip(english_texts1, spanish_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, es) for en, es in zip(english_texts2, spanish_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, spanish_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, spanish_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


(0.25060848073609454, 0.22769727905546902)

### EN-KO

In [5]:
file_path_en = 'x-final/en/train.tsv'
file_path_ko = 'x-final/ko/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_ko, on="id", suffixes=('_en', '_ko'))

english_texts1 = merged_df['sentence1_en'].astype(str)
korea_texts1 = merged_df['sentence1_ko'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
korea_texts2 = merged_df['sentence2_ko'].astype(str)

similarities1 = [lexical_similarity(en, ko) for en, ko in zip(english_texts1, korea_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, ko) for en, ko in zip(english_texts2, korea_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, korea_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, korea_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'
b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7

(0.14748268396707015, 0.17332460796386406)

### EN-JA

In [6]:
file_path_en = 'x-final/en/train.tsv'
file_path_zh = 'x-final/ja/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


(0.029509598846015464, 0.07059264691219824)

### EN-ZH

In [7]:
file_path_en = 'x-final/en/train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


(0.02570235768541026, 0.0718408346426059)

### FR-ES

In [8]:
file_path_en = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/es/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_en, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.3215430262362947, 0.36690604375245384)

### FR-DE

In [9]:
file_path_ko = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/de/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.23260903113208703, 0.23027652423945655)

### FR-KO

In [10]:
file_path_ko = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/ko/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


(0.13232842953299934, 0.15639689026785789)

### FR-JA

In [11]:
file_path_ko = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/ja/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.030243125413892544, 0.06359879130102243)

### FR-ZH

In [12]:
file_path_ko = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.028641939401856765, 0.06670960344385476)

### ES-DE

In [13]:
file_path_ko = 'x-final/es/translated_train.tsv'
file_path_zh = 'x-final/de/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.23723702958357934, 0.2189863065578936)

### ES-KO

In [14]:
file_path_ko = 'x-final/es/translated_train.tsv'
file_path_zh = 'x-final/ko/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


(0.13887979672407072, 0.153573187334758)

### ES-JA

In [15]:
file_path_ko = 'x-final/es/translated_train.tsv'
file_path_zh = 'x-final/ja/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.031267890300259615, 0.06329803931661071)

### ES-ZH

In [16]:
file_path_ko = 'x-final/es/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.028959108854353287, 0.06662804393296569)

### DE-KO

In [17]:
file_path_ko = 'x-final/de/translated_train.tsv'
file_path_zh = 'x-final/ko/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


(0.14817754017697132, 0.172754893241511)

### DE-JA

In [18]:
file_path_ko = 'x-final/de/translated_train.tsv'
file_path_zh = 'x-final/ja/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.03487844971891019, 0.06920324881816498)

### DE-ZH

In [19]:
file_path_ko = 'x-final/de/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.03135730632234393, 0.07151116756042089)

### KO-JA

In [20]:
file_path_ko = 'x-final/ko/translated_train.tsv'
file_path_zh = 'x-final/ja/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


(0.035380390068386745, 0.07249324263925604)

### KO-ZH

In [21]:
file_path_ko = 'x-final/ko/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


(0.032682018875849025, 0.06931812695606496)

### JA-ZH

In [22]:
file_path_ko = 'x-final/ja/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', error_bad_lines=False)
df_zh = pd.read_csv(file_path_zh, delimiter='\t', error_bad_lines=False)

merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

similarities1 = [lexical_similarity(en, zh) for en, zh in zip(english_texts1, chinese_texts1)]
average_similarity1 = sum(similarities1) / len(similarities1) if similarities1 else 0
similarities2 = [lexical_similarity(en, zh) for en, zh in zip(english_texts2, chinese_texts2)]
average_similarity2 = sum(similarities2) / len(similarities2) if similarities2 else 0

average_lexical_similarity = (average_similarity1+average_similarity2)/2

cos_similarities1 = batch_cosine_similarity(english_texts1, chinese_texts1)
cos_similarities2 = batch_cosine_similarity(english_texts2, chinese_texts2)

average_cosine_similarity = (cos_similarities1 + cos_similarities2) / 2

average_lexical_similarity, average_cosine_similarity

(0.057943033961526766, 0.08756656872188451)