In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources the helpers below rely on: the perceptron POS
# tagger model and the punkt tokenizer data. 'stopwords' is downloaded as well,
# although nothing in the visible cells uses it -- presumably safe to drop; confirm.
for nltk_resource in ('averaged_perceptron_tagger', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

def pos_tag_sentence(sentence):
    """Return the sequence of POS tags for the alphabetic tokens of ``sentence``.

    The sentence is split with ``word_tokenize``; every token for which
    ``str.isalpha()`` is false (punctuation, numbers, mixed tokens) is dropped
    before tagging. Only the tags are returned, in token order -- the words
    themselves are discarded.

    NOTE(review): ``word_tokenize`` and ``pos_tag`` are called with their
    default settings, which are presumably English-specific. This notebook
    also feeds French, German, Spanish, Korean, Japanese and Chinese text
    through this function -- confirm the tags are meaningful for those inputs.
    """
    alphabetic_tokens = []
    for token in word_tokenize(sentence):
        # Keep purely alphabetic tokens only.
        if token.isalpha():
            alphabetic_tokens.append(token)
    tagged_tokens = pos_tag(alphabetic_tokens)
    return [pos for _, pos in tagged_tokens]

def syntactic_similarity(tags1, tags2):
    """Jaccard similarity between the sets of distinct tags in two sequences.

    Tag order and repetition are ignored: only which tags occur matters.

    Args:
        tags1: Iterable of hashable tags (e.g. the output of ``pos_tag_sentence``).
        tags2: Iterable of hashable tags to compare against ``tags1``.

    Returns:
        ``len(A & B) / len(A | B)`` as a float in ``[0, 1]``, where ``A`` and
        ``B`` are the sets of distinct tags; ``0.0`` when both inputs are empty.
    """
    # Build each set once and take a real set union instead of concatenating
    # the inputs, so lists, tuples, sets and iterators can be mixed freely.
    unique1 = set(tags1)
    unique2 = set(tags2)
    all_tags = unique1 | unique2
    if not all_tags:
        return 0.0
    return len(unique1 & unique2) / len(all_tags)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ruohan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/ruohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ruohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### EN-FR

In [4]:
# English-French pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_fr = 'x-final/fr/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_fr, on="id", suffixes=('_en', '_fr'))

english_texts1 = merged_df['sentence1_en'].astype(str)
french_texts1 = merged_df['sentence1_fr'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(fr))
    for en, fr in zip(english_texts1, french_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(fr))
    for en, fr in zip(english_texts2, french_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


0.3604336670935255

### EN-DE

In [5]:
# English-German pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_de = 'x-final/de/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_de, on="id", suffixes=('_en', '_de'))

english_texts1 = merged_df['sentence1_en'].astype(str)
german_texts1 = merged_df['sentence1_de'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(de))
    for en, de in zip(english_texts1, german_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(de))
    for en, de in zip(english_texts2, german_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


0.3571244274722682

### EN-ES

In [8]:
# English-Spanish pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_es = 'x-final/es/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_es, on="id", suffixes=('_en', '_es'))

english_texts1 = merged_df['sentence1_en'].astype(str)
spanish_texts1 = merged_df['sentence1_es'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(es))
    for en, es in zip(english_texts1, spanish_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(es))
    for en, es in zip(english_texts2, spanish_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


0.37393655614023663

### EN-KO

In [9]:
# English-Korean pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_ko = 'x-final/ko/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_ko, on="id", suffixes=('_en', '_ko'))

english_texts1 = merged_df['sentence1_en'].astype(str)
korea_texts1 = merged_df['sentence1_ko'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
korea_texts2 = merged_df['sentence2_ko'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(ko))
    for en, ko in zip(english_texts1, korea_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(ko))
    for en, ko in zip(english_texts2, korea_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'
b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7

0.2628014007472034

### EN-JA

In [10]:
# English-Japanese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_ja = 'x-final/ja/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_ja, on="id", suffixes=('_en', '_ja'))

english_texts1 = merged_df['sentence1_en'].astype(str)
japanese_texts1 = merged_df['sentence1_ja'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(ja))
    for en, ja in zip(english_texts1, japanese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(ja))
    for en, ja in zip(english_texts2, japanese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


0.039389559075448645

### EN-ZH

In [11]:
# English-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_en = 'x-final/en/train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_en = pd.read_csv(file_path_en, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_en, df_zh, on="id", suffixes=('_en', '_zh'))

english_texts1 = merged_df['sentence1_en'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
english_texts2 = merged_df['sentence2_en'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(zh))
    for en, zh in zip(english_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(en), pos_tag_sentence(zh))
    for en, zh in zip(english_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 1810: expected 4 fields, saw 5\nSkipping line 2161: expected 4 fields, saw 5\nSkipping line 4531: expected 4 fields, saw 5\nSkipping line 13379: expected 4 fields, saw 5\nSkipping line 15111: expected 4 fields, saw 5\nSkipping line 15457: expected 4 fields, saw 5\nSkipping line 17265: expected 4 fields, saw 5\nSkipping line 17986: expected 4 fields, saw 5\nSkipping line 18421: expected 4 fields, saw 5\nSkipping line 29019: expected 4 fields, saw 5\nSkipping line 36181: expected 4 fields, saw 5\nSkipping line 41179: expected 4 fields, saw 5\nSkipping line 43631: expected 4 fields, saw 5\nSkipping line 45641: expected 4 fields, saw 5\nSkipping line 46177: expected 4 fields, saw 5\nSkipping line 47418: expected 4 fields, saw 5\n'


0.0524902702704794

### FR-ES

In [12]:
# French-Spanish pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_fr = 'x-final/fr/translated_train.tsv'
file_path_es = 'x-final/es/translated_train.tsv'
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_fr, df_es, on="id", suffixes=('_fr', '_es'))

french_texts1 = merged_df['sentence1_fr'].astype(str)
spanish_texts1 = merged_df['sentence1_es'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(es))
    for fr, es in zip(french_texts1, spanish_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(es))
    for fr, es in zip(french_texts2, spanish_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.5617415621271628

### FR-DE

In [14]:
# French-German pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_fr = 'x-final/fr/translated_train.tsv'
file_path_de = 'x-final/de/translated_train.tsv'
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_fr, df_de, on="id", suffixes=('_fr', '_de'))

french_texts1 = merged_df['sentence1_fr'].astype(str)
german_texts1 = merged_df['sentence1_de'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(de))
    for fr, de in zip(french_texts1, german_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(de))
    for fr, de in zip(french_texts2, german_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.4436560749674848

### FR-KO

In [15]:
# French-Korean pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_fr = 'x-final/fr/translated_train.tsv'
file_path_ko = 'x-final/ko/translated_train.tsv'
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_fr, df_ko, on="id", suffixes=('_fr', '_ko'))

french_texts1 = merged_df['sentence1_fr'].astype(str)
korean_texts1 = merged_df['sentence1_ko'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)
korean_texts2 = merged_df['sentence2_ko'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(ko))
    for fr, ko in zip(french_texts1, korean_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(ko))
    for fr, ko in zip(french_texts2, korean_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


0.39805873924419033

### FR-JA

In [16]:
# French-Japanese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_fr = 'x-final/fr/translated_train.tsv'
file_path_ja = 'x-final/ja/translated_train.tsv'
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_fr, df_ja, on="id", suffixes=('_fr', '_ja'))

french_texts1 = merged_df['sentence1_fr'].astype(str)
japanese_texts1 = merged_df['sentence1_ja'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(ja))
    for fr, ja in zip(french_texts1, japanese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(ja))
    for fr, ja in zip(french_texts2, japanese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.053824086953983726

### FR-ZH

In [17]:
# French-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_fr = 'x-final/fr/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_fr = pd.read_csv(file_path_fr, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_fr, df_zh, on="id", suffixes=('_fr', '_zh'))

french_texts1 = merged_df['sentence1_fr'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
french_texts2 = merged_df['sentence2_fr'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(zh))
    for fr, zh in zip(french_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(fr), pos_tag_sentence(zh))
    for fr, zh in zip(french_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.07380704877061381

### ES-DE

In [18]:
# Spanish-German pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_es = 'x-final/es/translated_train.tsv'
file_path_de = 'x-final/de/translated_train.tsv'
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_es, df_de, on="id", suffixes=('_es', '_de'))

spanish_texts1 = merged_df['sentence1_es'].astype(str)
german_texts1 = merged_df['sentence1_de'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(de))
    for es, de in zip(spanish_texts1, german_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(de))
    for es, de in zip(spanish_texts2, german_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.4760827192765159

### ES-KO

In [19]:
# Spanish-Korean pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_es = 'x-final/es/translated_train.tsv'
file_path_ko = 'x-final/ko/translated_train.tsv'
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_es, df_ko, on="id", suffixes=('_es', '_ko'))

spanish_texts1 = merged_df['sentence1_es'].astype(str)
korean_texts1 = merged_df['sentence1_ko'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)
korean_texts2 = merged_df['sentence2_ko'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(ko))
    for es, ko in zip(spanish_texts1, korean_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(ko))
    for es, ko in zip(spanish_texts2, korean_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


0.44104245255853536

### ES-JA

In [20]:
# Spanish-Japanese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_es = 'x-final/es/translated_train.tsv'
file_path_ja = 'x-final/ja/translated_train.tsv'
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_es, df_ja, on="id", suffixes=('_es', '_ja'))

spanish_texts1 = merged_df['sentence1_es'].astype(str)
japanese_texts1 = merged_df['sentence1_ja'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(ja))
    for es, ja in zip(spanish_texts1, japanese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(ja))
    for es, ja in zip(spanish_texts2, japanese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.06167180805851999

### ES-ZH

In [21]:
# Spanish-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_es = 'x-final/es/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_es = pd.read_csv(file_path_es, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_es, df_zh, on="id", suffixes=('_es', '_zh'))

spanish_texts1 = merged_df['sentence1_es'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
spanish_texts2 = merged_df['sentence2_es'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(zh))
    for es, zh in zip(spanish_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(es), pos_tag_sentence(zh))
    for es, zh in zip(spanish_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.0843166670624586

### DE-KO

In [22]:
# German-Korean pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_de = 'x-final/de/translated_train.tsv'
file_path_ko = 'x-final/ko/translated_train.tsv'
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_de, df_ko, on="id", suffixes=('_de', '_ko'))

german_texts1 = merged_df['sentence1_de'].astype(str)
korean_texts1 = merged_df['sentence1_ko'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)
korean_texts2 = merged_df['sentence2_ko'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(ko))
    for de, ko in zip(german_texts1, korean_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(ko))
    for de, ko in zip(german_texts2, korean_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


0.520982067215189

### DE-JA

In [23]:
# German-Japanese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_de = 'x-final/de/translated_train.tsv'
file_path_ja = 'x-final/ja/translated_train.tsv'
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_de, df_ja, on="id", suffixes=('_de', '_ja'))

german_texts1 = merged_df['sentence1_de'].astype(str)
japanese_texts1 = merged_df['sentence1_ja'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(ja))
    for de, ja in zip(german_texts1, japanese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(ja))
    for de, ja in zip(german_texts2, japanese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.07377742464607281

### DE-ZH

In [24]:
# German-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_de = 'x-final/de/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_de = pd.read_csv(file_path_de, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_de, df_zh, on="id", suffixes=('_de', '_zh'))

german_texts1 = merged_df['sentence1_de'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
german_texts2 = merged_df['sentence2_de'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(zh))
    for de, zh in zip(german_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(de), pos_tag_sentence(zh))
    for de, zh in zip(german_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.09969023494216837

### KO-JA

In [25]:
# Korean-Japanese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_ko = 'x-final/ko/translated_train.tsv'
file_path_ja = 'x-final/ja/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_ko, df_ja, on="id", suffixes=('_ko', '_ja'))

korean_texts1 = merged_df['sentence1_ko'].astype(str)
japanese_texts1 = merged_df['sentence1_ja'].astype(str)
korean_texts2 = merged_df['sentence2_ko'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(ko), pos_tag_sentence(ja))
    for ko, ja in zip(korean_texts1, japanese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(ko), pos_tag_sentence(ja))
    for ko, ja in zip(korean_texts2, japanese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


0.11707716900507242

### KO-ZH

In [26]:
# Korean-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_ko = 'x-final/ko/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ko = pd.read_csv(file_path_ko, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_ko, df_zh, on="id", suffixes=('_ko', '_zh'))

korean_texts1 = merged_df['sentence1_ko'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
korean_texts2 = merged_df['sentence2_ko'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(ko), pos_tag_sentence(zh))
    for ko, zh in zip(korean_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(ko), pos_tag_sentence(zh))
    for ko, zh in zip(korean_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

b'Skipping line 1250: expected 4 fields, saw 5\nSkipping line 1680: expected 4 fields, saw 5\nSkipping line 2165: expected 4 fields, saw 5\nSkipping line 7642: expected 4 fields, saw 5\nSkipping line 15126: expected 4 fields, saw 5\n'


0.15546188491522395

### JA-ZH

In [27]:
# Japanese-Chinese pair. Malformed rows are skipped with a warning;
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which pandas 2.0 removed.
file_path_ja = 'x-final/ja/translated_train.tsv'
file_path_zh = 'x-final/zh/translated_train.tsv'
df_ja = pd.read_csv(file_path_ja, delimiter='\t', on_bad_lines='warn')
df_zh = pd.read_csv(file_path_zh, delimiter='\t', on_bad_lines='warn')

# Join on the shared "id" column; overlapping columns get a language suffix.
merged_df = pd.merge(df_ja, df_zh, on="id", suffixes=('_ja', '_zh'))

japanese_texts1 = merged_df['sentence1_ja'].astype(str)
chinese_texts1 = merged_df['sentence1_zh'].astype(str)
japanese_texts2 = merged_df['sentence2_ja'].astype(str)
chinese_texts2 = merged_df['sentence2_zh'].astype(str)

# Tag-set similarity for every aligned row, one list per sentence column.
syntactic_similarities1 = [
    syntactic_similarity(pos_tag_sentence(ja), pos_tag_sentence(zh))
    for ja, zh in zip(japanese_texts1, chinese_texts1)
]
syntactic_similarities2 = [
    syntactic_similarity(pos_tag_sentence(ja), pos_tag_sentence(zh))
    for ja, zh in zip(japanese_texts2, chinese_texts2)
]

# Calculate average syntactic similarity (0 when the merge produced no rows)
average_syntactic_similarity1 = sum(syntactic_similarities1) / len(syntactic_similarities1) if syntactic_similarities1 else 0
average_syntactic_similarity2 = sum(syntactic_similarities2) / len(syntactic_similarities2) if syntactic_similarities2 else 0

average_syntactic_similarity = (average_syntactic_similarity1 + average_syntactic_similarity2) / 2

average_syntactic_similarity

0.15707728588490047