In [64]:
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff

In [65]:
bt0 = pd.read_csv("data_subsets/backtranslation_subset0.csv", sep=";")
bt1 = pd.read_csv("data_subsets/backtranslation_subset1.csv", sep=";")
bt2 = pd.read_csv("data_subsets/backtranslation_subset2.csv", sep=";")
pm0 = pd.read_csv("data_subsets/paraphrase_mining_subset0.csv", sep=";")
pm1 = pd.read_csv("data_subsets/paraphrase_mining_subset1.csv", sep=";")
pm2= pd.read_csv("data_subsets/paraphrase_mining_subset2.csv", sep=";")

In [66]:
bt_dfs = [bt0, bt1, bt2]
pm_dfs = [pm0, pm1, pm2]

bt_df = pd.concat(bt_dfs)
pm_df = pd.concat(pm_dfs)


In [67]:
bt_mean = bt_df.mean()
bt_std = bt_df.std()
pm_mean = pm_df.mean()
pm_std = pm_df.std()

print("Back-translation technique:")
print(f"Accuracy: {bt_mean['accuracy']:.2f} +/- {bt_std['accuracy']:.2f}")
print(f"Fluency: {bt_mean['fluency']:.2f} +/- {bt_std['fluency']:.2f}")
print(f"Diversity: {bt_mean['diversity']:.2f} +/- {bt_std['diversity']:.2f}\n")

print("Paraphrase-mining technique:")
print(f"Accuracy: {pm_mean['accuracy']:.2f} +/- {pm_std['accuracy']:.2f}")
print(f"Fluency: {pm_mean['fluency']:.2f} +/- {pm_std['fluency']:.2f}")
print(f"Diversity: {pm_mean['diversity']:.2f} +/- {pm_std['diversity']:.2f}")


Back-translation technique:
Accuracy: 4.29 +/- 0.93
Fluency: 4.28 +/- 0.85
Diversity: 3.31 +/- 1.22

Paraphrase-mining technique:
Accuracy: 4.12 +/- 1.10
Fluency: 4.17 +/- 1.05
Diversity: 3.03 +/- 1.25


  bt_mean = bt_df.mean()
  bt_std = bt_df.std()
  pm_mean = pm_df.mean()
  pm_std = pm_df.std()


In [82]:
# Concatenate the ratings from the three annotators into a single DataFrame
bt_all = pd.concat([bt0, bt1, bt2], ignore_index=True)

# Create a contingency table of the ratings
bt_table_acc = pd.crosstab(bt_all['sentence1'], bt_all['accuracy'])
print(f"Back-translation, accuracy, Krippendorff's alpha coefficient: {krippendorff.alpha(bt_table_acc.values)}")

bt_table_flu = pd.crosstab(bt_all['sentence1'], bt_all['fluency'])
print(f"Back-translation, fluency, Krippendorff's alpha coefficient: {krippendorff.alpha(bt_table_flu.values)}")

bt_table_div = pd.crosstab(bt_all['sentence1'], bt_all['diversity'])
print(f"Back-translation, diversirty, Krippendorff's alpha coefficient: {krippendorff.alpha(bt_table_div.values)}")


Back-translation, accuracy, Krippendorff's alpha coefficient: 0.4408430561921084
Back-translation, fluency, Krippendorff's alpha coefficient: 0.357096159428521
Back-translation, diversirty, Krippendorff's alpha coefficient: 0.07051843035670513


In [83]:
# Concatenate the ratings from the three annotators into a single DataFrame
pm_all = pd.concat([pm0, pm1, pm2], ignore_index=True)

# Create a contingency table of the ratings
pm_table_acc = pd.crosstab(pm_all['sentence1'], pm_all['accuracy'])
print(f"Paraphrase-mining, accuracy, Krippendorff's alpha coefficient: {krippendorff.alpha(pm_table_acc.values)}")

pm_table_flu = pd.crosstab(pm_all['sentence1'], pm_all['fluency'])
print(f"Paraphrase-mining, fluency, Krippendorff's alpha coefficient: {krippendorff.alpha(pm_table_flu.values)}")

pm_table_div = pd.crosstab(pm_all['sentence1'], pm_all['diversity'])
print(f"Paraphrase-mining, diversirty, Krippendorff's alpha coefficient: {krippendorff.alpha(pm_table_div.values)}")


Paraphrase-mining, accuracy, Krippendorff's alpha coefficient: 0.34439163702128506
Paraphrase-mining, fluency, Krippendorff's alpha coefficient: 0.42402967922755375
Paraphrase-mining, diversirty, Krippendorff's alpha coefficient: 0.12220452724739062
