In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Directory prefix (with trailing slash) that data file names are appended to.
path = "./data/"

# Language codes per embedding model, as they appear in the embedding file names.
model_emb_langs = {
    "mBART": ["ta_IN", "xh_ZA", "vi_VN"],
    "m2m100": ["ta", "xh", "vi"],
}

# Upper bound on the number of rows sliced from each embedding / label array.
limit = 50_000

# Low-resource language translations

In [3]:
# Phase 1: fit a logistic-regression classifier on the embeddings of each
# low-resource-language translation and keep the test-set confusion matrix.
# Every entry of results_lr_ph1 is [model name, language code, confusion matrix].
results_lr_ph1 = []

# Map True/False labels to 1/0 and flatten to a 1-D array.
# The file is resolved through `path`, like the embedding files below.
labels = pd.read_csv(path + "labels.csv").replace([True, False], [1, 0]).to_numpy().ravel()

for model, langs in model_emb_langs.items():
    for sl in langs:
        # Memory-map the embedding file; only the first `limit` rows are used.
        sl_vec = np.load(path + f"emb_{model}_{sl}.npy", mmap_mode="r")

        x_train, x_test, y_train, y_test = train_test_split(
            sl_vec[:limit], labels[:limit], test_size=0.25, random_state=42
        )
        print(x_train.shape, x_test.shape)

        # The saga solver shuffles the data, so pin random_state to make the
        # fit (and therefore the reported metrics) repeatable across runs.
        lr = LogisticRegression(solver="saga", max_iter=1000, random_state=42)
        lr.fit(x_train, y_train)
        pred = lr.predict(x_test)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent from
        # the test split or the predictions, so a later 4-way ravel() unpack
        # cannot fail.
        conf_matrix = confusion_matrix(y_test, pred, labels=[0, 1])
        results_lr_ph1.append([model, sl, conf_matrix])

(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)




(35331, 1024) (11778, 1024)




(35331, 1024) (11778, 1024)




In [4]:
# Phase 2: same classifier as for the translations, but fitted on the
# embeddings of the English back-translations of each language.
# Every entry of results_lr_ph2 is [model name, language code, confusion matrix].
results_lr_ph2 = []

# Map True/False labels to 1/0 and flatten to a 1-D array.
# The file is resolved through `path`, like the embedding files below.
labels = pd.read_csv(path + "labels.csv").replace([True, False], [1, 0]).to_numpy().ravel()

for model, langs in model_emb_langs.items():
    # The English code used in the file name depends only on the model,
    # so it is computed once per model rather than once per language.
    eng_string = "en_XX" if model == "mBART" else "en"

    for sl in langs:
        # Memory-map the embedding file; only the first `limit` rows are used.
        sl_vec = np.load(
            path + f"emb_{model}_{eng_string}_backtranslation-{sl}.npy", mmap_mode="r"
        )

        x_train, x_test, y_train, y_test = train_test_split(
            sl_vec[:limit], labels[:limit], test_size=0.25, random_state=42
        )
        print(x_train.shape, x_test.shape)

        # The saga solver shuffles the data, so pin random_state to make the
        # fit (and therefore the reported metrics) repeatable across runs.
        lr = LogisticRegression(solver="saga", max_iter=1000, random_state=42)
        lr.fit(x_train, y_train)
        pred = lr.predict(x_test)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent from
        # the test split or the predictions, so a later 4-way ravel() unpack
        # cannot fail.
        conf_matrix = confusion_matrix(y_test, pred, labels=[0, 1])
        results_lr_ph2.append([model, sl, conf_matrix])

(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)




(35331, 1024) (11778, 1024)




(35331, 1024) (11778, 1024)




In [6]:
def _precision_recall_f1(conf_matrix):
    """Return (precision, recall, F1) of the positive class.

    ``conf_matrix`` must be a 2x2 confusion matrix whose ``ravel()`` yields
    ``tn, fp, fn, tp``. A metric whose denominator is zero is reported as 0.0
    rather than NaN (plain division on the matrix entries would produce NaN
    and a RuntimeWarning).
    """
    _tn, fp, fn, tp = conf_matrix.ravel()
    precision = tp / (fp + tp) if (fp + tp) else 0.0
    recall = tp / (fn + tp) if (fn + tp) else 0.0
    if recall + precision:
        f1 = (2 * recall * precision) / (recall + precision)
    else:
        f1 = 0.0
    return precision, recall, f1


# Compare, pair by pair, the classifier trained on the translated embeddings
# with the one trained on the back-translated embeddings. Both result lists
# were filled in the same model/language order, so zip() lines them up.
for translated, backtranslated in zip(results_lr_ph1, results_lr_ph2):
    print(f"{translated[0]} -- {translated[1]} | {backtranslated[1]} -> en")

    precision_tr, recall_tr, f1_tr = _precision_recall_f1(translated[2])
    precision_btr, recall_btr, f1_btr = _precision_recall_f1(backtranslated[2])

    print(f"Translated --> P = {precision_tr:0.3f} - R = {recall_tr:0.3f} - F1 = {f1_tr:0.3f}")
    print(f"Backtranslated --> P = {precision_btr:0.3f} - R = {recall_btr:0.3f} - F1 = {f1_btr:0.3f}")
    print("------------")

mBART -- ta_IN | ta_IN -> en
Translated --> P = 0.898 - R = 0.906 - F1 = 0.902
Backtranslated --> P = 0.893 - R = 0.893 - F1 = 0.893
------------
mBART -- xh_ZA | xh_ZA -> en
Translated --> P = 0.848 - R = 0.848 - F1 = 0.848
Backtranslated --> P = 0.863 - R = 0.883 - F1 = 0.873
------------
mBART -- vi_VN | vi_VN -> en
Translated --> P = 0.908 - R = 0.875 - F1 = 0.891
Backtranslated --> P = 0.929 - R = 0.921 - F1 = 0.925
------------
m2m100 -- ta | ta -> en
Translated --> P = 0.867 - R = 0.818 - F1 = 0.842
Backtranslated --> P = 0.857 - R = 0.805 - F1 = 0.830
------------
m2m100 -- xh | xh -> en
Translated --> P = 0.835 - R = 0.833 - F1 = 0.834
Backtranslated --> P = 0.808 - R = 0.814 - F1 = 0.811
------------
m2m100 -- vi | vi -> en
Translated --> P = 0.932 - R = 0.938 - F1 = 0.935
Backtranslated --> P = 0.931 - R = 0.929 - F1 = 0.930
------------
