In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import pandas as pd

# Directory that holds labels.csv and the precomputed embedding arrays.
path = "./data/"

# Language codes per translation model, exactly as they appear in the
# embedding file names (emb_<model>_<lang>.npy).
model_emb_langs = {
    "mBART": ["ta_IN", "xh_ZA", "vi_VN"],
    "m2m100": ["ta", "xh", "vi"],
}

# Upper bound on the number of rows taken from each embedding/label array.
limit = 50_000

In [2]:
# Phase 1: for every (model, language) pair, fit an SVM on the embeddings of the
# translated text and store the test-set confusion matrix for later comparison.
results_svm_ph1 = []

# Read the labels from the same data directory as the embeddings; True/False
# values are mapped to 1/0 and the frame is flattened to a 1-D array.
# NOTE(review): assumes labels.csv holds a single boolean column — confirm.
labels = pd.read_csv(path + "labels.csv").replace([True, False], [1, 0]).to_numpy().ravel()

for model, langs in model_emb_langs.items():
    for sl in langs:
        # Open the embeddings memory-mapped (read-only) instead of loading the file eagerly.
        sl_vec = np.load(path + f"emb_{model}_{sl}.npy", mmap_mode="r")

        # Cap features and labels at `limit` rows; the fixed seed keeps the split reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            sl_vec[:limit], labels[:limit], test_size=0.25, random_state=42
        )
        print(x_train.shape, x_test.shape)

        svm = SVC()
        svm.fit(x_train, y_train)
        pred = svm.predict(x_test)

        # Pin the label order so the matrix is always 2x2 ([[tn, fp], [fn, tp]]),
        # even when a class is absent from this split; the reporting cell
        # unpacks exactly four entries from it.
        conf_matrix = confusion_matrix(y_test, pred, labels=[0, 1])
        results_svm_ph1.append([model, sl, conf_matrix])



(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)


In [4]:
# Phase 2: same experiment as phase 1, but on the embeddings of the English
# back-translations of each language.
results_svm_ph2 = []

# Read the labels from the same data directory as the embeddings; True/False
# values are mapped to 1/0 and the frame is flattened to a 1-D array.
# NOTE(review): assumes labels.csv holds a single boolean column — confirm.
labels = pd.read_csv(path + "labels.csv").replace([True, False], [1, 0]).to_numpy().ravel()

for model, langs in model_emb_langs.items():
    # The English language code used in the file names depends only on the model.
    eng_string = "en_XX" if model == "mBART" else "en"

    for sl in langs:
        # Open the embeddings memory-mapped (read-only) instead of loading the file eagerly.
        sl_vec = np.load(path + f"emb_{model}_{eng_string}_backtranslation-{sl}.npy", mmap_mode="r")

        # Same row cap, test fraction and seed as the phase 1 cell.
        x_train, x_test, y_train, y_test = train_test_split(
            sl_vec[:limit], labels[:limit], test_size=0.25, random_state=42
        )
        print(x_train.shape, x_test.shape)

        svm = SVC()
        svm.fit(x_train, y_train)
        pred = svm.predict(x_test)

        # Pin the label order so the matrix is always 2x2 ([[tn, fp], [fn, tp]]),
        # even when a class is absent from this split; the reporting cell
        # unpacks exactly four entries from it.
        conf_matrix = confusion_matrix(y_test, pred, labels=[0, 1])
        results_svm_ph2.append([model, sl, conf_matrix])

(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)
(35331, 1024) (11778, 1024)


In [5]:
def _precision_recall_f1(conf_matrix):
    """Return (precision, recall, F1) for the positive class of a 2x2 confusion matrix.

    The matrix is read in the order produced by ``ravel()``: tn, fp, fn, tp.
    A metric whose denominator is zero is reported as 0.0 instead of
    dividing by zero (which would yield nan and a RuntimeWarning).
    """
    _tn, fp, fn, tp = conf_matrix.ravel()
    precision = tp / (fp + tp) if (fp + tp) else 0.0
    recall = tp / (fn + tp) if (fn + tp) else 0.0
    pr_sum = recall + precision
    f1 = (2 * recall * precision) / pr_sum if pr_sum else 0.0
    return precision, recall, f1


# Pair the phase 1 (translated) and phase 2 (back-translated) results by position
# and print their metrics side by side. Each entry is [model, language, conf_matrix].
for translated, backtranslated in zip(results_svm_ph1, results_svm_ph2):
    print(f"{translated[0]} -- {translated[1]} | {backtranslated[1]} -> en")
    precision_tr, recall_tr, f1_tr = _precision_recall_f1(translated[2])
    precision_btr, recall_btr, f1_btr = _precision_recall_f1(backtranslated[2])

    print(f"Translated --> P = {precision_tr:0.3f} - R = {recall_tr:0.3f} - F1 = {f1_tr:0.3f}")
    print(f"Backtranslated --> P = {precision_btr:0.3f} - R = {recall_btr:0.3f} - F1 = {f1_btr:0.3f}")
    print("------------")

mBART -- ta_IN | ta_IN -> en
Translated --> P = 0.906 - R = 0.892 - F1 = 0.899
Backtranslated --> P = 0.912 - R = 0.884 - F1 = 0.898
------------
mBART -- xh_ZA | xh_ZA -> en
Translated --> P = 0.866 - R = 0.848 - F1 = 0.857
Backtranslated --> P = 0.869 - R = 0.887 - F1 = 0.878
------------
mBART -- vi_VN | vi_VN -> en
Translated --> P = 0.941 - R = 0.866 - F1 = 0.902
Backtranslated --> P = 0.942 - R = 0.912 - F1 = 0.926
------------
m2m100 -- ta | ta -> en
Translated --> P = 0.881 - R = 0.812 - F1 = 0.845
Backtranslated --> P = 0.873 - R = 0.802 - F1 = 0.836
------------
m2m100 -- xh | xh -> en
Translated --> P = 0.837 - R = 0.834 - F1 = 0.836
Backtranslated --> P = 0.808 - R = 0.808 - F1 = 0.808
------------
m2m100 -- vi | vi -> en
Translated --> P = 0.952 - R = 0.926 - F1 = 0.939
Backtranslated --> P = 0.948 - R = 0.921 - F1 = 0.935
------------
