In [130]:
import time
import numpy as np
import pandas as pd
import pickle
import math

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import sys
import pandas as pd

import torch
from transformers import MBartModel, MBart50Tokenizer

model_path = "models/"

sl_orig = "en_XX"

sls = ["en_XX", "ta_IN", "xh_ZA", "vi_VN"]

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())

limit = 5000

True


In [120]:
def pool_embeddings(method, data, tokenized, pad_tok_id):
  if "attention_mask" in tokenized:
    attention_mask = tokenized["attention_mask"]
  else: # apparently ErnieM does NOT have attenion IDs in the tokenized output, so I am "computing" them myself - like in all other models, the model should not pay attention to [PAD] tokens, so they are ignored/not paid attention to
    token_ids = tokenized["input_ids"][0]
    padding_ids = len([tok for tok in token_ids if tok == pad_tok_id]) # count how many [PAD] tokens there are
    attention_mask = torch.ones((tokenized["input_ids"].shape)).to(device)
    if padding_ids > 0:
        attention_mask[:,-padding_ids:] = 0
    attention_mask = torch.tensor(attention_mask).to(device)
    
  attention_expanded = attention_mask.unsqueeze(-1).expand(data.size()).float()
  data_attention = data * attention_expanded
  return torch.sum(data_attention, 1) / torch.clamp(attention_expanded.sum(1), min=1e-9) # to not divide by 0

In [121]:
def get_embeddings(text, sl):
    inputs = tokenizer(text, return_tensors="pt")
    inputs = inputs.to(device)
    outputs = model(**inputs)

    pad_tok_id = tokenizer("[PAD]")
    pad_tok_id = pad_tok_id["input_ids"][1]

    outputs = pool_embeddings(torch.mean, outputs[0], inputs, pad_tok_id)[0]

    return np.array(outputs.cpu().detach().numpy()) 

In [None]:
# from tqdm import tqdm

# for sl in sls:
# 	if sl == sl_orig:
# 		continue
# 	df = pd.read_csv(f"snopes_backtranslation_{sl}-{sl_orig}.csv", sep=",")
# 	df = df.replace([True, False], [1, 0])
# 	df.head()

# 	tokenizer = MBart50Tokenizer.from_pretrained(model_path + "mbart-large-50-many-to-many-mmt", src_lang=sl)
# 	model = MBartModel.from_pretrained(model_path + "mbart-large-50-many-to-many-mmt").to(device)

# 	texts = df["claim"].iloc[:limit]
# 	labels = df["label"].iloc[:limit]

# 	embds = []
# 	for t in tqdm(texts):
# 		embds.append(get_embeddings(t, sl))

# 	embds = np.asarray(embds)
# 	np.save(f"emb_{sl}_back.npy", np.asarray(embds))

In [138]:
results_svm_ph1 = []
for sl in sls:
	if sl != sl_orig:
		df = pd.read_csv(f"snopes_{sl}.csv")
	else:
		df = pd.read_csv(f"cleaned_snopes.csv", sep="|")
	df = df.replace([True, False], [1, 0])

	labels = df["label"]

	df.head()
	embds = np.load(f"emb_{sl}.npy")

	x_train, x_test, y_train, y_test = train_test_split(embds, labels, test_size=0.25, random_state=42)
	print(x_train.shape, x_test.shape)

	svm = SVC()
	svm.fit(x_train, y_train)
	pred = svm.predict(x_test)

	conf_matrix = confusion_matrix(y_test, pred)
	results_svm_ph1.append(conf_matrix)

	print(conf_matrix)


(3324, 1024) (1109, 1024)
[[941   1]
 [167   0]]
(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]
(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]
(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]


In [139]:
results_svm_ph2 = []
for sl in sls:
	if sl == sl_orig:
		continue
	df = pd.read_csv(f"snopes_backtranslation_{sl}-{sl_orig}.csv", sep=",")
	df = df.replace([True, False], [1, 0])
	df.head()

	labels = df["label"].iloc[:limit]
	
	embds = np.asarray(embds)
	np.save(f"emb_{sl}_back.npy", np.asarray(embds))

	x_train, x_test, y_train, y_test = train_test_split(embds, labels, test_size=0.25, random_state=42)
	print(x_train.shape, x_test.shape)

	svm = SVC()
	svm.fit(x_train, y_train)
	pred = svm.predict(x_test)

	conf_matrix = confusion_matrix(y_test, pred)
	results_svm_ph2.append(conf_matrix)

	print(conf_matrix)

(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]
(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]
(3324, 1024) (1109, 1024)
[[942   0]
 [167   0]]


In [152]:
from sklearn.linear_model import LogisticRegression

results_lr_ph1 = []
for sl in sls:
	if sl != sl_orig:
		df = pd.read_csv(f"snopes_{sl}.csv")
	else:
		df = pd.read_csv(f"cleaned_snopes.csv", sep="|")
	df = df.replace([True, False], [1, 0])

	labels = df["label"]

	df.head()
	embds = np.load(f"emb_{sl}.npy", )

	x_train, x_test, y_train, y_test = train_test_split(embds, labels, test_size=0.25, random_state=42)
	print(x_train.shape, x_test.shape)

	lr = LogisticRegression(solver="newton-cg")
	lr.fit(x_train, y_train)
	pred = lr.predict(x_test)

	conf_matrix = confusion_matrix(y_test, pred)
	results_lr_ph1.append(conf_matrix)

	print(conf_matrix)


(3324, 1024) (1109, 1024)




[[861  81]
 [135  32]]
(3324, 1024) (1109, 1024)




[[858  84]
 [144  23]]
(3324, 1024) (1109, 1024)




[[859  83]
 [138  29]]
(3324, 1024) (1109, 1024)
[[855  87]
 [135  32]]




In [151]:
results_lr_ph2 = []
for sl in sls:
	if sl == sl_orig:
		continue
	df = pd.read_csv(f"snopes_backtranslation_{sl}-{sl_orig}.csv", sep=",")
	df = df.replace([True, False], [1, 0])
	df.head()

	labels = df["label"].iloc[:limit]
	
	embds = np.asarray(embds)
	np.save(f"emb_{sl}_back.npy", np.asarray(embds))

	x_train, x_test, y_train, y_test = train_test_split(embds, labels, test_size=0.25, random_state=42)
	print(x_train.shape, x_test.shape)

	lr = LogisticRegression(solver="newton-cg")
	lr.fit(x_train, y_train)
	pred = lr.predict(x_test)

	conf_matrix = confusion_matrix(y_test, pred)
	results_lr_ph2.append(conf_matrix)

	print(conf_matrix)

(3324, 1024) (1109, 1024)
int64




[[861  81]
 [135  32]]
(3324, 1024) (1109, 1024)
int64




[[861  81]
 [135  32]]
(3324, 1024) (1109, 1024)
int64
[[861  81]
 [135  32]]




In [133]:
print(results_ph1)
print(results_ph2)

[array([[941,   1],
       [167,   0]]), array([[942,   0],
       [167,   0]]), array([[942,   0],
       [167,   0]]), array([[942,   0],
       [167,   0]])]
[array([[942,   0],
       [167,   0]]), array([[942,   0],
       [167,   0]]), array([[942,   0],
       [167,   0]])]
