# Experiments on Cross Lingual Transfer for Intent Detection
## Santichai Pornavalai
## 10.2.2021

This notebook runs the intent-only cross-lingual transfer experiments using XLM-R. The training and testing cells are meant to be run individually; each one corresponds to an experiment listed in the paper.

In [1]:
from preprocessing.util import *
import pickle
import sklearn
import torch
import numpy as np
from simpletransformers.classification import ClassificationModel

# Seed PyTorch's global random number generator with a fixed value.
# NOTE(review): only torch is seeded here; confirm whether numpy or the
# simpletransformers training loop need their own seed for full reproducibility.
torch.manual_seed(136)
#from model import *

<torch._C.Generator at 0x7ff9da26c150>

We begin by preprocessing the data

In [2]:
# Label map: the JSON file stores label ids as string keys, so they are
# converted to integers once loaded.
mapping = {}
with open('preprocessing/label_map.json','r') as f:
    mapping = {int(label_id): label_name for label_id, label_name in json.load(f).items()}


def _load_splits(train_path, eval_path, test_path):
    """Read the train, eval and test TSV files of one language via df_format.

    The files are read in train -> eval -> test order. The second value
    returned by df_format is kept from the test file only, matching what
    the per-language ``*_mapping`` names end up holding.
    """
    train_frame, _ = df_format(train_path, mapping)
    eval_frame, _ = df_format(eval_path, mapping)
    test_frame, split_mapping = df_format(test_path, mapping)
    return train_frame, eval_frame, test_frame, split_mapping


# English
en_df, en_df_eval, en_df_test, en_mapping = _load_splits(
    "data/en/train-en.tsv",
    "data/en/eval-en.tsv",
    "data/en/test-en.tsv",
)

# Spanish
es_df, es_df_eval, es_df_test, es_mapping = _load_splits(
    "data/es/train-es.tsv",
    "data/es/eval-es.tsv",
    "data/es/test-es.tsv",
)

# Thai
th_df, th_df_eval, th_df_test, th_mapping = _load_splits(
    "data/th/train-th_TH.tsv",
    "data/th/eval-th_TH.tsv",
    "data/th/test-th_TH.tsv",
)

# Label names in the order they appear in the label map.
mapping_list = list(mapping.values())

opening data/en/train-en.tsv
opening data/en/eval-en.tsv
opening data/en/test-en.tsv
opening data/es/train-es.tsv
opening data/es/eval-es.tsv
opening data/es/test-es.tsv
opening data/th/train-th_TH.tsv
opening data/th/eval-th_TH.tsv
opening data/th/test-th_TH.tsv


In [3]:

# Keep only the first occurrence of every utterance text within each split.
# (The original author noted this step may not be strictly necessary.)
en_train, en_eval, en_test = (
    frame.drop_duplicates(subset="text") for frame in (en_df, en_df_eval, en_df_test)
)

es_train, es_eval, es_test = (
    frame.drop_duplicates(subset="text") for frame in (es_df, es_df_eval, es_df_test)
)

th_train, th_eval, th_test = (
    frame.drop_duplicates(subset="text") for frame in (th_df, th_df_eval, th_df_test)
)

Since we do not know whether the other experiments were trained on a combination of train and eval, we err on the safe side and ignore the eval file.

In [4]:
# The eval splits are deliberately NOT merged into the training data.
# The merging variant is kept for reference only:
#   en_full_train = pd.concat([en_train, en_eval])
#   es_full_train = pd.concat([es_train, es_eval])
#   th_full_train = pd.concat([th_train, th_eval])
# Instead the "full" names simply alias the de-duplicated training splits.
en_full_train, es_full_train, th_full_train = en_train, es_train, th_train

In [5]:
# Legacy checkpoint directories. According to the original note these paths
# are no longer used and should eventually be removed.
path2model, path2model_en, path2model_es, path2model_th = (
    "prelim_models/",
    "prelim_models/en/",
    "prelim_models/es/",
    "prelim_models/th/",
)
#path2model_x = "/home/santi/BA/final_models/x/"

In [6]:
# Build the mixed-language training sets for the cross-lingual experiments.
# English + Thai:
en_th_full_train = pd.concat([en_full_train, th_full_train])
# English + Spanish. This must use the Spanish split; concatenating the Thai
# split here would make the "en_es" data identical to the "en_th" data.
en_es_full_train = pd.concat([en_full_train, es_full_train])

In [7]:

def avg_sent_l(df):
    """Return the mean number of whitespace-separated tokens per row of ``df["text"]``.

    The token total is divided by ``len(df)``.
    """
    token_total = 0
    for sentence in df["text"]:
        token_total += len(sentence.split())
    return token_total / len(df)

def lexical_diversity(df):
    """Collect the distinct whitespace-separated tokens found in ``df["text"]``.

    Returns a pair ``(count, tokens)`` where ``tokens`` is the set of distinct
    tokens and ``count`` is its size.
    """
    vocabulary = {token for sentence in df["text"] for token in sentence.split()}
    return len(vocabulary), vocabulary


def analyze_wrong(wrong_predictions,model):
    """Re-predict misclassified examples and measure how often the domain was still right.

    Label names are looked up in the module-level ``mapping`` (label id ->
    name). The domain of a label is the part of its name before the first "/".

    Parameters
    ----------
    wrong_predictions : iterable
        Misclassified examples; each item must expose ``text_a`` (the
        utterance) and ``label`` (the true label id).
    model : object
        Must provide ``predict(list_of_texts)`` returning a pair whose first
        element holds one predicted label id per text.

    Returns
    -------
    tuple
        ``(results, domain_fraction)`` where ``results`` is a list of
        ``(text, predicted_label_name, true_label_name)`` tuples and
        ``domain_fraction`` is the share of examples whose predicted domain
        equals the true domain. With no wrong predictions the result is
        ``([], 0.0)``.
    """
    wrongs = [(inp.text_a, inp.label) for inp in wrong_predictions]

    # Nothing was misclassified: skip the model call and avoid dividing by zero.
    if not wrongs:
        return [], 0.0

    # Only the predicted ids are needed; the raw model outputs are ignored.
    wrong_preds, _ = model.predict([text for text, _ in wrongs])

    dom_corr = 0
    results = []

    for (text, lab_true), lab_pred in zip(wrongs, wrong_preds):
        lab_pred = mapping[lab_pred]
        lab_true = mapping[lab_true]

        # Same domain even though the full intent label differs.
        if lab_pred.split("/")[0] == lab_true.split("/")[0]:
            dom_corr += 1

        results.append((text, lab_pred, lab_true))

    return results, dom_corr/len(wrongs)

In [8]:
# Metric callables used for evaluation. Each takes two label sequences (x, y)
# and forwards them unchanged to the corresponding scikit-learn function.
def macro(x, y):
    """Macro-averaged F1 score."""
    return sklearn.metrics.f1_score(x, y, average='macro')


def micro(x, y):
    """Micro-averaged F1 score."""
    return sklearn.metrics.f1_score(x, y, average='micro')


def report(x, y):
    """Text classification report over label ids 0..11, named via mapping_list."""
    return sklearn.metrics.classification_report(
        x,
        y,
        digits=5,
        labels=list(range(0, 12)),
        target_names=mapping_list,
    )


def report_dict(x, y):
    """Same report as ``report`` but returned as a dictionary."""
    return sklearn.metrics.classification_report(
        x,
        y,
        digits=5,
        output_dict=True,
        labels=list(range(0, 12)),
        target_names=mapping_list,
    )


def accuracy(x, y):
    """Accuracy score."""
    return sklearn.metrics.accuracy_score(x, y)


In [9]:
def custom_eval(df, model, ex_name = "experiment 1", verbose = True):
    """Evaluate ``model`` on ``df`` and enrich the result with domain-level statistics.

    ``model.eval_model`` is called with the notebook's metric callables
    (macro, micro, accuracy, report, report_dict) as keyword arguments. The
    misclassified examples it returns are passed to ``analyze_wrong``.

    The returned results object gains these entries:
      - "name": ``ex_name``
      - "wrong_predictions": list produced by ``analyze_wrong``
      - "domain_of_wrongs": fraction returned by ``analyze_wrong``
      - "domain_accuracy": accuracy + (1 - accuracy) * domain_of_wrongs

    When ``verbose`` is true the report and the domain accuracy are printed.
    """
    extra_metrics = {
        "macro": macro,
        "micro": micro,
        "accuracy": accuracy,
        "report": report,
        "report_dict": report_dict,
    }
    # The second return value of eval_model is not used here.
    results, _, wrongs = model.eval_model(df, **extra_metrics)
    results["name"] = ex_name

    false_preds, dom_acc = analyze_wrong(wrongs, model)
    results["wrong_predictions"] = false_preds
    results["domain_of_wrongs"] = dom_acc
    overall_accuracy = results["accuracy"]
    results["domain_accuracy"] = overall_accuracy + (1-overall_accuracy)*dom_acc

    if not verbose:
        return results

    print("results for experiment: ",ex_name)
    print(results["report"])
    print("domain accuracy: ",results["domain_accuracy"])
    return results

Here we gather some statistics about the datasets

In [10]:
# 

# Corpus statistics for the de-duplicated training split of each language.
train_splits = (("en", en_train), ("es", es_train), ("th", th_train))

print("average sentence length")
for lang_code, train_split in train_splits:
    print(lang_code, avg_sent_l(train_split))


print("unique tokens")
for lang_code, train_split in train_splits:
    # lexical_diversity returns (count, token_set); only the count is shown.
    print(lang_code, lexical_diversity(train_split)[0])

average sentence length
en 7.69126027754818
es 7.678731678133413
th 8.280373831775702
unique tokens
en 3983
es 1849
th 1138


We load up a pretrained XLM-R model with a Max Ent layer for classification. Arguments are left pretty vanilla except fp16 which is not relevant for the results. 

In [11]:
# Hyper-parameters passed to every ClassificationModel created below.
# Edit them here; the training cells additionally set "output_dir".
args = dict(
    fp16=True,
    learning_rate=1e-5,
    num_train_epochs=5,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    save_steps=-1,
    save_model_every_epoch=False,
)

In [12]:
# Collected results of all experiments, keyed by experiment name.
experiment_results = dict()

In [14]:
# Experiment: fine-tune on the English training data and test on all three
# languages. Note that en_full_train is currently just the de-duplicated
# training split; the eval split is not merged in.
args["output_dir"] = "models/intent_en_train"
model = ClassificationModel('xlmroberta', 'xlm-roberta-base', num_labels=12, args=args)
model.train_model(en_full_train)

# Evaluate on the English, Spanish and Thai test sets, in that order.
for test_split, experiment_name in (
    (en_test, "train_en_test_en"),
    (es_test, "train_en_test_es"),
    (th_test, "train_en_test_th"),
):
    results = custom_eval(test_split, model, experiment_name)
    experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=22987.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=2874.0, style=ProgressStyle(de…





KeyboardInterrupt: 

In [None]:
# Experiment: start from a fresh pretrained model, fine-tune on the Spanish
# training data and test on all three languages.
args["output_dir"] = "models/intent_es_train"

model = ClassificationModel('xlmroberta', 'xlm-roberta-base', num_labels=12, args=args)
model.train_model(es_full_train)

# Evaluate on the English, Spanish and Thai test sets, in that order.
for test_split, experiment_name in (
    (en_test, "train_es_test_en"),
    (es_test, "train_es_test_es"),
    (th_test, "train_es_test_th"),
):
    results = custom_eval(test_split, model, experiment_name)
    experiment_results[results["name"]] = results

In [None]:
# Experiment: start from a fresh pretrained model, fine-tune on the Thai
# training data and test on all three languages.
args["output_dir"] = "models/intent_th_train"

model = ClassificationModel('xlmroberta', 'xlm-roberta-base', num_labels=12, args=args)
model.train_model(th_full_train)

# Evaluate on the English, Spanish and Thai test sets, in that order.
for test_split, experiment_name in (
    (en_test, "train_th_test_en"),
    (es_test, "train_th_test_es"),
    (th_test, "train_th_test_th"),
):
    results = custom_eval(test_split, model, experiment_name)
    experiment_results[results["name"]] = results

In [None]:
# Experiment: start from a fresh pretrained model, fine-tune on the mixed
# English + Thai training data and test on all three languages.
args["output_dir"] = "models/intent_en_th_train"

model = ClassificationModel('xlmroberta', 'xlm-roberta-base', num_labels=12, args=args)
model.train_model(en_th_full_train)

# The experiment names carry the "en_th" training tag so that these results
# get their own keys in experiment_results. Reusing "train_th_test_*" would
# overwrite the entries of the Thai-only experiment.
for test_split, experiment_name in (
    (en_test, "train_en_th_test_en"),
    (es_test, "train_en_th_test_es"),
    (th_test, "train_en_th_test_th"),
):
    results = custom_eval(test_split, model, experiment_name)
    experiment_results[results["name"]] = results

In [None]:
# Experiment: start from a fresh pretrained model, fine-tune on the mixed
# English + Spanish training data and test on all three languages.
args["output_dir"] = "models/intent_en_es_train"

model = ClassificationModel('xlmroberta', 'xlm-roberta-base', num_labels=12, args=args)
model.train_model(en_es_full_train)

# The experiment names carry the "en_es" training tag so that these results
# get their own keys in experiment_results. Reusing "train_th_test_*" would
# overwrite the entries of the Thai-only experiment.
for test_split, experiment_name in (
    (en_test, "train_en_es_test_en"),
    (es_test, "train_en_es_test_es"),
    (th_test, "train_en_es_test_th"),
):
    results = custom_eval(test_split, model, experiment_name)
    experiment_results[results["name"]] = results

Here are some sanity checks 

In [None]:
# Dump every misclassified Thai test utterance of the English-trained model
# as: text, predicted label, true label.
for wrong_row in experiment_results["train_en_test_th"]["wrong_predictions"]:
    text, predicted, real = wrong_row
    print(text, "\t", predicted, "\t", real)

In [None]:
##### SANITY CHECK #####
def unique_sents(test_df, train_df):
    """Print the share of test utterances that do not appear in the training data.

    First prints the number of rows in ``test_df``, then the percentage of
    ``test_df["text"]`` entries that are absent from ``train_df["text"]``
    (exact string match). Nothing is returned.
    """
    print("unique utterances in test data out of :", len(test_df))
    seen_in_training = set(train_df["text"])
    unseen = [utterance for utterance in test_df["text"] if utterance not in seen_in_training]
    print(len(unseen)/len(test_df)*100,"% of the sentences are unique")

In [None]:
# Share of English test utterances that never occur in the English training data.
unique_sents(test_df=en_test, train_df=en_full_train)

In [None]:
# Spanish test utterances compared against the eval split, then against the
# training data.
unique_sents(test_df=es_test, train_df=es_eval)
unique_sents(test_df=es_test, train_df=es_full_train)

In [None]:
# Thai test utterances compared against the eval split, then against the
# training data.
unique_sents(test_df=th_test, train_df=th_eval)
unique_sents(test_df=th_test, train_df=th_full_train)

In [None]:
def predict_sent(sent):
    """Return the label name the current global ``model`` predicts for one sentence.

    The sentence is wrapped in a one-element list for ``model.predict``; the
    first predicted label id is translated through the global ``mapping``.
    """
    predicted_ids = model.predict([sent])[0]
    return mapping[predicted_ids[0]]

In [None]:
# Probe: English weather question.
predict_sent("what's the weather in Potsdam")

In [None]:
# Probe: negated wake-up request in English.
predict_sent("don't wake me up tomorrow")

In [None]:

# Probe: Thai utterance with space-separated tokens; roughly a request to set
# a time for tomorrow afternoon (translation approximate).
predict_sent("ตั้ง เวลา พรุ่ง บ่าย พรุ่งนี้")

In [None]:
# Probe: Spanish temperature question (written without accents).
predict_sent("que temperatura hay aqui")

In [None]:
# Probe: Spanish "no need to get up on Saturday", with the day at the end.
predict_sent("no necesito que levantarme el sabado" )

In [None]:
# Probe: the same Spanish sentence with the day moved to the front.
predict_sent("sabado no necesito que levantarme" )

In [None]:
# Probe: Thai "no need to wake me on Saturday", with the day near the end
# (translation approximate).
predict_sent("ไม่ ต้อง ปลุก ฉัน วัน เสาร์ นะ" )

In [None]:
# Probe: the same Thai sentence with the day moved to the front.
predict_sent("วัน เสาร์ ไม่ ต้อง ปลุก ฉัน นะ")

In [None]:
# Probe: English counterpart, with the day at the end.
predict_sent("you don't have to wake me up on saturday")

In [None]:
# Probe: English counterpart with the day moved to the front.
predict_sent("saturday you don't have to wake me up")

In [None]:
# Show English training utterances that start with "on saturday", "on sunday",
# "on monday" or "on tuesday" (case-insensitive). The group is non-capturing
# so that pandas does not warn about match groups in a str.contains pattern;
# the selected rows are the same.
en_full_train[en_full_train["text"].str.contains(r"^on (?:saturday|sunday|monday|tuesday)", case=False, regex=True)]

In [None]:
# Look up the label name stored for label id 10.
mapping[10]

In [None]:
# Probe: first-person English variant that does not ask to be woken.
predict_sent("I don't have to wake up early on saturday")

In [None]:
# Repeat of the English probe with "saturday" at the front; the author
# flagged this phrasing as a weird sentence.
predict_sent("saturday you don't have to wake me up")

In [None]:
# Probe: German sentence. German has no training data anywhere in this notebook.
predict_sent("am Samstag musst du mich nicht aufwecken")

In [None]:
# Probe: German sentence with non-standard word order.
predict_sent("ich nicht muss aufstehen am Samstag")

In [None]:
"el sabado no necesito el despertador" 
# doesn't work
# implicit 

In [None]:
"cuanto falta hasta el alarma"
"cuanto tiempo queda hasta que me levanto"
"que temperatura hay aqui"