# Experiments on Cross Lingual Transfer for Intent Detection


The first step is to prepare the data from Schuster et al. For now we are only examining English and Spanish datasets, since preprocessing Thai requires extra steps and is slightly more complex(tokenization). Firstly, we parse the tsv data into dataframes

In [1]:
from util import *
import pickle
import sklearn
import torch
import numpy as np
from simpletransformers.classification import ClassificationModel

torch.manual_seed(136)
#from model import *

<torch._C.Generator at 0x7f6928836130>

In [2]:
mapping = {}
with open('deprecated/label_map.json','r') as f:
    mapping = json.load(f)
    mapping = {int(k):v for k,v in mapping.items()}
    
    
# preprocess training and test files to pandas df

# eng train
en_df, en_mapping = df_format(("data/en/train-en.tsv"),mapping)

# eng eval
en_df_eval, en_mapping = df_format("data/en/eval-en.tsv",mapping)

# eng test
en_df_test, en_mapping = df_format("data/en/test-en.tsv",mapping)

# es train
es_df, es_mapping = df_format("data/es/train-es.tsv",mapping)

# es eval
es_df_eval, es_mapping = df_format("data/es/eval-es.tsv",mapping)

# es test
es_df_test, es_mapping = df_format("data/es/test-es.tsv",mapping)


# th train
th_df, th_mapping = df_format("data/th/train-th_TH.tsv",mapping)

# th eval
th_df_eval, th_mapping = df_format("data/th/eval-th_TH.tsv",mapping)

# th test
th_df_test, th_mapping = df_format("data/th/test-th_TH.tsv",mapping)

mapping_list = list(mapping.values())

opening data/en/train-en.tsv
opening data/en/eval-en.tsv
opening data/en/test-en.tsv
opening data/es/train-es.tsv
opening data/es/eval-es.tsv
opening data/es/test-es.tsv
opening data/th/train-th_TH.tsv
opening data/th/eval-th_TH.tsv
opening data/th/test-th_TH.tsv


In [3]:
en_train = en_df.drop_duplicates("text")
en_eval = en_df_eval.drop_duplicates("text")
en_test = en_df_test.drop_duplicates("text")

es_train = es_df.drop_duplicates("text")
es_eval = es_df_eval.drop_duplicates("text")
es_test = es_df_test.drop_duplicates("text")

th_train = th_df.drop_duplicates("text")
th_eval = th_df_eval.drop_duplicates("text")
th_test = th_df_test.drop_duplicates("text")

In [6]:
# en_full_train = pd.concat([en_train,en_eval])
# es_full_train = pd.concat([es_train,es_eval])
# th_full_train = pd.concat([th_train, th_eval])

# quick hack to by-pass combining en eval 

en_full_train = en_train
es_full_train = es_train
th_full_train = th_train

In [7]:
path2model = "prelim_models/"
path2model_en = "prelim_models/en/"
path2model_es = "prelim_models/es/"
path2model_th = "prelim_models/th/"
#path2model_x = "/home/santi/BA/final_models/x/"

In [8]:
en_th_full_train = pd.concat([en_full_train,th_full_train])
en_es_full_train = pd.concat([en_full_train,th_full_train])

In [9]:
def avg_sent_l(df):
    return sum([len(l.split()) for l in df["text"]])/len(df)

def lexical_diversity(df):
    lexes = set()
    for l in df["text"]:
        for w in l.split():
            lexes.add(w)
    return len(lexes), lexes


def analyze_wrong(wrong_predictions,model):
    wrongs = [(inp.text_a,inp.label) for inp in wrong_predictions]
    wrong_preds, vecs = model.predict([t for t,l in wrongs])

    dom_corr = 0
    weak_dom = 0
    rem_alarms = ["reminder","alarm"]
    results = []

    for (text, lab_true), lab_pred in zip(wrongs,wrong_preds):

        lab_pred = mapping[lab_pred]
        lab_true = mapping[lab_true]
        dom_pred = lab_pred.split("/")[0]
        dom_true = lab_true.split("/")[0]

        if dom_pred == dom_true:
            dom_corr += 1

        if (dom_pred in rem_alarms) and (dom_true in rem_alarms):
            weak_dom += 1    

        results.append((text,lab_pred, lab_true))

        #print(text,"\t" ,lab_pred,"\t", lab_true,"\t", dom_pred,"\t", dom_true)

    return results, dom_corr/len(wrongs)

In [10]:
macro = lambda x,y:  sklearn.metrics.f1_score(x,y, average= 'macro')
micro = lambda x,y:  sklearn.metrics.f1_score(x,y, average= 'micro')
report = lambda x,y:  sklearn.metrics.classification_report(x,y,digits = 5,labels = list(range(0,12)), target_names = mapping_list)
report_dict = lambda x,y:  sklearn.metrics.classification_report(x,y,digits = 5,output_dict = True,labels = list(range(0,12)),target_names = mapping_list)
accuracy = lambda x,y:  sklearn.metrics.accuracy_score(x,y)


In [11]:
def custom_eval(df, model, ex_name = "experiment 1", verbose = True):
    results, predictions_vs, wrongs = model.eval_model(df, macro=macro, micro=micro,accuracy=accuracy, report=report, report_dict = report_dict)
    results["name"] = ex_name
    
    false_preds,dom_acc = analyze_wrong(wrongs,model)
    results["wrong_predictions"] = false_preds
    results["domain_of_wrongs"] = dom_acc
    results["domain_accuracy"] = results["accuracy"] + (1-results["accuracy"])*dom_acc
    
    if verbose:
        print("results for experiment: ",ex_name)

        print(results["report"])
        print("domain accuracy: ",results["domain_accuracy"])
        
    return results

In [12]:
# gather some statistics about the data sets

print("average sentence length")
print("en",avg_sent_l(en_train))
print("es",avg_sent_l(es_train))
print("th",avg_sent_l(th_train))


print("unique tokens")
print("en",lexical_diversity(en_train)[0])
print("es",lexical_diversity(es_train)[0])
print("th",lexical_diversity(th_train)[0])

average sentence length
en 7.69126027754818
es 7.678731678133413
th 8.280373831775702
unique tokens
en 3983
es 1849
th 1138


We load up a pretrained XLM-R model with a Max Ent layer for classification. Arguments are left pretty vanilla except fp16 which is not relevant for the results. 

In [13]:
#change the hyper-parameters here. 
args={"fp16": True,
      'learning_rate':1e-5,
      'num_train_epochs': 5,
      'reprocess_input_data': True,
      'overwrite_output_dir': True,
      'save_steps':-1,
      "save_model_every_epoch":False,
     }

In [14]:
experiment_results = {}

In [17]:
# train english model
# full train = train + eval
args["output_dir"] = "models/intent_en_train"
model= ClassificationModel('xlmroberta','xlm-roberta-base', num_labels=12, args=args)
model.train_model(en_full_train)

# test eng
results = custom_eval(en_test, model, "train_en_test_en")
experiment_results[results["name"]] = results

# test es
results = custom_eval(es_test, model, "train_en_test_es")
experiment_results[results["name"]] = results

# test th
results = custom_eval(th_test, model, "train_en_test_th")
experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=22987.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=2874.0, style=ProgressStyle(de…








HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=2874.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=2874.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=2874.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=2874.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=7420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=928.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=52.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


results for experiment:  train_en_test_en
                          precision    recall  f1-score   support

reminder/cancel_reminder    1.00000   0.98944   0.99469       284
            weather/find    0.99911   1.00000   0.99956      3386
      alarm/cancel_alarm    0.98210   0.98874   0.98541       444
 reminder/show_reminders    0.99537   0.99078   0.99307       217
      alarm/snooze_alarm    0.95556   0.96629   0.96089        89
alarm/time_left_on_alarm    0.94048   0.97531   0.95758        81
      alarm/modify_alarm    0.90476   0.93443   0.91935       122
     weather/checkSunset    1.00000   0.94595   0.97222        37
    weather/checkSunrise    1.00000   1.00000   1.00000        25
       alarm/show_alarms    0.99038   0.97170   0.98095       212
   reminder/set_reminder    0.99613   0.99922   0.99767      1287
         alarm/set_alarm    0.99023   0.98382   0.98701      1236

                accuracy                        0.99299      7420
               macro avg    0.9

HBox(children=(FloatProgress(value=0.0, max=2854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=357.0, style=ProgressStyle(descr…




  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


HBox(children=(FloatProgress(value=0.0, max=258.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33.0), HTML(value='')))


results for experiment:  train_en_test_es
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.99412   0.98830   0.99120       171
            weather/find    0.93217   0.99793   0.96393       964
      alarm/cancel_alarm    0.95939   0.87097   0.91304       217
 reminder/show_reminders    0.98425   0.94697   0.96525       132
      alarm/snooze_alarm    0.44186   0.59375   0.50667        32
alarm/time_left_on_alarm    1.00000   1.00000   1.00000        28
      alarm/modify_alarm    0.58929   0.97059   0.73333        34
     weather/checkSunset    0.00000   0.00000   0.00000         2
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.95283   0.91818   0.93519       110
   reminder/set_reminder    0.98879   0.70111   0.82047       629
         alarm/set_alarm    0.81636   0.98879   0.89434       535

               micro avg    0.90960   0.90960   0.90960      2854
               macro avg    0.7

HBox(children=(FloatProgress(value=0.0, max=1557.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=195.0, style=ProgressStyle(descr…




  'recall', 'true', average, warn_for)


HBox(children=(FloatProgress(value=0.0, max=364.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))


results for experiment:  train_en_test_th
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.90000   0.09574   0.17308        94
            weather/find    0.88278   1.00000   0.93774       610
      alarm/cancel_alarm    0.58371   0.94161   0.72067       137
 reminder/show_reminders    0.72222   0.62903   0.67241        62
      alarm/snooze_alarm    0.00000   0.00000   0.00000        20
alarm/time_left_on_alarm    1.00000   0.90000   0.94737        20
      alarm/modify_alarm    0.23404   0.84615   0.36667        13
     weather/checkSunset    0.00000   0.00000   0.00000         0
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.86667   0.16883   0.28261        77
   reminder/set_reminder    0.99194   0.46415   0.63239       265
         alarm/set_alarm    0.64960   0.93050   0.76508       259

               micro avg    0.76622   0.76622   0.76622      1557
               macro avg    0.5

In [18]:
# reset model
# train on full spanish
args["output_dir"] = "models/intent_es_train"

model= ClassificationModel('xlmroberta','xlm-roberta-base', num_labels=12, args=args)
model.train_model(es_full_train)

# test eng
results = custom_eval(en_test, model, "train_es_test_en")
experiment_results[results["name"]] = results


# test es
results = custom_eval(es_test, model, "train_es_test_es")
experiment_results[results["name"]] = results


# test th
results = custom_eval(th_test, model, "train_es_test_th")
experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=3343.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=418.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=418.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=418.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=418.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=418.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=7420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=928.0, style=ProgressStyle(descr…




  'precision', 'predicted', average, warn_for)


HBox(children=(FloatProgress(value=0.0, max=340.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))


results for experiment:  train_es_test_en
                          precision    recall  f1-score   support

reminder/cancel_reminder    1.00000   0.90845   0.95203       284
            weather/find    0.98115   0.99941   0.99020      3386
      alarm/cancel_alarm    0.89597   0.95045   0.92240       444
 reminder/show_reminders    1.00000   0.84793   0.91771       217
      alarm/snooze_alarm    0.27273   0.03371   0.06000        89
alarm/time_left_on_alarm    0.93671   0.91358   0.92500        81
      alarm/modify_alarm    0.88298   0.68033   0.76852       122
     weather/checkSunset    0.00000   0.00000   0.00000        37
    weather/checkSunrise    0.00000   0.00000   0.00000        25
       alarm/show_alarms    0.96135   0.93868   0.94988       212
   reminder/set_reminder    0.96203   0.98446   0.97312      1287
         alarm/set_alarm    0.89333   0.97573   0.93271      1236

                accuracy                        0.95418      7420
               macro avg    0.7

HBox(children=(FloatProgress(value=0.0, max=2854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=357.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


results for experiment:  train_es_test_es
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.98276   1.00000   0.99130       171
            weather/find    0.99586   0.99793   0.99689       964
      alarm/cancel_alarm    0.97664   0.96313   0.96984       217
 reminder/show_reminders    0.99213   0.95455   0.97297       132
      alarm/snooze_alarm    0.96429   0.84375   0.90000        32
alarm/time_left_on_alarm    1.00000   1.00000   1.00000        28
      alarm/modify_alarm    1.00000   0.85294   0.92063        34
     weather/checkSunset    0.00000   0.00000   0.00000         2
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.96296   0.94545   0.95413       110
   reminder/set_reminder    0.98736   0.99364   0.99049       629
         alarm/set_alarm    0.96527   0.98692   0.97597       535

               micro avg    0.98423   0.98423   0.98423      2854
               macro avg    0.8

HBox(children=(FloatProgress(value=0.0, max=1557.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=195.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=324.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))


results for experiment:  train_es_test_th
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.00000   0.00000   0.00000        94
            weather/find    0.87231   0.99672   0.93037       610
      alarm/cancel_alarm    0.54545   0.91971   0.68478       137
 reminder/show_reminders    0.00000   0.00000   0.00000        62
      alarm/snooze_alarm    0.16667   0.05000   0.07692        20
alarm/time_left_on_alarm    1.00000   0.10000   0.18182        20
      alarm/modify_alarm    0.63636   0.53846   0.58333        13
     weather/checkSunset    0.00000   0.00000   0.00000         0
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.62963   0.22078   0.32692        77
   reminder/set_reminder    0.89916   0.80755   0.85089       265
         alarm/set_alarm    0.74783   0.99614   0.85430       259

               micro avg    0.79191   0.79191   0.79191      1557
               macro avg    0.4

In [19]:
# reset model
args["output_dir"] = "models/intent_th_train"

model= ClassificationModel('xlmroberta','xlm-roberta-base', num_labels=12, args=args)
# train on full thai
model.train_model(th_full_train)

# test eng
results = custom_eval(en_test, model, "train_th_test_en")
experiment_results[results["name"]] = results


# test es
results = custom_eval(es_test, model, "train_th_test_es")
experiment_results[results["name"]] = results


# test th
results = custom_eval(th_test, model, "train_th_test_th")
experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=1926.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=241.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=241.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=241.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=241.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=241.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=7420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=928.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=1285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=161.0), HTML(value='')))


results for experiment:  train_th_test_en
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.94340   0.17606   0.29674       284
            weather/find    0.87242   0.99970   0.93174      3386
      alarm/cancel_alarm    0.73542   0.79505   0.76407       444
 reminder/show_reminders    1.00000   0.08756   0.16102       217
      alarm/snooze_alarm    0.28571   0.02247   0.04167        89
alarm/time_left_on_alarm    0.93750   0.37037   0.53097        81
      alarm/modify_alarm    0.00000   0.00000   0.00000       122
     weather/checkSunset    0.00000   0.00000   0.00000        37
    weather/checkSunrise    0.00000   0.00000   0.00000        25
       alarm/show_alarms    0.18280   0.08019   0.11148       212
   reminder/set_reminder    0.78378   0.87879   0.82857      1287
         alarm/set_alarm    0.81246   0.92880   0.86674      1236

                accuracy                        0.82682      7420
               macro avg    0.5

HBox(children=(FloatProgress(value=0.0, max=2854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=357.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=852.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=107.0), HTML(value='')))


results for experiment:  train_th_test_es
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.86047   0.43275   0.57588       171
            weather/find    0.67226   0.99793   0.80334       964
      alarm/cancel_alarm    0.53929   0.69585   0.60765       217
 reminder/show_reminders    0.75000   0.06818   0.12500       132
      alarm/snooze_alarm    0.00000   0.00000   0.00000        32
alarm/time_left_on_alarm    1.00000   0.07143   0.13333        28
      alarm/modify_alarm    0.00000   0.00000   0.00000        34
     weather/checkSunset    0.00000   0.00000   0.00000         2
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.01639   0.00909   0.01170       110
   reminder/set_reminder    0.82313   0.57711   0.67850       629
         alarm/set_alarm    0.81481   0.82243   0.81860       535

               micro avg    0.70147   0.70147   0.70147      2854
               macro avg    0.4

HBox(children=(FloatProgress(value=0.0, max=1557.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=195.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


results for experiment:  train_th_test_th
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.93878   0.97872   0.95833        94
            weather/find    0.99836   0.99836   0.99836       610
      alarm/cancel_alarm    0.89286   0.91241   0.90253       137
 reminder/show_reminders    0.95082   0.93548   0.94309        62
      alarm/snooze_alarm    1.00000   0.75000   0.85714        20
alarm/time_left_on_alarm    1.00000   0.95000   0.97436        20
      alarm/modify_alarm    0.00000   0.00000   0.00000        13
     weather/checkSunset    0.00000   0.00000   0.00000         0
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.95946   0.92208   0.94040        77
   reminder/set_reminder    0.96691   0.99245   0.97952       265
         alarm/set_alarm    0.95149   0.98456   0.96774       259

               micro avg    0.96789   0.96789   0.96789      1557
               macro avg    0.7

In [15]:
args["output_dir"] = "models/intent_en_th_train"

model= ClassificationModel('xlmroberta','xlm-roberta-base', num_labels=12, args=args)
# train on full thai and eng mixed
model.train_model(en_th_full_train)
# test eng
results = custom_eval(en_test, model, "train_th_test_en")
experiment_results[results["name"]] = results


# test es
results = custom_eval(es_test, model, "train_th_test_es")
experiment_results[results["name"]] = results


# test th
results = custom_eval(th_test, model, "train_th_test_th")
experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=24913.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=3115.0, style=ProgressStyle(de…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=3115.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=7420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=928.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


results for experiment:  train_th_test_en
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.99647   0.99296   0.99471       284
            weather/find    1.00000   1.00000   1.00000      3386
      alarm/cancel_alarm    0.98658   0.99324   0.98990       444
 reminder/show_reminders    1.00000   0.98618   0.99304       217
      alarm/snooze_alarm    0.95652   0.98876   0.97238        89
alarm/time_left_on_alarm    0.95238   0.98765   0.96970        81
      alarm/modify_alarm    0.92562   0.91803   0.92181       122
     weather/checkSunset    1.00000   1.00000   1.00000        37
    weather/checkSunrise    1.00000   1.00000   1.00000        25
       alarm/show_alarms    0.99519   0.97642   0.98571       212
   reminder/set_reminder    0.99613   0.99922   0.99767      1287
         alarm/set_alarm    0.99026   0.98706   0.98865      1236

                accuracy                        0.99434      7420
               macro avg    0.9

HBox(children=(FloatProgress(value=0.0, max=2854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=357.0, style=ProgressStyle(descr…




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


results for experiment:  train_th_test_es
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.96571   0.98830   0.97688       171
            weather/find    0.97470   0.99896   0.98668       964
      alarm/cancel_alarm    0.95025   0.88018   0.91388       217
 reminder/show_reminders    0.97674   0.95455   0.96552       132
      alarm/snooze_alarm    0.56522   0.40625   0.47273        32
alarm/time_left_on_alarm    1.00000   1.00000   1.00000        28
      alarm/modify_alarm    0.78571   0.97059   0.86842        34
     weather/checkSunset    0.00000   0.00000   0.00000         2
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.92593   0.90909   0.91743       110
   reminder/set_reminder    0.99297   0.89825   0.94324       629
         alarm/set_alarm    0.89848   0.99252   0.94316       535

               micro avg    0.95270   0.95270   0.95270      2854
               macro avg    0.7

HBox(children=(FloatProgress(value=0.0, max=1557.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=195.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


results for experiment:  train_th_test_th
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.96809   0.96809   0.96809        94
            weather/find    0.99836   1.00000   0.99918       610
      alarm/cancel_alarm    0.97692   0.92701   0.95131       137
 reminder/show_reminders    0.96610   0.91935   0.94215        62
      alarm/snooze_alarm    1.00000   1.00000   1.00000        20
alarm/time_left_on_alarm    1.00000   0.95000   0.97436        20
      alarm/modify_alarm    1.00000   0.76923   0.86957        13
     weather/checkSunset    0.00000   0.00000   0.00000         0
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.96154   0.97403   0.96774        77
   reminder/set_reminder    0.98134   0.99245   0.98687       265
         alarm/set_alarm    0.95896   0.99228   0.97533       259

               micro avg    0.98202   0.98202   0.98202      1557
               macro avg    0.8

In [16]:
args["output_dir"] = "models/intent_en_es_train"

model= ClassificationModel('xlmroberta','xlm-roberta-base', num_labels=12, args=args)
# train on full thai and eng mixed
model.train_model(en_es_full_train)
# test eng
results = custom_eval(en_test, model, "train_th_test_en")
experiment_results[results["name"]] = results


# test es
results = custom_eval(es_test, model, "train_th_test_es")
experiment_results[results["name"]] = results


# test th
results = custom_eval(th_test, model, "train_th_test_th")
experiment_results[results["name"]] = results

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

HBox(children=(FloatProgress(value=0.0, max=24913.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=3115.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=3115.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=7420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=928.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


results for experiment:  train_th_test_en
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.99644   0.98592   0.99115       284
            weather/find    1.00000   0.99941   0.99970      3386
      alarm/cancel_alarm    0.99097   0.98874   0.98985       444
 reminder/show_reminders    0.99074   0.98618   0.98845       217
      alarm/snooze_alarm    0.96667   0.97753   0.97207        89
alarm/time_left_on_alarm    0.94118   0.98765   0.96386        81
      alarm/modify_alarm    0.90551   0.94262   0.92369       122
     weather/checkSunset    1.00000   0.97297   0.98630        37
    weather/checkSunrise    0.96154   1.00000   0.98039        25
       alarm/show_alarms    0.99034   0.96698   0.97852       212
   reminder/set_reminder    0.99381   0.99845   0.99612      1287
         alarm/set_alarm    0.99026   0.98706   0.98865      1236

                accuracy                        0.99326      7420
               macro avg    0.9

HBox(children=(FloatProgress(value=0.0, max=2854.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=357.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))


results for experiment:  train_th_test_es
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.88083   0.99415   0.93407       171
            weather/find    0.92234   0.99793   0.95864       964
      alarm/cancel_alarm    0.90278   0.89862   0.90069       217
 reminder/show_reminders    0.95312   0.92424   0.93846       132
      alarm/snooze_alarm    0.68421   0.40625   0.50980        32
alarm/time_left_on_alarm    0.96552   1.00000   0.98246        28
      alarm/modify_alarm    0.53968   1.00000   0.70103        34
     weather/checkSunset    0.00000   0.00000   0.00000         2
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.94340   0.90909   0.92593       110
   reminder/set_reminder    0.99440   0.56439   0.72008       629
         alarm/set_alarm    0.76000   0.99439   0.86154       535

               micro avg    0.87982   0.87982   0.87982      2854
               macro avg    0.7

HBox(children=(FloatProgress(value=0.0, max=1557.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=195.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


results for experiment:  train_th_test_th
                          precision    recall  f1-score   support

reminder/cancel_reminder    0.96809   0.96809   0.96809        94
            weather/find    1.00000   0.99836   0.99918       610
      alarm/cancel_alarm    0.97656   0.91241   0.94340       137
 reminder/show_reminders    0.95161   0.95161   0.95161        62
      alarm/snooze_alarm    0.95238   1.00000   0.97561        20
alarm/time_left_on_alarm    1.00000   0.95000   0.97436        20
      alarm/modify_alarm    1.00000   0.84615   0.91667        13
     weather/checkSunset    0.00000   0.00000   0.00000         0
    weather/checkSunrise    0.00000   0.00000   0.00000         0
       alarm/show_alarms    0.96104   0.96104   0.96104        77
   reminder/set_reminder    0.98507   0.99623   0.99062       265
         alarm/set_alarm    0.96269   0.99614   0.97913       259

               micro avg    0.98266   0.98266   0.98266      1557
               macro avg    0.8

In [None]:
for text, predicted, real in experiment_results["train_en_test_th"]["wrong_predictions"]:
    print(text, "\t", predicted, "\t", real)

In [None]:
##### SANITY CHECK #####
def unique_sents(test_df, train_df):
    print("unique utterances in test data out of :", len(test_df))
    unique_sents = []
    train_set = set(train_df["text"])
    for sent in test_df["text"]:
        if sent not in train_set:
            unique_sents.append(sent)
    print(len(unique_sents)/len(test_df)*100,"% of the sentences are unique")

In [None]:
unique_sents(en_test,en_full_train)

In [None]:
unique_sents(es_test, es_eval)
unique_sents(es_test,es_full_train)

In [None]:
unique_sents(th_test, th_eval)
unique_sents(th_test,th_full_train)

In [None]:
predict_sent = lambda sent: mapping[model.predict([sent])[0][0]]

In [None]:
predict_sent("what's the weather in Potsdam")

In [None]:
predict_sent("don't wake me up tomorrow")

In [None]:

predict_sent("ตั้ง เวลา พรุ่ง บ่าย พรุ่งนี้")

In [None]:
predict_sent("que temperatura hay aqui")

In [None]:
predict_sent("no necesito que levantarme el sabado" )

In [None]:
predict_sent("sabado no necesito que levantarme" )

In [None]:
predict_sent("ไม่ ต้อง ปลุก ฉัน วัน เสาร์ นะ" )

In [None]:
predict_sent("วัน เสาร์ ไม่ ต้อง ปลุก ฉัน นะ")

In [None]:
predict_sent("you don't have to wake me up on saturday")

In [None]:
predict_sent("saturday you don't have to wake me up")

In [None]:
en_full_train[en_full_train["text"].str.contains("^on (saturday|sunday|monday|tuesday)",case=False, regex=True)]

In [None]:
mapping[10]

In [None]:
predict_sent("I don't have to wake up early on saturday")

In [None]:
# this is a weird sentence 
predict_sent("saturday you don't have to wake me up")

In [None]:
predict_sent("am Samstag musst du mich nicht aufwecken")

In [None]:
predict_sent("ich nicht muss aufstehen am Samstag")

In [None]:
"el sabado no necesito el despertador" 
# doesn't work
# implicit 

In [None]:
"cuanto falta hasta el alarma"
"cuanto tiempo queda hasta que me levanto"
"que temperatura hay aqui"