<a href="https://colab.research.google.com/github/VanessaSchenkel/how_to/blob/main/how_to_word_align.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy
!python -m spacy download en_core_web_trf
!python -m spacy download pt_core_news_lg

In [3]:
import spacy
from spacy.tokens import Doc
from spacy.training import Example

# Load both pipelines once; later cells reuse these two objects.
# `nlp_en` is the English pipeline, `nlp` is the Portuguese one.
nlp_en = spacy.load("en_core_web_trf")
nlp = spacy.load("pt_core_news_lg")

In [4]:
# Predicted tokenization: "sunscreen" is kept as a single token.
# The `spaces` flags say whether each token is followed by whitespace.
pred_words = ["Apply", "some", "sunscreen"]
pred_spaces = [True, True, False]
# Reference (gold) tokenization: "sunscreen" is split into "sun" + "screen".
# No space after "sun", so both docs spell out the same text.
gold_words = ["Apply", "some", "sun", "screen"]
gold_spaces = [True, True, False, False]
# One tag per gold token.
gold_tags = ["VERB", "DET", "NOUN", "NOUN"]
# Build both docs on the English vocab; only the reference carries tags.
predicted = Doc(nlp_en.vocab, words=pred_words, spaces=pred_spaces)
reference = Doc(nlp_en.vocab, words=gold_words, spaces=gold_spaces, tags=gold_tags)
# An Example pairs the predicted doc with its reference annotation.
example = Example(predicted, reference)

In [7]:
# Show the reference tags aligned to the example, as strings.
# The recorded output has one entry per predicted token (3, not 4).
example.get_aligned("TAG", as_string=True)

['VERB', 'DET', 'NOUN']

In [8]:
# Show the aligned dependency parse as a pair of lists.
# NOTE(review): the reference Doc above was built without heads/deps, which is
# presumably why the recorded labels are empty — confirm.
example.get_aligned_parse(projectivize=True)

([0, 1, None], ['', '', None])

In [9]:
# Predicted tokenization: "Mrs" and "Smith" are separate, "New York" is one token.
words = ["Mrs", "Smith", "flew", "to", "New York"]
doc = Doc(nlp_en.vocab, words=words)
# Entity spans as (start, end, label). In "Mrs Smith flew to New York",
# characters 0-9 cover "Mrs Smith" and 18-26 cover "New York".
entities = [(0, 9, "PERSON"), (18, 26, "LOC")]
# Gold tokenization differs: "Mrs Smith" is one token, "New" / "York" are split.
gold_words = ["Mrs Smith", "flew", "to", "New", "York"]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
# Entity tags projected onto the tokens of `doc` (displayed in the next cell).
ner_tags = example.get_aligned_ner()

In [10]:
# Display the aligned entity tags, one per token of `doc`.
ner_tags

['B-PERSON', 'L-PERSON', 'O', 'O', 'U-LOC']

In [11]:
from spacy.training import Alignment

# Two tokenizations of the same text: the first splits "'s" into "'" and "s".
bert_tokens = ["obama", "'", "s", "podcast"]
spacy_tokens = ["obama", "'s", "podcast"]
# Align the two token lists against each other.
alignment = Alignment.from_strings(bert_tokens, spacy_tokens)
# x2y maps from the first list (bert_tokens) to the second (spacy_tokens).
a2b = alignment.x2y

In [13]:
# Flat index array of the mapping. The recorded output [0, 1, 1, 2] shows
# "'" and "s" both pointing at index 1 ("'s") of spacy_tokens.
a2b.data

array([0, 1, 1, 2], dtype=int32)

In [14]:
# Negative example: these two token lists spell out different sentences, so
# they are not two tokenizations of one text. Alignment.from_strings raises
# ValueError here (see the recorded output). Catch it and print the message so
# the demonstration stays visible without aborting "Run All".
bert_tokens = ["A", "desenvolvedora", "argumentou", "com", "o", "design"]
spacy_tokens = ["O", "desenvolvedor", "discutiu", "com", "o", "design"]
try:
    alignment = Alignment.from_strings(bert_tokens, spacy_tokens)
    # Only reached if the alignment succeeds; on failure `alignment` and `a2b`
    # keep the values from the previous cell, as before.
    a2b = alignment.x2y
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [15]:
# Portuguese variant of the sunscreen example.
# Predicted tokenization keeps "protetor solar" as a single token.
pred_words = ["Aplicar", "algum", "protetor solar"]
pred_spaces = [True, True, False]
# Reference tokenization splits it into "protetor" + "solar". Unlike
# "sun" + "screen", these are two words separated by a space, so "protetor"
# must be followed by whitespace. Otherwise the reference text would read
# "protetorsolar" and differ from the predicted text.
gold_words = ["Aplicar", "algum", "protetor", "solar"]
gold_spaces = [True, True, True, False]
# One tag per gold token.
gold_tags = ["VERB", "DET", "NOUN", "NOUN"]
# NOTE(review): the docs are built on the English vocab although the text is
# Portuguese; `nlp.vocab` may be the intended vocab — confirm.
predicted = Doc(nlp_en.vocab, words=pred_words, spaces=pred_spaces)
reference = Doc(nlp_en.vocab, words=gold_words, spaces=gold_spaces, tags=gold_tags)
example = Example(predicted, reference)

In [21]:
# Show the reference tags aligned to the Portuguese example, as strings.
# The recorded output has one entry per predicted token.
example.get_aligned("TAG", as_string=True)

['VERB', 'DET', 'NOUN']

In [26]:
# Display the text of the example (the recorded output is the full sentence).
example.text

'Aplicar algum protetor solar'

In [27]:
# Run the Portuguese pipeline on the two sentence variants being compared.
# NOTE(review): `sent1` / `sent2` are rebound to plain word lists in a later
# cell, so these parsed docs are not available under these names after that.
sent1 = nlp("O desenvolvedor discutiu com o designer porque não gostava do desenho.")
sent2 = nlp("A desenvolvedora argumentou com o designer porque ela não gostou do design.")

In [None]:
!pip install simalign

In [30]:
from simalign import SentenceAligner

# Build the aligner once. The constructor takes the embedding model, the token
# granularity and the matching methods to compute.
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)

# Both sentences are passed in already split into words.
src_sentence = ["This", "is", "a", "test", "."]
trg_sentence = ["Das", "ist", "ein", "Test", "."]

# The result is a dict keyed by matching method. Each value is a list of
# zero-indexed (source index, target index) pairs.
alignments = myaligner.get_word_aligns(src_sentence, trg_sentence)

for method_name, index_pairs in alignments.items():
    print(method_name, ":", index_pairs)

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

2022-10-21 23:47:08,153 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


mwmf : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]
inter : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]
itermax : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]


In [36]:
# Word-level tokens of the two Portuguese sentence variants. The source has
# one extra word ("ela") with no counterpart in the target.
src_sentence = [
    "A", "desenvolvedora", "argumentou", "com", "o", "designer",
    "porque", "ela", "não", "gostou", "do", "design",
]
trg_sentence = [
    "O", "desenvolvedor", "discutiu", "com", "o", "designer",
    "porque", "não", "gostava", "do", "desenho",
]

# Recompute the alignments for this pair with the aligner built earlier.
alignments = myaligner.get_word_aligns(src_sentence, trg_sentence)

In [38]:
# According to the article, "inter" (argmax) works better for close language
# pairs, so it should also work better within a single language; it tends to
# give higher Kendall tau values than the other alignment methods.
# NOTE(review): this cell displays the "itermax" result, not "inter" —
# confirm which method is meant to be shown here.
alignments['itermax']

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (6, 6),
 (8, 7),
 (9, 8),
 (10, 9),
 (11, 10)]

In [48]:
# List the matching methods available in the result. A bare `myaligner` would
# only show the object's repr, not the list of names recorded as this cell's
# output.
list(alignments)

['mwmf', 'inter', 'itermax']

In [49]:
from scipy.stats import kendalltau

# Split the "inter" alignment pairs into source-side and target-side indices.
inter_pairs = alignments["inter"]
l1 = [pair[0] for pair in inter_pairs]
l2 = [pair[1] for pair in inter_pairs]

# Rank correlation between the two index sequences. A value of 1.0 means the
# aligned words appear in the same order in both sentences.
correlation, p_value = kendalltau(l1, l2)
print(correlation)


1.0


In [50]:
# Target-side indices that take part in at least one "mwmf" alignment pair.
corr_pairs = alignments["mwmf"]
corr_tokens = [trg_idx for _, trg_idx in corr_pairs]
# 1 for each target position covered by an alignment, 0 otherwise.
corr_mask = [int(pos in corr_tokens) for pos in range(len(trg_sentence))]
print(corr_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [51]:
print("Possible Alignments From SimAlign")
print("Word in Sent 1 -----> Word in Sent 2")
# Collect the aligned words side by side: sent1[i] is aligned with sent2[i].
sent1, sent2 = [], []
for src_idx, trg_idx in alignments['itermax']:
    src_word = src_sentence[src_idx]
    trg_word = trg_sentence[trg_idx]
    print(src_word, "---------->", trg_word)
    sent1.append(src_word)
    sent2.append(trg_word)

Possible Alignments From SimAlign
Word in Sent 1 -----> Word in Sent 2
A ----------> O
desenvolvedora ----------> desenvolvedor
argumentou ----------> discutiu
com ----------> com
o ----------> o
designer ----------> designer
porque ----------> porque
não ----------> não
gostou ----------> gostava
do ----------> do
design ----------> desenho


In [53]:
# Materialize the pairs as a list. A bare zip object is a one-shot iterator,
# so displaying it with list() would leave `word_align_pairs` empty for any
# later use.
word_align_pairs = list(zip(sent1, sent2))
word_align_pairs

[('A', 'O'),
 ('desenvolvedora', 'desenvolvedor'),
 ('argumentou', 'discutiu'),
 ('com', 'com'),
 ('o', 'o'),
 ('designer', 'designer'),
 ('porque', 'porque'),
 ('não', 'não'),
 ('gostou', 'gostava'),
 ('do', 'do'),
 ('design', 'desenho')]