<a href="https://colab.research.google.com/github/VanessaSchenkel/how_to/blob/main/how_to_contrained_beam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install spacy
!python -m spacy download pt_core_news_lg

### Model

In [2]:
import spacy

# Load the large Portuguese pipeline; `nlp` is reused by later cells to parse Portuguese text.
nlp = spacy.load("pt_core_news_lg")

# Parse a sample sentence (`doc` is not read again in the visible cells).
doc = nlp("O médico acabou seu trabalho.")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier; tokenizer and seq2seq model are both loaded from it.
model_name = 'VanessaSchenkel/pt-unicamp-handcrafted'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Encode the English source sentence as PyTorch tensors.
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Unconstrained generation with 5 beams; the decoded strings are the cell's displayed value.
outputs = model.generate(input_ids, num_beams=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [4]:
# Reference translation parsed with the Portuguese pipeline
# (presumably obtained from Google Translate, judging by the variable name — confirm).
translation_google = nlp("O médico terminou seu trabalho.")
# Decoded model output: a list of strings, one per returned sequence.
translation_model = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(translation_google)
print(translation_model)

O médico terminou seu trabalho.
['A médica terminou seu trabalho.']


In [5]:
from transformers import PhrasalConstraint

sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# A single phrasal constraint built from the token ids of "acabou seu trabalho".
constraints = [
    PhrasalConstraint(tokenizer("acabou seu trabalho", add_special_tokens=False).input_ids)
]

# Constrained beam search: 10 beams, keep the 3 best hypotheses.
outputs = model.generate(input_ids, constraints=constraints, num_beams=10, num_return_sequences=3)

print(f"Output:\n{'-' * 100}")
for rank, hypothesis in enumerate(outputs):
    print(f"{rank}: {tokenizer.decode(hypothesis, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: A médica acabou seu trabalho.
1: O médico acabou seu trabalho.
2: O doutor acabou seu trabalho.


In [6]:
from transformers import DisjunctiveConstraint

sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Token ids of the two alternative word forms offered to the disjunctive constraint.
flexible_phrases = tokenizer(["médica", "médico"], add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

# Constrained beam search: 10 beams, keep the 3 best hypotheses.
outputs = model.generate(input_ids, constraints=constraints, num_beams=10, num_return_sequences=3)

print(f"Output:\n{'-' * 100}")
for rank, hypothesis in enumerate(outputs):
    print(f"{rank}: {tokenizer.decode(hypothesis, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho.
1: A médica terminou o seu trabalho.
2: O médico terminou seu trabalho.


In [7]:
print(translation_google)
print(translation_model)

O médico terminou seu trabalho.
['A médica terminou seu trabalho.']


In [8]:
translation_model = nlp(translation_model[0])
translation_model

A médica terminou seu trabalho.

In [None]:
# Print the spaCy annotations of every token in the parsed model translation.
# Each attribute is printed once (the original printed `pos_` twice).
for token in translation_model:
    print(token.text)
    print("ancestors -> ", list(token.ancestors))
    print("children ->", list(token.children))
    print("dep_ ->", token.dep_)
    print("head ->", token.head)
    # NOTE(review): this checks the token against itself — confirm that is the intended probe.
    print("is_ancestor ->", token.is_ancestor(token))
    print("lemma_ ->", token.lemma_)
    print("morph ->", token.morph)
    print("pos_ ->", token.pos_)
    print("norm_ ->", token.norm_)
    print("tag_ ->", token.tag_)
    print("suffix_ ->", token.suffix_)
    print("------")
 

In [10]:
teste = nlp("A médica terminou o trabalho, porém o enfermeiro ainda não. Outra frase aqui.")

for n in teste.noun_chunks:
  print(n)

A médica
o trabalho
, porém o enfermeiro
Outra frase


In [11]:
for ex in translation_google.noun_chunks:
  print(ex)

O médico
seu trabalho


In [12]:
teste = nlp("A médica era linda, o enfermeiro nem tanto")
for ex in teste.noun_chunks:
  print(ex)


A médica
o enfermeiro


In [None]:
# Print the spaCy annotations of every token in `teste`, one "label value" line per attribute.
for token in teste:
    print("TOKEN:  ", token.text)
    fields = [
        ("ancestors -> ", list(token.ancestors)),
        ("children ->", list(token.children)),
        ("dep_ ->", token.dep_),
        ("head ->", token.head),
        ("is_ancestor ->", token.is_ancestor(token)),
        ("lemma_ ->", token.lemma_),
        ("morph ->", token.morph),
        ("pos_ ->", token.pos_),
        ("norm_ ->", token.norm_),
        ("tag_ ->", token.tag_),
        ("suffix_ ->", token.suffix_),
    ]
    for label, value in fields:
        print(label, value)
    print("--------------------")

In [14]:
teste = nlp("A médica comeu a comida dela, o enfermeiro comeu o sanduíche dele")

# Column name -> function extracting that attribute from a spaCy token.
# Insertion order defines the column order of `table`.
token_columns = {
    'text': lambda token: token.text,
    'anc': lambda token: list(token.ancestors),
    'child': lambda token: list(token.children),
    'dep': lambda token: token.dep_,
    'head': lambda token: token.head,
    'lemma': lambda token: token.lemma_,
    'morph': lambda token: token.morph,
    'pos': lambda token: token.pos_,
    'norm': lambda token: token.norm_,
    'pref': lambda token: token.prefix_,
    'suffix': lambda token: token.suffix_,
}

# One list per column, each holding one value per token of `teste`.
# This replaces eleven hand-maintained parallel lists (and an unused `tag` list).
table = {
    column: [extract(token) for token in teste]
    for column, extract in token_columns.items()
}

In [15]:
import pandas as pd

# Turn the per-column lists in `table` into a DataFrame (one row per token).
df = pd.DataFrame(table)

display(df)

Unnamed: 0,text,anc,child,dep,head,lemma,morph,pos,norm,pref,suffix
0,A,"[médica, comeu]",[],det,médica,o,"(Definite=Def, Gender=Fem, Number=Sing, PronTy...",DET,a,A,A
1,médica,[comeu],[A],nsubj,comeu,médica,"(Gender=Fem, Number=Sing)",NOUN,médica,m,ica
2,comeu,[],"[médica, comida, ,, comeu]",ROOT,comeu,comeu,"(Mood=Sub, Number=Sing, Person=3, Tense=Past, ...",VERB,comeu,c,meu
3,a,"[comida, comeu]",[],det,comida,o,"(Definite=Def, Gender=Fem, Number=Sing, PronTy...",DET,a,a,a
4,comida,[comeu],"[a, dela]",obj,comeu,comida,"(Gender=Fem, Number=Sing)",NOUN,comida,c,ida
5,dela,"[comida, comeu]",[],nmod,comida,de ele,"(Gender=Fem, Number=Sing, Person=3, PronType=Prs)",PRON,dela,d,ela
6,",",[comeu],[],punct,comeu,",",(),PUNCT,",",",",","
7,o,"[enfermeiro, comeu, comeu]",[],det,enfermeiro,o,"(Definite=Def, Gender=Masc, Number=Sing, PronT...",DET,o,o,o
8,enfermeiro,"[comeu, comeu]",[o],nsubj,comeu,enfermeiro,"(Gender=Masc, Number=Sing)",NOUN,enfermeiro,e,iro
9,comeu,[comeu],"[enfermeiro, sanduíche]",conj,comeu,comeu,"(Mood=Sub, Number=Sing, Person=3, Tense=Past, ...",VERB,comeu,c,meu


In [16]:
for n in teste.noun_chunks:
  print(n)

A médica
a comida
dela
o enfermeiro
o sanduíche
dele


In [17]:
from transformers import PhrasalConstraint

sentence = "The doctor finished her work, the nurse has his job."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# A single phrasal constraint built from the token ids of "acabou seu trabalho".
constraints = [
    PhrasalConstraint(tokenizer("acabou seu trabalho", add_special_tokens=False).input_ids)
]

# Constrained beam search: 10 beams, keep the 3 best hypotheses.
outputs = model.generate(input_ids, constraints=constraints, num_beams=10, num_return_sequences=3)

print(f"Output:\n{'-' * 100}")
for rank, hypothesis in enumerate(outputs):
    print(f"{rank}: {tokenizer.decode(hypothesis, skip_special_tokens=True)}")



Output:
----------------------------------------------------------------------------------------------------
0: O médico acabou seu trabalho, a enfermeira tem o seu trabalho.
1: O médico acabou seu trabalho, o enfermeiro tem o seu trabalho.
2: A médica acabou seu trabalho, a enfermeira tem o seu trabalho.


In [18]:
from transformers import DisjunctiveConstraint

sentence = "The doctor finished her work, the nurse has his job."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Token ids of the four alternative word forms offered to the single disjunctive constraint.
candidate_words = ["médica", "médico", "enfermeira", "enfermeiro"]
flexible_phrases = tokenizer(candidate_words, add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

# Constrained beam search: 10 beams, keep the 6 best hypotheses.
outputs = model.generate(input_ids, constraints=constraints, num_beams=10, num_return_sequences=6)

print(f"Output:\n{'-' * 100}")
for rank, hypothesis in enumerate(outputs):
    print(f"{rank}: {tokenizer.decode(hypothesis, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho, a enfermeira tem o seu trabalho.
1: O médico terminou seu trabalho, a enfermeira tem o seu trabalho.
2: A médica terminou seu trabalho, a enfermeira tem o seu emprego.
3: O médico terminou seu trabalho, o enfermeiro tem o seu trabalho.
4: O médico terminou seu trabalho, a enfermeira tem o seu emprego.
5: O médico terminou seu trabalho, o enfermeiro tem o seu emprego.


### RoBERTa

In [None]:
!pip install torch torchvision
!pip install fairseq
!pip install sacremoses
!python -m spacy download en_core_web_lg

In [None]:
import torch

# Load the `roberta.large.wsc` entry point from the `pytorch/fairseq` hub repository.
# The returned object provides `disambiguate_pronoun`, which the following cells call.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')

In [21]:
source_sentence = "The doctor finished her work."
trans_google = "O médico terminou seu trabalho."
trans_model = "A médica acabou seu trabalho."

In [22]:
roberta.disambiguate_pronoun('A médica terminou o trabalho [dela].')

'médica'

In [23]:
roberta.disambiguate_pronoun('The doctor finished [her] work.')

'The doctor'

In [38]:
tokens = roberta.encode("Hello world")
t = roberta.extract_features(tokens)


tensor([[[-3.4100e-02, -5.7759e-02, -1.3744e-01,  ..., -1.0358e-01,
           1.1683e-03, -6.4825e-02],
         [ 2.5671e-01,  5.2687e-01, -1.6095e+00,  ..., -1.1937e-01,
          -6.5655e-01, -1.1647e-01],
         [ 4.5235e-01, -6.6283e-02, -8.3834e-01,  ...,  6.5676e-01,
          -2.0854e-02,  2.8804e-01],
         [ 3.0257e-03, -7.1395e-02, -1.2563e-01,  ..., -1.5921e-01,
           5.7723e-03, -7.5381e-02]]], grad_fn=<TransposeBackward0>)

Tem indicativo de gênero na frase? 

In [24]:
import spacy

# English pipeline used to analyse the source sentence.
nlp_en = spacy.load("en_core_web_lg")

# NOTE(review): this rebinds `source_sentence` from a plain string to a parsed Doc, so the
# cell is not idempotent — running it a second time passes a Doc (not a str) to `nlp_en`.
source_sentence = nlp_en(source_sentence)

# Print the coarse part-of-speech tag of each token.
for token in source_sentence:
  print(token.pos_)

DET
NOUN
VERB
PRON
NOUN
PUNCT


In [25]:
# True when at least one token of the source sentence is tagged as a pronoun.
# The tag is compared by value (`==`); `is` tests object identity, which only happens to
# work for strings when both sides are the same interned object.
has_pronoun = any(token.pos_ == 'PRON' for token in source_sentence)

print(has_pronoun)

True


In [26]:
# Holds the last token tagged as a pronoun; stays '' when the sentence has none.
get_pronoun = ''

for token in source_sentence:
    # Compare the tag by value; `is` would depend on string interning.
    if token.pos_ == 'PRON':
        get_pronoun = token

print(get_pronoun)

her


In [27]:
pronoun_text = "[" + get_pronoun.text + "]"

# Bracket the pronoun at its own character offset in the sentence. A plain `str.replace`
# would wrap every occurrence of the same characters, including inside other words
# (e.g. the "he" inside "The").
pronoun_start = get_pronoun.idx
pronoun_end = pronoun_start + len(get_pronoun.text)
new_source_sentence = (
    source_sentence.text[:pronoun_start] + pronoun_text + source_sentence.text[pronoun_end:]
)

new_source_sentence

'The doctor finished [her] work.'

In [28]:
subj = roberta.disambiguate_pronoun(new_source_sentence)

subj

'The doctor'

In [29]:
# "Gender" values of the selected pronoun, as returned by its morphological analysis.
gender = get_pronoun.morph.get("Gender")

# Exactly one of the three flags ends up True.
is_fem = gender == ['Fem']
is_masc = gender == ['Masc']
# Anything that is neither exactly ['Fem'] nor exactly ['Masc'] counts as neutral.
is_neutral = not (is_fem or is_masc)

print(is_fem)
print(is_masc)
print(is_neutral)

True
False
False


In [30]:
subj_model = roberta.disambiguate_pronoun('A médica terminou o trabalho [dela].')

In [31]:
subj_model

'médica'

In [32]:
subj

'The doctor'

In [33]:
test_model = nlp(subj_model)
test_model

médica

In [34]:
test_source = nlp_en(subj)
test_source

The doctor

In [35]:
for i in test_model: 
  print(i.morph)

Gender=Fem|Number=Sing


In [36]:
for i in test_source: 
  print(i.morph)

Definite=Def|PronType=Art
Number=Sing


In [37]:

# "Gender" values of every token in the parsed subject of the translation.
token_genders = [token.morph.get("Gender") for token in test_model]

# A flag is True as soon as one token falls into that category.
is_fem_trans = ['Fem'] in token_genders
is_masc_trans = ['Masc'] in token_genders
# Tokens whose value is neither exactly ['Fem'] nor exactly ['Masc'] count as neutral.
is_neutral_trans = any(values not in (['Fem'], ['Masc']) for values in token_genders)

print(is_fem_trans)
print(is_masc_trans)
print(is_neutral_trans)

True
False
False


In [38]:
# Each flag is True only when the source-side flag and the translation-side flag
# for the same gender category are both True.
are_both_fem = is_fem and is_fem_trans
are_both_masc = is_masc and is_masc_trans
are_both_neutral = is_neutral and is_neutral_trans

print(are_both_fem)
print(are_both_masc)
print(are_both_neutral)

True
False
False


In [None]:
!pip install datasets

In [40]:
from datasets import load_dataset
# Load the "VanessaSchenkel/pt_gender" dataset, passing `field="data"` to the loader
# (presumably the JSON key that holds the records — confirm against the dataset files).
remote_dataset = load_dataset("VanessaSchenkel/pt_gender", field="data")
remote_dataset




Downloading and preparing dataset json/VanessaSchenkel--pt_gender to /root/.cache/huggingface/datasets/VanessaSchenkel___json/VanessaSchenkel--pt_gender-95b0f9b96d3847c6/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/VanessaSchenkel___json/VanessaSchenkel--pt_gender-95b0f9b96d3847c6/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['root', 'words'],
        num_rows: 321265
    })
})

In [41]:
t = remote_dataset.filter(lambda example: "médica" in example['words'])

t

  0%|          | 0/322 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['root', 'words'],
        num_rows: 1
    })
})

In [42]:
t['train'][0]

{'root': 'médico', 'words': ['médico', 'médicos', 'médica', 'médicas']}

In [43]:
words_with_gender = t['train'][0]['words']
words_with_gender

['médico', 'médicos', 'médica', 'médicas']

In [122]:
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Token ids of the two alternative word forms offered to the disjunctive constraint.
flexible_phrases = tokenizer(["médica", "médico"], add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

# Constrained beam search: 10 beams, keep the 6 best hypotheses.
outputs = model.generate(input_ids, constraints=constraints, num_beams=10, num_return_sequences=6)

print(f"Output:\n{'-' * 100}")
for rank, hypothesis in enumerate(outputs):
    print(f"{rank}: {tokenizer.decode(hypothesis, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho.
1: A médica terminou o seu trabalho.
2: O médico terminou seu trabalho.
3: A médica acabou seu trabalho.
4: O médico terminou o seu trabalho.
5: A médica acabou o seu trabalho.


**constraints** (List[Constraint], optional) — Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by Constraint objects, in the most sensible way possible.

**force_words_ids** (List[List[int]] or List[List[List[int]]], optional) — List of token ids that must be generated. If given a List[List[int]], this is treated as a simple list of words that must be included, the opposite to bad_words_ids. If given List[List[List[int]]], this triggers a disjunctive constraint, where one can allow different forms of each word.

In [157]:
sentence = "The developer finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

force_word = "terminou o trabalho"
force_flexible = ["desenvolvedor", "desenvolvedora"]

# Each entry is a list of alternative token-id sequences: the first offers a single
# phrase, the second offers either form of the noun.
force_words_ids = [
    tokenizer([force_word], add_special_tokens=False).input_ids,
    tokenizer(force_flexible, add_special_tokens=False).input_ids,
]

# Generation with forced words runs (constrained) beam search, not sampling, so the
# sampling-only options previously passed here (top_k, top_p, temperature) had no effect
# and are omitted.
outputs = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    no_repeat_ngram_size=4,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A desenvolvedora terminou o trabalho.
1: O desenvolvedor terminou o trabalho.
2: O desenvolvedora terminou o trabalho.


Source tem gênero e é o mesmo da tradução?

In [159]:
source_sentence = "The doctor finished her work"

print(source_sentence)
print(translation_google)
print(translation_model)

The doctor finished her work
O médico terminou seu trabalho.
A médica terminou seu trabalho.


In [199]:
def get_pronoun(sentence):
    """Return the tokens of ``sentence`` whose coarse part-of-speech tag is ``'PRON'``.

    Args:
        sentence: An iterable of tokens exposing a ``pos_`` attribute
            (e.g. a parsed spaCy document).

    Returns:
        A list with the matching tokens, in sentence order; empty when there are none.
    """
    # Compare the tag by value: `is` checks object identity and only matches when both
    # strings happen to be the same interned object.
    return [token for token in sentence if token.pos_ == 'PRON']


In [209]:
source_sentence = nlp_en("The doctor finished her work.")

pronouns_list = get_pronoun(source_sentence)

print(pronouns_list)

[her]


In [211]:
def get_disambiguate_pronoun(sentence, pronouns_list):
    """Ask the RoBERTa WSC model which noun chunk each pronoun refers to.

    For every pronoun, a copy of the sentence text is built with that single pronoun
    wrapped in square brackets and passed to ``roberta.disambiguate_pronoun``.

    Args:
        sentence: Parsed document exposing ``text``.
        pronouns_list: Tokens of ``sentence`` exposing ``text`` and ``idx`` (the
            character offset of the token inside ``sentence.text``).

    Returns:
        A list with one model answer per pronoun, in the order given.
    """
    text = sentence.text
    noun_chunks_list = []

    for pronoun in pronouns_list:
        # Bracket the pronoun at its own offset. `str.replace` would wrap every
        # occurrence of the same characters, including inside other words
        # (e.g. the "he" inside "The").
        start = pronoun.idx
        end = start + len(pronoun.text)
        marked_sentence = text[:start] + "[" + pronoun.text + "]" + text[end:]
        noun_chunks_list.append(roberta.disambiguate_pronoun(marked_sentence))

    return noun_chunks_list

In [212]:
disambiguation = get_disambiguate_pronoun(source_sentence, pronouns_list)
disambiguation

['The doctor']

In [223]:
def get_sentence_gender(sentence):
    """Collect one "Gender" value for every token that has one.

    Args:
        sentence: An iterable of tokens whose ``morph.get("Gender")`` returns a list.

    Returns:
        A list holding, for each token with a non-empty "Gender" list, the last value
        of that list, in token order.
    """
    genders_per_token = (token.morph.get("Gender") for token in sentence)
    return [values[-1] for values in genders_per_token if len(values) > 0]


In [224]:
get_sentence_gender(source_sentence)

['Fem']

In [228]:
def is_source_and_translated_same_gender(source, translated):
    """Tell whether both inputs yield the same gender list.

    Both gender lists (from ``get_sentence_gender``) are printed on one line before
    the comparison.

    Args:
        source: Token iterable for the source side.
        translated: Token iterable for the translated side.

    Returns:
        True when the two gender lists are equal, False otherwise.
    """
    genders = [get_sentence_gender(side) for side in (source, translated)]
    print(*genders)
    return genders[0] == genders[1]

In [232]:
translated = nlp("A médica terminou o seu trabalho")

is_source_and_translated_same_gender(source_sentence, translated)

['Fem'] ['Fem', 'Fem', 'Masc', 'Masc', 'Masc']


False

In [233]:
pronouns_list_pt = get_pronoun(translated)
pronouns_list_pt

[]

In [234]:
for token in translated.noun_chunks:
  print(token)

A médica
o seu trabalho


In [258]:
for token in source_sentence.noun_chunks:
  print(token)

The doctor
her work


In [268]:
for token in source_sentence:
  print(token.dep_)

det
nsubj
ROOT
poss
dobj
punct


In [267]:
for token in translated:
  print(token.dep_)

det
nsubj
ROOT
det
det
obj


In [269]:
spacy.explain('nsubj')

'nominal subject'

In [270]:
def get_nsub_sentence(sentence):
    """Return the tokens of ``sentence`` whose dependency label is ``'nsubj'``.

    Args:
        sentence: An iterable of tokens exposing a ``dep_`` attribute.

    Returns:
        A list with the matching tokens, in sentence order.
    """
    return [token for token in sentence if token.dep_ == 'nsubj']

In [271]:
nsub_translated = get_nsub_sentence(translated)
nsub_translated

[médica]

In [272]:
is_source_and_translated_same_gender(source_sentence, nsub_translated)

['Fem'] ['Fem']


True

In [274]:
translation_google = nlp("O médico acabou seu trabalho")

In [297]:
nsub_translated_google = get_nsub_sentence(translation_google)
nsub_translated_google

[médico]

In [276]:
# Column name -> function extracting that attribute from a spaCy token.
# Insertion order defines the column order of `table`.
token_columns = {
    'text': lambda token: token.text,
    'anc': lambda token: list(token.ancestors),
    'child': lambda token: list(token.children),
    'dep': lambda token: token.dep_,
    'head': lambda token: token.head,
    'lemma': lambda token: token.lemma_,
    'morph': lambda token: token.morph,
    'pos': lambda token: token.pos_,
    'norm': lambda token: token.norm_,
    'pref': lambda token: token.prefix_,
    'suffix': lambda token: token.suffix_,
}

# One list per column, each holding one value per token of `translation_google`.
# This replaces eleven hand-maintained parallel lists (and an unused `tag` list).
table = {
    column: [extract(token) for token in translation_google]
    for column, extract in token_columns.items()
}

# One row per token, one column per collected attribute.
df = pd.DataFrame(table)

display(df)

Unnamed: 0,text,anc,child,dep,head,lemma,morph,pos,norm,pref,suffix
0,O,"[médico, acabou]",[],det,médico,o,"(Definite=Def, Gender=Masc, Number=Sing, PronT...",DET,o,O,O
1,médico,[acabou],[O],nsubj,acabou,médico,"(Gender=Masc, Number=Sing)",NOUN,médico,m,ico
2,acabou,[],"[médico, trabalho]",ROOT,acabou,acabar,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",VERB,acabou,a,bou
3,seu,"[trabalho, acabou]",[],det,trabalho,seu,"(Gender=Masc, Number=Sing, PronType=Prs)",DET,seu,s,seu
4,trabalho,[acabou],[seu],obj,acabou,trabalho,"(Gender=Masc, Number=Sing)",NOUN,trabalho,t,lho


In [325]:
def get_constrained_sentence(translation, nsub):
    """Rebuild the translation text without its subject tokens and their children.

    Args:
        translation: Iterable of tokens exposing ``text``.
        nsub: Subject tokens to drop; each exposes ``children``. May be empty, in
            which case no token is dropped.

    Returns:
        The remaining token texts, each followed by a single space (so a non-empty
        result ends with a trailing space, as callers already expect and strip).
    """
    # Drop every subject and the direct children of every subject. The original only
    # looked at nsub[0].children, which raised IndexError for an empty list and left
    # the children of any further subject in the output.
    excluded = list(nsub)
    for subject in nsub:
        excluded.extend(subject.children)

    return "".join(token.text + " " for token in translation if token not in excluded)

In [328]:
constrained_sentence = get_constrained_sentence(translation_google, nsub_translated_google)
constrained_sentence

'acabou seu trabalho '

In [339]:
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Phrase that must appear in the output, without the trailing space left by
# get_constrained_sentence.
force_word = constrained_sentence.strip()

force_words_ids = [
    tokenizer([force_word], add_special_tokens=False).input_ids,
]

# Generation with forced words runs (constrained) beam search, not sampling, so the
# sampling-only options previously passed here (top_k, top_p, temperature) had no effect
# and are omitted.
outputs = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    no_repeat_ngram_size=4,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))



Output:
----------------------------------------------------------------------------------------------------
0: A médica acabou seu trabalho.
1: O médico acabou seu trabalho.
2: A doutora acabou seu trabalho.


Source tem gênero e não é o mesmo da tradução?

Source não tem gênero?

Source tem mais de um gênero?

Adicionando a tradução neutra