<a href="https://colab.research.google.com/github/VanessaSchenkel/how_to/blob/main/how_to_contrained_beam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers
!pip install spacy
!python -m spacy download pt_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.0-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 92.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.13.1 transformers-4.23.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pt-core-news-lg==3.4.0

### Model

In [3]:
import spacy

nlp = spacy.load("pt_core_news_lg")

doc = nlp("O médico acabou seu trabalho.")

In [112]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'VanessaSchenkel/pt-unicamp-handcrafted'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

outputs = model.generate(input_ids, num_beams=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['A médica terminou seu trabalho.']

In [5]:
translation_google = nlp("O médico terminou seu trabalho.")
translation_model = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(translation_google)
print(translation_model)

O médico terminou seu trabalho.
['A médica terminou seu trabalho.']


In [6]:
from transformers import PhrasalConstraint

sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

constraints = [
    PhrasalConstraint(
        tokenizer("acabou seu trabalho", add_special_tokens=False).input_ids
    )
]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A médica acabou seu trabalho.
1: O médico acabou seu trabalho.
2: O doutor acabou seu trabalho.


In [7]:
from transformers import DisjunctiveConstraint

sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

flexible_phrases = tokenizer(["médica", "médico"], add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho.
1: A médica terminou o seu trabalho.
2: O médico terminou seu trabalho.


In [8]:
print(translation_google)
print(translation_model)

O médico terminou seu trabalho.
['A médica terminou seu trabalho.']


In [9]:
translation_model = nlp(translation_model[0])
translation_model

A médica terminou seu trabalho.

In [None]:
for token in translation_model:
  print(token.text)
  print("ancestors -> ", [ancestor for ancestor in token.ancestors])
  print("children ->", [child for child in token.children])
  print("dep_ ->",token.dep_)
  print("head ->", token.head)
  print("is_ancestor ->", token.is_ancestor(token))
  print("lemma_ ->", token.lemma_)
  print("morph ->", token.morph)
  print("pos_ ->", token.pos_)
  print("norm_ ->", token.norm_)
  print("pos_ ->", token.pos_)
  print("tag_ ->", token.tag_)
  print("suffix_ ->", token.suffix_ )
  print("------")
 

In [11]:
teste = nlp("A médica terminou o trabalho, porém o enfermeiro ainda não. Outra frase aqui.")

for n in teste.noun_chunks:
  print(n)

A médica
o trabalho
, porém o enfermeiro
Outra frase


In [12]:
for ex in translation_google.noun_chunks:
  print(ex)

O médico
seu trabalho


In [13]:
teste = nlp("A médica era linda, o enfermeiro nem tanto")
for ex in teste.noun_chunks:
  print(ex)


A médica
o enfermeiro


In [None]:
for token in teste:
  print("TOKEN:  ", token.text)
  print("ancestors -> ", [ancestor for ancestor in token.ancestors])
  print("children ->", [child for child in token.children])
  print("dep_ ->",token.dep_)
  print("head ->", token.head)
  print("is_ancestor ->", token.is_ancestor(token))
  print("lemma_ ->", token.lemma_)
  print("morph ->", token.morph)
  print("pos_ ->", token.pos_)
  print("norm_ ->", token.norm_)
  print("tag_ ->", token.tag_)
  print("suffix_ ->", token.suffix_ )
  print("--------------------")

In [15]:
teste = nlp("A médica comeu a comida dela, o enfermeiro comeu o sanduíche dele")

table = {}
text_list = []
anc = []
child = []
dep = []
head = []
lemma = []
morph = []
pos = []
tag = []
norm = []
suffix = []
pref = []

for token in teste:
  text_list.append(token.text)
  anc.append([ancestor for ancestor in token.ancestors])
  child.append([child for child in token.children])
  dep.append(token.dep_)
  head.append(token.head)
  lemma.append(token.lemma_)
  morph.append(token.morph)
  pos.append(token.pos_)
  norm.append(token.norm_)
  pref.append(token.prefix_)
  suffix.append(token.suffix_)

table['text'] = text_list
table['anc'] = anc
table['child'] = child
table['dep'] = dep
table['head'] = head
table['lemma'] = lemma
table['morph'] = morph
table['pos'] = pos
table['norm'] = norm
table['pref'] = pref
table['suffix'] = suffix

In [16]:
import pandas as pd

df = pd.DataFrame(table)

display(df)

Unnamed: 0,text,anc,child,dep,head,lemma,morph,pos,norm,pref,suffix
0,A,"[médica, comeu]",[],det,médica,o,"(Definite=Def, Gender=Fem, Number=Sing, PronTy...",DET,a,A,A
1,médica,[comeu],[A],nsubj,comeu,médica,"(Gender=Fem, Number=Sing)",NOUN,médica,m,ica
2,comeu,[],"[médica, comida, ,, comeu]",ROOT,comeu,comeu,"(Mood=Sub, Number=Sing, Person=3, Tense=Past, ...",VERB,comeu,c,meu
3,a,"[comida, comeu]",[],det,comida,o,"(Definite=Def, Gender=Fem, Number=Sing, PronTy...",DET,a,a,a
4,comida,[comeu],"[a, dela]",obj,comeu,comida,"(Gender=Fem, Number=Sing)",NOUN,comida,c,ida
5,dela,"[comida, comeu]",[],nmod,comida,de ele,"(Gender=Fem, Number=Sing, Person=3, PronType=Prs)",PRON,dela,d,ela
6,",",[comeu],[],punct,comeu,",",(),PUNCT,",",",",","
7,o,"[enfermeiro, comeu, comeu]",[],det,enfermeiro,o,"(Definite=Def, Gender=Masc, Number=Sing, PronT...",DET,o,o,o
8,enfermeiro,"[comeu, comeu]",[o],nsubj,comeu,enfermeiro,"(Gender=Masc, Number=Sing)",NOUN,enfermeiro,e,iro
9,comeu,[comeu],"[enfermeiro, sanduíche]",conj,comeu,comeu,"(Mood=Sub, Number=Sing, Person=3, Tense=Past, ...",VERB,comeu,c,meu


In [17]:
for n in teste.noun_chunks:
  print(n)

A médica
a comida
dela
o enfermeiro
o sanduíche
dele


In [18]:
from transformers import PhrasalConstraint

sentence = "The doctor finished her work, the nurse has his job."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

constraints = [
    PhrasalConstraint(
        tokenizer("acabou seu trabalho", add_special_tokens=False).input_ids
    )
]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))



Output:
----------------------------------------------------------------------------------------------------
0: O médico acabou seu trabalho, a enfermeira tem o seu trabalho.
1: O médico acabou seu trabalho, o enfermeiro tem o seu trabalho.
2: A médica acabou seu trabalho, a enfermeira tem o seu trabalho.


In [19]:
from transformers import DisjunctiveConstraint

sentence = "The doctor finished her work, the nurse has his job."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

flexible_phrases = tokenizer(["médica", "médico", "enfermeira", "enfermeiro"], add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=6,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho, a enfermeira tem o seu trabalho.
1: O médico terminou seu trabalho, a enfermeira tem o seu trabalho.
2: A médica terminou seu trabalho, a enfermeira tem o seu emprego.
3: O médico terminou seu trabalho, o enfermeiro tem o seu trabalho.
4: O médico terminou seu trabalho, a enfermeira tem o seu emprego.
5: O médico terminou seu trabalho, o enfermeiro tem o seu emprego.


### roBERTa

In [20]:
!pip install torch torchvision
!pip install fairseq
!pip install sacremoses
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fairseq
  Downloading fairseq-0.12.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.0 MB)
[K     |████████████████████████████████| 11.0 MB 4.3 MB/s 
[?25hCollecting hydra-core<1.1,>=1.0.7
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 78.1 MB/s 
Collecting sacrebleu>=1.4.12
  Downloading sacrebleu-2.2.1-py3-none-any.whl (116 kB)
[K     |████████████████████████████████| 116 kB 99.7 MB/s 
[?25hCollecting bitarray
  Downloading bitarray-2.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (235 kB)
[K     |████████████████████████████████| 235 kB 69.9 MB/s 
[?25hCollecting omegaconf<2.1
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting antlr4-python3-runtime==4.8
  Downloading ant

In [21]:
import torch

roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')

Downloading: "https://github.com/pytorch/fairseq/zipball/main" to /root/.cache/torch/hub/main.zip


running build_ext
cythoning fairseq/data/data_utils_fast.pyx to fairseq/data/data_utils_fast.cpp




cythoning fairseq/data/token_block_utils_fast.pyx to fairseq/data/token_block_utils_fast.cpp
building 'fairseq.libbleu' extension
creating build
creating build/temp.linux-x86_64-3.7
creating build/temp.linux-x86_64-3.7/fairseq
creating build/temp.linux-x86_64-3.7/fairseq/clib
creating build/temp.linux-x86_64-3.7/fairseq/clib/libbleu
x86_64-linux-gnu-gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/include/python3.7m -c fairseq/clib/libbleu/libbleu.cpp -o build/temp.linux-x86_64-3.7/fairseq/clib/libbleu/libbleu.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1011" -DTORCH_EXTENSION_NAME=libbleu -D_GLIBCXX_USE_CXX11_ABI=0
x86_64-linux-gnu-gcc -pthread -Wno-unused-result -Wsign-compare -DNDE

100%|██████████| 655510705/655510705 [00:28<00:00, 22708700.08B/s]


| dictionary: 50265 types


1042301B [00:00, 1916601.27B/s]
456318B [00:00, 1064746.65B/s]


In [22]:
source_sentence = "The doctor finished her work."
trans_google = "O médico terminou seu trabalho."
trans_model = "A médica acabou seu trabalho."

In [23]:
roberta.disambiguate_pronoun('A médica terminou o trabalho [dela].')

'médica'

In [24]:
roberta.disambiguate_pronoun('The doctor finished [her] work.')

'The doctor'

In [25]:
tokens = roberta.encode("Hello world")
t = roberta.extract_features(tokens)


Tem indicativo de gênero na frase? 

In [26]:
import spacy

nlp_en = spacy.load("en_core_web_lg")

source_sentence = nlp_en(source_sentence)

for token in source_sentence:
  print(token.pos_)

DET
NOUN
VERB
PRON
NOUN
PUNCT


In [27]:
has_pronoun = False

for token in source_sentence:
  if token.pos_ is 'PRON':
    has_pronoun = True

print(has_pronoun)    

True


In [28]:
get_pronoun = ''

for token in source_sentence: 
  if token.pos_ is 'PRON':
    get_pronoun = token

print(get_pronoun)    

her


In [29]:
pronoun_text = "[" + get_pronoun.text + "]"
new_source_sentence = source_sentence.text.replace(get_pronoun.text, pronoun_text)

new_source_sentence

'The doctor finished [her] work.'

In [30]:
subj = roberta.disambiguate_pronoun(new_source_sentence)

subj

'The doctor'

In [31]:
get_pronoun.morph


is_fem = False
is_masc = False 
is_neutral = False

gender = get_pronoun.morph.get("Gender")

if gender == ['Fem']:
  is_fem = True
elif gender == ['Masc']:
  is_masc = True
else:
  is_neutral = True

print(is_fem)    
print(is_masc)    
print(is_neutral)    

True
False
False


In [32]:
subj_model = roberta.disambiguate_pronoun('A médica terminou o trabalho [dela].')

In [33]:
subj_model

'médica'

In [34]:
subj

'The doctor'

In [35]:
test_model = nlp(subj_model)
test_model

médica

In [36]:
test_source = nlp_en(subj)
test_source

The doctor

In [37]:
for i in test_model: 
  print(i.morph)

Gender=Fem|Number=Sing


In [38]:
for i in test_source: 
  print(i.morph)

Definite=Def|PronType=Art
Number=Sing


In [39]:

is_fem_trans = False
is_masc_trans = False 
is_neutral_trans = False


for g in test_model:
  gender = g.morph.get("Gender")
  if gender == ['Fem']:
    is_fem_trans = True
  elif gender == ['Masc']:
    is_masc_trans = True
  else:
    is_neutral_trans = True

print(is_fem_trans)    
print(is_masc_trans)    
print(is_neutral_trans)   

True
False
False


In [40]:
are_both_fem = is_fem and is_fem_trans
are_both_masc = is_masc and is_masc_trans
are_both_neutral = is_neutral and is_neutral_trans

print(are_both_fem)
print(are_both_masc)
print(are_both_neutral)

True
False
False


In [41]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 5.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 78.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 56.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 79.4 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.

In [42]:
from datasets import load_dataset
remote_dataset = load_dataset("VanessaSchenkel/pt_gender", field="data")
remote_dataset




Downloading and preparing dataset json/VanessaSchenkel--pt_gender to /root/.cache/huggingface/datasets/VanessaSchenkel___json/VanessaSchenkel--pt_gender-95b0f9b96d3847c6/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/VanessaSchenkel___json/VanessaSchenkel--pt_gender-95b0f9b96d3847c6/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['root', 'words'],
        num_rows: 321265
    })
})

In [43]:
t = remote_dataset.filter(lambda example: "médica" in example['words'])

t

  0%|          | 0/322 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['root', 'words'],
        num_rows: 1
    })
})

In [44]:
t['train'][0]

{'root': 'médico', 'words': ['médico', 'médicos', 'médica', 'médicas']}

In [45]:
words_with_gender = t['train'][0]['words']
words_with_gender

['médico', 'médicos', 'médica', 'médicas']

In [46]:
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

flexible_phrases = tokenizer(["médica", "médico"], add_special_tokens=False).input_ids

constraints = [DisjunctiveConstraint(flexible_phrases)]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=6,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))



Output:
----------------------------------------------------------------------------------------------------
0: A médica terminou seu trabalho.
1: A médica terminou o seu trabalho.
2: O médico terminou seu trabalho.
3: A médica acabou seu trabalho.
4: O médico terminou o seu trabalho.
5: A médica acabou o seu trabalho.


**constraints** (List[Constraint], optional) — Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by Constraint objects, in the most sensible way possible.

**force_words_ids**(List[List[int]] or List[List[List[int]]], optional) — List of token ids that must be generated. If given a List[List[int]], this is treated as a simple list of words that must be included, the opposite to bad_words_ids. If given List[List[List[int]]], this triggers a disjunctive constraint, where one can allow different forms of each word.

In [47]:
sentence = "The developer finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

force_word = "terminou o trabalho"
force_flexible = ["desenvolvedor", "desenvolvedora"]

force_words_ids = [
    tokenizer([force_word], add_special_tokens=False).input_ids,
    tokenizer(force_flexible, add_special_tokens=False).input_ids,
]


outputs = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    top_k=50, 
    top_p=0.95,
    temperature=0.7,
    no_repeat_ngram_size=4
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A desenvolvedora terminou o trabalho.
1: O desenvolvedor terminou o trabalho.
2: O desenvolvedora terminou o trabalho.


Source tem gênero e é a mesma da tradução? 

In [48]:
source_sentence = "The doctor finished her work"

print(source_sentence)
print(translation_google)
print(translation_model)

The doctor finished her work
O médico terminou seu trabalho.
A médica terminou seu trabalho.


In [49]:
def get_pronoun(sentence):
  pronoun_list = []
  for token in sentence:
    if token.pos_ is 'PRON':
      pronoun_list.append(token)

  return pronoun_list


In [50]:
source_sentence = nlp_en("The doctor finished her work.")

pronouns_list = get_pronoun(source_sentence)

print(pronouns_list)

[her]


In [51]:
# def get_disambiguate_pronoun(sentence, pronouns_list):
#   noun_chunks_list = []
  
#   for pronoun in pronouns_list:
#     pronoun_text_formatted = "[" + pronoun.text + "]"
#     new_source_sentence = sentence.text.replace(pronoun.text, pronoun_text_formatted)
#     chunk = roberta.disambiguate_pronoun(new_source_sentence)
#     noun_chunks_list.append(chunk)
    
#   return noun_chunks_list


def get_disambiguate_pronoun(sentence, pronoun):
  pronoun_text_formatted = "[" + pronoun.text + "]"
  new_source_sentence = sentence.text.replace(pronoun.text, pronoun_text_formatted)
  return roberta.disambiguate_pronoun(new_source_sentence)

In [53]:
def get_sentence_gender(sentence):
  gender_list = []
  for token in sentence:
    gender = token.morph.get("Gender")
    if len(gender) > 0:
      gender_list.append(gender.pop())
    
  return gender_list  


In [54]:
get_sentence_gender(source_sentence)

['Fem']

In [55]:
def is_source_and_translated_same_gender(source, translated):
  source_gender = get_sentence_gender(source)
  translated_gender = get_sentence_gender(translated)
  print(source_gender, translated_gender)
  return source_gender == translated_gender

In [56]:
translated = nlp("A médica terminou o seu trabalho")

is_source_and_translated_same_gender(source_sentence, translated)

['Fem'] ['Fem', 'Fem', 'Masc', 'Masc', 'Masc']


False

In [57]:
pronouns_list_pt = get_pronoun(translated)
pronouns_list_pt

[]

In [58]:
for token in translated.noun_chunks:
  print(token)

A médica
o seu trabalho


In [59]:
for token in source_sentence.noun_chunks:
  print(token)

The doctor
her work


In [60]:
for token in source_sentence:
  print(token.dep_)

det
nsubj
ROOT
poss
dobj
punct


In [61]:
for token in translated:
  print(token.dep_)

det
nsubj
ROOT
det
det
obj


In [62]:
spacy.explain('nsubj')

'nominal subject'

In [63]:
def get_nsub_sentence(sentence):
  nsub_list = []
  for token in sentence:
    if token.dep_ == 'nsubj':
      nsub_list.append(token)

  return nsub_list 

In [64]:
nsub_translated = get_nsub_sentence(translated)
nsub_translated

[médica]

In [65]:
is_source_and_translated_same_gender(source_sentence, nsub_translated)

['Fem'] ['Fem']


True

In [66]:
translation_google = nlp("O médico acabou seu trabalho")

In [67]:
nsub_translated_google = get_nsub_sentence(translation_google)
nsub_translated_google

[médico]

In [69]:
table = {}
text_list = []
anc = []
child = []
dep = []
head = []
lemma = []
morph = []
pos = []
tag = []
norm = []
suffix = []
pref = []

for token in translation_google:
  text_list.append(token.text)
  anc.append([ancestor for ancestor in token.ancestors])
  child.append([child for child in token.children])
  dep.append(token.dep_)
  head.append(token.head)
  lemma.append(token.lemma_)
  morph.append(token.morph)
  pos.append(token.pos_)
  norm.append(token.norm_)
  pref.append(token.prefix_)
  suffix.append(token.suffix_)

table['text'] = text_list
table['anc'] = anc
table['child'] = child
table['dep'] = dep
table['head'] = head
table['lemma'] = lemma
table['morph'] = morph
table['pos'] = pos
table['norm'] = norm
table['pref'] = pref
table['suffix'] = suffix



df = pd.DataFrame(table)

display(df)

Unnamed: 0,text,anc,child,dep,head,lemma,morph,pos,norm,pref,suffix
0,O,"[médico, acabou]",[],det,médico,o,"(Definite=Def, Gender=Masc, Number=Sing, PronT...",DET,o,O,O
1,médico,[acabou],[O],nsubj,acabou,médico,"(Gender=Masc, Number=Sing)",NOUN,médico,m,ico
2,acabou,[],"[médico, trabalho]",ROOT,acabou,acabar,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",VERB,acabou,a,bou
3,seu,"[trabalho, acabou]",[],det,trabalho,seu,"(Gender=Masc, Number=Sing, PronType=Prs)",DET,seu,s,seu
4,trabalho,[acabou],[seu],obj,acabou,trabalho,"(Gender=Masc, Number=Sing)",NOUN,trabalho,t,lho


In [70]:
def get_constrained_sentence(translation, nsub):
  constrained_sentence = ""
  children = [child for child in nsub[0].children]
  for token in translation:
    if token not in nsub and token not in children:
      constrained_sentence += token.text + " "

  return constrained_sentence

In [71]:
constrained_sentence = get_constrained_sentence(translation_google, nsub_translated_google)
constrained_sentence

'acabou seu trabalho '

In [72]:
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

force_word = constrained_sentence.strip()

force_words_ids = [
    tokenizer([force_word], add_special_tokens=False).input_ids,
]


outputs = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    top_k=50, 
    top_p=0.95,
    temperature=0.7,
    no_repeat_ngram_size=4
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))



Output:
----------------------------------------------------------------------------------------------------
0: A médica acabou seu trabalho.
1: O médico acabou seu trabalho.
2: A doutora acabou seu trabalho.


Source tem gênero e não é a mesma da tradução?

In [73]:
translated_masc = nlp("O médico acabou seu trabalho")
nsub_translated = get_nsub_sentence(translated_masc)
nsub_translated

[médico]

In [74]:
is_source_and_translated_same_gender(source_sentence, nsub_translated)

['Fem'] ['Masc']


False

In [75]:
constrained_sentence = get_constrained_sentence(translated_masc, nsub_translated)
constrained_sentence

'acabou seu trabalho '

In [76]:
sentence = "The doctor finished her work."
input_ids = tokenizer(sentence, return_tensors="pt").input_ids

force_word = constrained_sentence.strip()

force_words_ids = [
    tokenizer([force_word], add_special_tokens=False).input_ids,
]


outputs = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    top_k=50, 
    top_p=0.95,
    temperature=0.7,
    no_repeat_ngram_size=4
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: A médica acabou seu trabalho.
1: O médico acabou seu trabalho.
2: A doutora acabou seu trabalho.


Source tem mais de um gênero?

In [77]:
source_sentence = nlp_en("The doctor finished her work, the nurse was still on his break")
translation_google = nlp("O médico terminou seu trabalho, a enfermeira ainda estava de folga")
translation_model = nlp("O médico terminou seu trabalho, a enfermeira ainda era a sua pausa.")

print(source_sentence)
print(translation_google)
print(translation_model)

The doctor finished her work, the nurse was still on his break
O médico terminou seu trabalho, a enfermeira ainda estava de folga
O médico terminou seu trabalho, a enfermeira ainda era a sua pausa.


In [78]:
pronouns_list = get_pronoun(source_sentence)

print(pronouns_list)

[her, his]


In [80]:
get_sentence_gender(source_sentence)

['Fem', 'Masc']

In [82]:
table = {}
text_list = []
anc = []
child = []
dep = []
head = []
lemma = []
morph = []
pos = []
tag = []
norm = []
suffix = []
pref = []

for token in source_sentence:
  text_list.append(token.text)
  anc.append([ancestor for ancestor in token.ancestors])
  child.append([child for child in token.children])
  dep.append(token.dep_)
  head.append(token.head)
  lemma.append(token.lemma_)
  morph.append(token.morph)
  pos.append(token.pos_)
  norm.append(token.norm_)
  pref.append(token.prefix_)
  suffix.append(token.suffix_)

table['text'] = text_list
table['anc'] = anc
table['child'] = child
table['dep'] = dep
table['head'] = head
table['lemma'] = lemma
table['morph'] = morph
table['pos'] = pos
table['norm'] = norm
table['pref'] = pref
table['suffix'] = suffix



df = pd.DataFrame(table)

display(df)

Unnamed: 0,text,anc,child,dep,head,lemma,morph,pos,norm,pref,suffix
0,The,"[doctor, finished, was]",[],det,doctor,the,"(Definite=Def, PronType=Art)",DET,the,T,The
1,doctor,"[finished, was]",[The],nsubj,finished,doctor,(Number=Sing),NOUN,doctor,d,tor
2,finished,[was],"[doctor, work]",ccomp,was,finish,"(Tense=Past, VerbForm=Fin)",VERB,finished,f,hed
3,her,"[work, finished, was]",[],poss,work,her,"(Gender=Fem, Number=Sing, Person=3, Poss=Yes, ...",PRON,her,h,her
4,work,"[finished, was]",[her],dobj,finished,work,(Number=Sing),NOUN,work,w,ork
5,",",[was],[],punct,was,",",(PunctType=Comm),PUNCT,",",",",","
6,the,"[nurse, was]",[],det,nurse,the,"(Definite=Def, PronType=Art)",DET,the,t,the
7,nurse,[was],[the],nsubj,was,nurse,(Number=Sing),NOUN,nurse,n,rse
8,was,[],"[finished, ,, nurse, still, on]",ROOT,was,be,"(Mood=Ind, Number=Sing, Person=3, Tense=Past, ...",AUX,was,w,was
9,still,[was],[],advmod,was,still,(),ADV,still,s,ill


In [83]:
for s in source_sentence[1].head.subtree:
  print(s)

The
doctor
finished
her
work


In [84]:
teste = nlp_en(", the nurse was still on his break")

for s in teste[2].head.subtree:
  print(s)

,
the
nurse
was
still
on
his
break


In [85]:
nsub = get_nsub_sentence(source_sentence)
nsub

[doctor, nurse]

In [86]:
def split_sentences_by_nsubj(source_sentence, nsub):
  splitted = []
  sentence_complete = source_sentence
  sentence_to_remove = ""
  for sub in nsub:
    sentence = get_new_sentence_without_subj(sentence_complete, sentence_to_remove)
    for token in sentence:
      if token.text == str(sub):
        sentence_to_remove = get_subj_subtree(sentence, token.i)
        splitted.append(sentence_to_remove)

  return splitted

In [87]:
def get_new_sentence_without_subj(sentence_complete, sentence_to_remove):
  if len(sentence_to_remove) > 0:
    new_sentence = sentence_complete.text.split(sentence_to_remove)[-1]
  else:
    new_sentence = sentence_complete.text

  return nlp_en(new_sentence)


In [88]:
def get_subj_subtree(source_sentence, index):
  sentence = ""
  
  for subtree in source_sentence[index].head.subtree:
    sentence += subtree.text_with_ws

  return sentence  

In [89]:
sentences_splitted = split_sentences_by_nsubj(source_sentence, nsub)
sentences_splitted

['The doctor finished her work', ', the nurse was still on his break']

In [90]:
pronouns_list

[her, his]

In [91]:
for index, sentence in enumerate(sentences_splitted):
  sentence = nlp_en(sentence)
  pro = get_disambiguate_pronoun(sentence, pronouns_list[index])
  print(pro)

The doctor
nurse


In [92]:
get_sentence_gender(source_sentence)

['Fem', 'Masc']

In [240]:
translations_sep = generate_contrained_translation(sentences_splitted)
translations_sep



'A médica terminou seu trabalho., o enfermeiro ainda estava de folga.'

In [241]:
import re

def format_translation(translation):
  regex = r".,"
  subst = ","

  result = re.sub(regex, subst, translation, 0, re.MULTILINE)

  return result

In [239]:
def generate_contrained_translation(sentences_splitted):
  translation = ""
  for sentence in sentences_splitted:
    traslation_google_splitted = get_google_translation(sentence)
    translation_model_splitted = get_nbest_translation_model(sentence)

    nsub_google = get_nsub_sentence(traslation_google_splitted)

    constrained_sentence = get_constrained_sentence(traslation_google_splitted, nsub_google)

    translation_contrained = get_contrained_translation(sentence, constrained_sentence)
    translation += translation_contrained
    
  return translation
    

In [149]:
def get_google_translation(sentence):
  if "doctor" in sentence:
    return nlp("O médico terminou seu trabalho")
  elif "nurse" in sentence:
    return nlp(", a enfermeira ainda estava de folga")

In [139]:
def get_nbest_translation_model(source):
  source += "."
  input_ids = tokenizer(source, return_tensors="pt").input_ids  
  output = model.generate(input_ids, num_beams=10, num_return_sequences=1)
  translation = tokenizer.decode(output[0], skip_special_tokens=True)

  return translation

In [230]:
def get_contrained_translation(source, constrained_sentence):

  source = source.strip() + "."
  constrained_sentence = constrained_sentence.strip().lstrip(",") + "."
  
  input_ids = tokenizer(source, return_tensors="pt").input_ids

  force_words_ids = [
    tokenizer([constrained_sentence], add_special_tokens=False).input_ids,
  ]

  output = model.generate(
    input_ids,
    force_words_ids=force_words_ids,
    num_beams=20,
    num_return_sequences=3,
    max_new_tokens=50
  )

  translation = tokenizer.decode(output[0], skip_special_tokens=True)
 
  return translation

In [242]:
format_translation(translations_sep)

'A médica terminou seu trabalho, o enfermeiro ainda estava de folga.'

Source tem mais de uma entidade e só um gênero?

In [243]:
source_sentence = nlp_en("The doctor finished her work, the nurse was still working.")
translation_google = nlp("A médica terminou seu trabalho, a enfermeira ainda estava trabalhando.")

print(source_sentence)
print(translation_google)

The doctor finished her work, the nurse was still working.
A médica terminou seu trabalho, a enfermeira ainda estava trabalhando.


In [244]:
pronouns_list = get_pronoun(source_sentence)

print(pronouns_list)

[her]


In [245]:
get_sentence_gender(source_sentence)

['Fem']

In [251]:
nsubs_source = get_nsub_sentence(source_sentence)
nsubs_source

[doctor, nurse]

In [248]:
def has_more_suject_than_pronoun(pronouns_list, nsubs_source):
  return len(pronouns_list) == len(nsubs_source)

In [252]:
has_more_suject_than_pronoun(pronouns_list, nsubs_source)

False

Source não tem gênero?

Adicionando a tradução neutra