Adapt to the version used in our ACL paper
SivilTaram committed Apr 8, 2020
1 parent b832d5b commit e1e7184
Showing 2 changed files with 9 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pytorch_pretrained_bert/modeling_openai.py
@@ -726,7 +726,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))
            return loss
-       return lm_logits
+       return lm_logits, hidden_states


class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
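For context, a minimal usage sketch of what this return-value change means for callers; the sketch is not part of the commit, and the 'openai-gpt' shortcut and placeholder input ids are assumptions for illustration. When lm_labels is omitted, forward() now yields the transformer's final hidden states alongside the LM logits, so call sites unpack a tuple:

import torch
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel

# Assumes this fork keeps upstream's 'openai-gpt' pretrained shortcut.
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

input_ids = torch.tensor([[0, 1, 2]])  # placeholder token ids, shape (batch, seq_len)
with torch.no_grad():
    # Before this commit: lm_logits = model(input_ids)
    # After it, the hidden states that fed the LM head come back as well:
    lm_logits, hidden_states = model(input_ids)
# lm_logits: (batch, seq_len, vocab_size); hidden_states: (batch, seq_len, hidden_size)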
10 changes: 8 additions & 2 deletions pytorch_pretrained_bert/tokenization_openai.py
@@ -223,12 +223,18 @@ def tokenize(self, text):
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
-               split_tokens.extend([t for t in self.bpe(token).split(' ')])
+               if token not in self.special_tokens:
+                   split_tokens.extend([t for t in self.bpe(token).split(' ')])
+               else:
+                   split_tokens.append(token)
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
-               split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+               if token.text not in self.special_tokens:
+                   split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+               else:
+                   split_tokens.append(token.text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
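Similarly, a minimal sketch of the tokenizer change; it is not part of the commit, and the '[SEP]' marker is a hypothetical choice. Registered special tokens are now emitted verbatim instead of being fragmented by BPE, provided the pre-tokenizer keeps them whole (upstream's BasicTokenizer fallback does, since it receives the special tokens as its never_split list):

from pytorch_pretrained_bert import OpenAIGPTTokenizer

# Assumes upstream's constructor signature, where special_tokens can be
# passed through from_pretrained; '[SEP]' is purely illustrative.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', special_tokens=['[SEP]'])

tokens = tokenizer.tokenize("hello [SEP] world")
# Before this commit: '[SEP]' went through self.bpe like any other token and
# came back as sub-word fragments.
# After it: '[SEP]' is appended unchanged, so convert_tokens_to_ids can map it
# to the dedicated id that set_special_tokens assigned.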
