In [1]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch

In [3]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [4]:
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [6]:
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [8]:
import text2image

2022-07-14 20:14:18,542 - text2image - INFO - DialoGPT loaded
2022-07-14 20:14:18,561 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: clips/mfaq
2022-07-14 20:14:31,301 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cuda
2022-07-14 20:14:31,516 - text2image - INFO - Sentence Transformer loaded
2022-07-14 20:15:16,180 - text2image - INFO - CLIP loaded
2022-07-14 20:15:38,373 - text2image - INFO - CLIP features loaded


In [22]:
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [9]:
dialog = ['Say , Jim , how about going for a few beers after dinner ? ',
 ' You know that is tempting but is really not good for our fitness . ',
 ' What do you mean ? It will help us to relax . ',
 " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
 " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
 ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
 " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
 ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
 " Good.Let ' s go now . ",
 ' All right . ']

In [10]:
img, scores = text2image.score_image(dialog)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
img_for_caption = img[5][0]

In [18]:
pixel_values = feature_extractor(images=img[5], return_tensors="pt").pixel_values

In [20]:
pixel_values = pixel_values.to(device)

In [23]:
output_ids = model.generate(pixel_values, **gen_kwargs)

In [24]:
preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
preds = [pred.strip() for pred in preds]

In [42]:
preds[0]

'a room that has a lot of equipment in it'

# GPT

In [51]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [52]:
tokenizer_gpt = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

In [54]:
model_gpt = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

In [68]:
result = tokenizer_gpt.encode(dialog[0] + tokenizer_gpt.eos_token, return_tensors='pt')
for elem in dialog[1:5]:
    tokens = tokenizer_gpt.encode(elem + tokenizer_gpt.eos_token, return_tensors='pt')
    result = torch.cat([tokens, result], dim=-1)
#text = text + preds[0] + tokenizer_gpt.eos_token_id

In [71]:
tokens = tokenizer_gpt.encode(preds[0] + tokenizer_gpt.eos_token, return_tensors='pt')
result = torch.cat([tokens, result], dim=-1)

In [72]:
result

tensor([[   64,  2119,   326,   468,   257,  1256,   286,  5112,   287,   340,
         50256,   314,  4724,   345,   389,   826,    13,  1537,   644,  2236,
           356,   466,  5633,   314,   836,   470,  1254,   588,  5586,   379,
          1363,   764,   220, 50256,  2141,   345,  1107,   892,   523,  5633,
           314,   836,   470,   764,   632,   481,   655,   787,   514,  3735,
           290,   719, 14397,   764, 11436,   938,   640,  5633,   220, 50256,
          1867,   466,   345,  1612,  5633,   632,   481,  1037,   514,   284,
          8960,   764,   220, 50256,   921,   760,   326,   318, 29850,   475,
           318,  1107,   407,   922,   329,   674, 13547,   764,   220, 50256,
         25515,   837,  5395,   837,   703,   546,  1016,   329,   257,  1178,
         16800,   706,  8073,  5633,   220, 50256]])

In [73]:
ans = model_gpt.generate(tokens, max_length=300, pad_token_id=tokenizer_gpt.eos_token_id)


In [74]:
ans

tensor([[   64,  2119,   326,   468,   257,  1256,   286,  5112,   287,   340,
         50256,    40,  1101,   407,  1654,   611,   345,   821,  2726,   837,
           475,   314,  1053,  1775,   257,  1256,   286,   661,   910,   326,
           764, 50256]])

In [75]:
print("DialoGPT: {}".format(tokenizer_gpt.decode(ans[0], skip_special_tokens=True)))

DialoGPT: a room that has a lot of equipment in itI'm not sure if you're serious, but I've seen a lot of people say that.


In [79]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Say , Jim , how about going for a few beers after dinner ?
DialoGPT: I'm not sure if you're serious or not, but I laughed.
>> User:You know that is tempting but is really not good for our fitness 
DialoGPT: I'm not sure if you're serious or not, but I laughed.


KeyboardInterrupt: Interrupted by user

In [63]:
bot_input_ids.shape

torch.Size([1, 30])