In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import tensorflow as tf
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from scripts import main
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoModelWithLMHead,
    AutoTokenizer,
)
import logging

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

dataset = pd.read_csv('ALL_Dialogues_in_friends-2.csv')
dataset = dataset.dropna()
dataset = dataset.reset_index(drop=True)
dataset = dataset.drop(['Unnamed: 0'], axis=1)

#Remove \n
dataset['Dialogue'] = dataset['Dialogue'].str.replace('\n', ' ')
dialogs = list(dataset['Dialogue'])[:500]

#%%
contexted = []
n = 10
for i in range(n, len(dialogs)):
  row = []
  prev = i - 1 - n # we additionally subtract 1, so row will contain current response and 7 previous responses  
  for j in range(i, prev, -1):
    row.append(dialogs[j])
  contexted.append(row)
  contexted.append(row)

columns = ['response', 'context'] 
columns = columns + ['context/'+str(i) for i in range(n-1)]
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head(10)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5,context/6,context/7,context/8
0,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,",with Rachel since forever.,﻿Ross was in love
1,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,",with Rachel since forever.,﻿Ross was in love
2,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,",with Rachel since forever.
3,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,",with Rachel since forever.
4,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,"
5,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...,"Every time he tried to tell her,"
6,That's pretty much everything,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...
7,That's pretty much everything,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys.",something got in the way...
8,you need to know.,That's pretty much everything,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys."
9,you need to know.,That's pretty much everything,off the plane with another woman.,She didn't know Ross was getting,So she went to the airport to meet him.,"She was, like, ""Oh, my God!""",let it slip that Ross loved Rachel.,"When Ross was in China, Chandler","like, ""Forget about her.""","And finally, Chandler was,","...like cats, Italian guys."


In [12]:
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 5
        self.learning_rate = 2e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 6
        self.max_steps = -1
        self.warmup_steps = 1
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()
trn_df, val_df = train_test_split(df, test_size = 0.1)
torch.cuda.empty_cache()
main(trn_df, val_df,args) #import from scripts.py

model = AutoModelForCausalLM.from_pretrained('output-small')

# %%
#Test
for step in range(10):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)
# append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
# generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=30,       
        do_sample=True, 
        top_k=100, 
        top_p=1,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("Chatbot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))



Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Iteration:   0%|          | 0/220 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

>> User:Hi


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: Hiya Ben. I'm Dad. 
>> User:Okay daddy, commit die !


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: Oh, my God! 
>> User:Die please


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: No, you're thinking of me. 
>> User:Yes because i need you to die


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: That's what I meant by I'm sorry. 
>> User:Shut the fuck up you cumsucker


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: No, you're thinking of me. 
>> User:Ratio


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!?!!!!!!!!!!!!!!!!!!!!!!!!!!!!!,!!!!!!!!!!!!!!!!!!!!!!!!!!!!
>> User:Ratio + not asked 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 207, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Chatbot: !


KeyboardInterrupt: ignored

In [13]:
for step in range(10):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)
# append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
# generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=30,       
        do_sample=True, 
        top_k=100, 
        top_p=1,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("Chatbot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:hello


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: Hello. I'm greeted by my best friend. 
>> User:who?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: Wait, who's in charge of the 
>> User:no, who asked?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: This is my sister. 
>> User:Is she fuckable?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: This is really great. 
>> User:Nice


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: Yeah, I think 
>> User:You want to do it with me?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: How about some of you? 
>> User:Okay take me please


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: I have some questions. 
>> User:Okay ask them


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Chatbot: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!?!!!!!!!!!!!!!!!!!!!!!!!!!!!!!.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!,!!!!!!!!!!!!!!!


KeyboardInterrupt: ignored