In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
! ls ../../models/

cached					  ERICA
distilroberta-finetuned			  hopperbot-medium
distilroberta-finetuned__exported	  runs
distilroberta-finetuned__exported.tar.gz


In [3]:
MODEL_NAME = "../../models/ERICA"

In [4]:
from pathlib import Path

Path(MODEL_NAME).exists()

True

In [5]:
from typing import Union, List, Dict

from transformers import PreTrainedModel, pipeline


ROOT_DIR = Path("../").parent.resolve()
DATA_DIR = ROOT_DIR.parent.resolve().joinpath("data")


ID2LABEL_FILEPATH = DATA_DIR.joinpath("id2label.json")
LABEL2ID_FILEPATH = DATA_DIR.joinpath("label2id.json")

def read_json(filename: str, as_type=None) -> Dict:
    with open(filename, "r") as json_file:
        data = json.load(json_file)

        if as_type is not None:
            data = dict([(as_type(k), v) for k, v in data.items()])

        return data



class Classifier:
    def __init__(
        self, model: Union[str, PreTrainedModel], id2label_file=None, label2id_file=None
    ) -> None:
        if not id2label_file:
            id2label_file = ID2LABEL_FILEPATH
        if not label2id_file:
            label2id_file = LABEL2ID_FILEPATH
        self.pipe = pipeline("text-classification", model=model, return_all_scores=True)
        self.pipe.model.config.id2label = read_json(id2label_file, as_type=int)
        self.pipe.model.config.label2id = read_json(label2id_file)

    def classify(self, text, k: int = 1) -> Union[List[Dict], List[str]]:
        results = self.pipe(text)
        result = results[0] if results else None

        if not result:
            return

        result.sort(key=lambda item: item.get("score"), reverse=True)

        # Return all the results
        if k < 1:
            return result

        # Sort by score, in descending order
        result.sort(key=lambda item: item.get("score"), reverse=True)

        # Return the top k results
        return [r["label"] for r in result[:k]]


In [6]:
DATA_DIR

PosixPath('/home/jovyan/work/research/data')

In [19]:
import argparse
import re

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    Conversation,
    pipeline,
    ConversationalPipeline,
)


def chat(model, tokenizer, classifier=None, device=None):
    """Use model.generate to interact"""
    model.config.pad_token_id = tokenizer.pad_token_id

    model.to("cpu" if device == -1 else device)

    step = 0

    while True:
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]:
            break

        print(f"User: {text}")
        
        # encode the new user input, add the eos_token and return a tensor in Pytorch
        text = preprocess_text(text, classifier=classifier)
        
        print(text)
    
        new_user_input_ids = tokenizer.encode(
            text + tokenizer.eos_token, return_tensors="pt"
        )

        # append the new user input tokens to the chat history
        bot_input_ids = (
            torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
            if step > 0
            else new_user_input_ids
        )

        # generate chat ids
        chat_history_ids = model.generate(
            bot_input_ids,
            max_length=512,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=100,
            top_p=0.7,
            temperature=0.8,
        )

        response = tokenizer.decode(
            chat_history_ids[:, bot_input_ids.shape[-1] :][0],
            skip_special_tokens=True,
        )

        print(f"Bot: {response}")


def chat_pipeline(model, tokenizer, classifier=None, device=None):
    conversation = Conversation()
    pipe = ConversationalPipeline(
        model=model, tokenizer=tokenizer, device=-1 if device == "cpu" else device, max_length=tokenizer.max_len_single_sentence
    )
    pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id
    pipe.model.max_length = 128

    while True:
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]:
            break

        conversation.add_user_input(preprocess_text(text, classifier=classifier))

        print(f"User: {text}")

        result = pipe(conversation)
        response = result.generated_responses[-1]

        print(f"Bot: {clean_text(response)}")


def preprocess_text(text, classifier=None):
    """Prepend context label if classifier specified"""
    prefix = ""
    if classifier:
        context_label = classifier.classify(text, k=1)[0]
        prefix = f"{context_label} "
    return f"{prefix}{text}"


def clean_text(text):
    """Clean response text"""
    return re.sub(r"^\w+", "", text)


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_name", "-m")
    parser.add_argument("--tokenizer", "-t")
    parser.add_argument("--config", "-c")
    parser.add_argument("--classifier", "-cf", default=None)
    parser.add_argument("--pipeline", "-p", action="store_true", default=False)
    parser.add_argument(
        "--device",
        "-d",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device (cuda or cpu)",
    )

    args = parser.parse_args()

    classifier = None

    if args.classifier is not None:
        classifier = Classifier(model=args.classifier)

    if not args.config:
        args.config = args.model_name

    config = AutoConfig.from_pretrained(args.config)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        from_tf=False,
        config=config,
    )

    chat_fn = chat_pipeline if args.pipeline else chat

    chat_fn(model, tokenizer, classifier=classifier, device=args.device)


In [20]:
CLASSIFIER = ROOT_DIR.parent.parent.resolve().joinpath("models").joinpath("distilroberta-finetuned")

In [21]:
import json

config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
)
classifier = Classifier(model=str(CLASSIFIER))

In [None]:
chat(model, tokenizer, classifier=classifier)

>>  I am so sad.


User: I am so sad.
sad I am so sad.
Bot: sad What's wrong?


>>  I feel like nobody really understands me.


User: I feel like nobody really understands me.
embarrassed I feel like nobody really understands me.
Bot: embarrassment I'm sorry to hear that. Have you tried to work on it?


>>  Yeah, I see a therapist about it.


User: Yeah, I see a therapist about it.
afraid Yeah, I see a therapist about it.
Bot: afraid I hope it works out for you.


>>  Thank you, I hope so, too.


User: Thank you, I hope so, too.
hopeful Thank you, I hope so, too.
Bot: hopeful I'm sure you will get it_comma_ and I'm rooting for you!


>>  I just want to move on from the pain


User: I just want to move on from the pain
sad I just want to move on from the pain
Bot: sad It will pass


In [None]:
"I'm sad

In [17]:
# Let's chat for 5 lines
# https://github.com/microsoft/DialoGPT/issues/45#issuecomment-680338798

config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
)

for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids,
        do_sample=True, 
        max_length=1000,
        top_k=50, 
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.35,
    )

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))



>> User: sad I am sad


DialoGPT: sad Yeah_comma_ me too. How long has it been?  Did she move recently or something?


>> User: sad She passed away


DialoGPT: sad So sorry for your loss. It's hard. Is the new house nice? 


>> User: happy It is nice


DialoGPT: joyful When my son was born i could not believe that


>> User: joyful Birthdays are amazing


DialoGPT: joyful Yes_comma_ they make you happy in your life. They take many years to prepare and carry out a celebration like this. Especially when he is almost a man. You have to cherish his milestones and remember those moments. Often these milestones can be marked by other people


>> User: sad I wish I could be happy


KeyboardInterrupt: 

In [None]:
#check the size of the hist, if it's over 'max_lenght' remove the oldest tokens
            if bot_input_ids.size(dim=1) >= args.get('max_length'):
                #trim the tokens
                bot_input_ids = torch.narrow(bot_input_ids, 1, -args.get('max_length'), args.get('max_length'))
            chat_history_ids = model.generate(
                bot_input_ids,
                max_length=args.get('max_length')+20,
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=args.get('no_repeat_ngram_size'),
                do_sample=args.get('do_sample'),
                top_k=args.get('top_k'),
                top_p=args.get('top_p'),
                temperature=args.get('temperature')
                )

# Issues with DialoGPT (so many)

- https://github.com/microsoft/DialoGPT/issues/84
- https://github.com/microsoft/DialoGPT/issues/34
- https://www.reddit.com/r/GPT3/comments/uokws7/dialogpt_finetuned_on_my_own_message_data/
- https://chatbotslife.com/creating-cartmanbot-part-two-de95c348aeb1

In [43]:
import argparse
import re

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    Conversation,
    pipeline,
    ConversationalPipeline,
)


config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
)

# Let's chat for 4 lines
for step in range(6):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 200 tokens,
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=1,
        do_sample=True,
        top_k=100,
        top_p=0.8,
        temperature=0.8
    )

    # pretty print last ouput tokens from bot
    print("{}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


>> User: I'm happy today


KeyboardInterrupt: 