In [4]:
#import neo4j_memory.neo4j_helper as neo4j_helper
import numpy as np
import torch
import json
from transformers import AutoTokenizer, AutoConfig,\
      T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForCausalLM
from scipy.special import softmax

In [5]:
# Pass input into topic extraction

# download the models
cot_tokenizer = AutoTokenizer.from_pretrained("prakharz/DIAL-BART0")
cot_model = AutoModelForSeq2SeqLM.from_pretrained("prakharz/DIAL-BART0")
cot_model.load_state_dict(torch.load('../topic_extraction/model/topic_er2.pt'))

sent_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
sent_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
config = AutoConfig.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

inst_tokenizer = AutoTokenizer.from_pretrained("prakharz/DIAL-BART0")
inst_model = AutoModelForSeq2SeqLM.from_pretrained("prakharz/DIAL-BART0")

# chain of topics
def extract_topic_sentiment(text_in):
    instruct_input = "Instruction:What is the topic of conversation?\n\nInput:[CONTEXT]{}[ENDOFDIALOGUE][QUESTION]The topic of conversation is".format(text_in)
    tokens_input = inst_tokenizer(instruct_input, max_length=250, padding='max_length', truncation=True, return_tensors='pt')
    input_out = inst_model.generate(**tokens_input)
    topic = inst_tokenizer.decode(input_out[0], skip_special_tokens=True)

    tokens_input = sent_tokenizer(text_in, max_length=250, padding='max_length', truncation=True, return_tensors='pt')
    input_out = sent_model(**tokens_input)

    scores = softmax(input_out[0][0].detach().numpy())
    #print(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

    return topic, config.id2label[ranking[0]]

def generate_cot(text_in):
    tok_text = cot_tokenizer(text_in, return_tensors='pt')
    gen_text = cot_model.generate(**tok_text)
    dec_text = cot_tokenizer.decode(gen_text[0], skip_special_tokens=True)
    return dec_text

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
in_str = "I like playing the super mario bros."
topic, sent = extract_topic_sentiment(in_str)
dec_out = generate_cot(in_str)
print(f"CoT:{dec_out}, Topic:{topic}, Sent:{sent}")



1) positive 0.9492
2) neutral 0.0482
3) negative 0.0026
CoT:video games,yes)|(super mario bros,yes)
, Topic:Mario, Sent:positive


In [4]:
# load the input
with open('../topical_chat/Topical-Chat-master/conversations/train.json', 'r') as jsonfile:
    topical_chat_conversations = json.load(jsonfile)
    instance = topical_chat_conversations[list(topical_chat_conversations.keys())[0]]['content']
    
    for x in instance:
        print(x['message'], x['agent'])
        # print('--')

Are you a fan of Google or Microsoft? agent_1
Both are excellent technology they are helpful in many ways. For the security purpose both are super. agent_2
I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense.  agent_1
Google provides online related services and products, which includes online ads, search engine and cloud computing. agent_2
Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives.  agent_1
Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest. agent_2
Did you know Google had hundreds of live goats to cut the grass in the past? 
 agent_1
It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015.  agent_2
I like Google Chrome. Do you use it as well for your browser?  agent_1
Yes.Google is the biggest search engine and Google service fi

In [None]:
# Create Nodes for each topic in CoT

In [5]:
# Recommender model
recommender_tokenizer = AutoTokenizer.from_pretrained("t5-large")
recommender_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
recommender_model.load_state_dict(torch.load('./model/rec_er.pt'))
recommender_model.eval()
print('done')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


done


In [6]:
def generate_recommendation(text_in):
    tok_text = recommender_tokenizer(text_in, return_tensors='pt')
    gen_text = recommender_model.generate(**tok_text, max_new_tokens=32)
    dec_text = recommender_tokenizer.decode(gen_text[0], skip_special_tokens=True)
    return dec_text

    # Input: CoT, All nodes that are 1 distance from current topic
    # Output: New suggested topic CoT
def generate_rec2(text_in):
    tok_text = recommender_tokenizer(text_in, return_tensors='pt')
    print(tok_text)
    gen_text = recommender_model(input_ids=tok_text.input_ids, labels=tok_text.input_ids)
    #dec_text = recommender_tokenizer.decode(gen_text[0], skip_special_tokens=True)
    return gen_text

In [35]:
text_in = "Instruction: Generate only 4 similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {\"sports\":\"positive\", \"football\":\"positive\", \"nflteams\":\"positive\"} In the generated answer, generate the suggested topic within brackets [SUGGESTEDTOPIC]\nAnswer:"

num_sugg = 3
#inp = "{\"sports\":\"positive\", \"football\":\"positive\", \"nflteams\":\"positive\"}"
#inp = "{\"food\":\"positive\", \"cheeseburger\":\"positive\", \"fry sauce\":\"positive\", \"mcdonalds\":\"positive\"}"
#inp = "{\"food\":\"positive\", \"cheeseburger\":\"negative\", \"chicken nuggets\":\"positive\", \"mcdonalds\":\"positive\"}"
#inp = "{\"movies\":\"positive\", \"sci-fi\":\"positive\", \"star wars\":\"positive\", \"darth vader\":\"positive\"}"
#inp = "{\"animals\":\"positive\", \"zoo\":\"positive\", \"pandas\":\"positive\"}"
#inp = "{\"sports\":\"positive\", \"basketball\":\"positive\"}"
inp = "{\"sports\":\"negative\", \"basketball\":\"negative\", \"music\":\"positive\", \"soccer\":\"negative\"}"
#inp = "{\"education\":\"positive\", \"universities\":\"positive\", \"virginia tech\":\"positive\", \"lifu huang\":\"positive\", \"computer science\":\"positive\"}"
prompt = f"Instruction: Generate only {num_sugg} similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {inp} In the generated answer, generate each of the suggested topics separated by a comma like so: TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.\nSuggested Topics:"
        
instruction = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

print('gen: ', generate_recommendation(prompt))
#print(generate_rec2("I like things."))

gen:  basketball,music appreciation,sports controversies


In [23]:
def CoT_to_Preference(cot):
    # (sports,yes)|(football team,yes)
    # "{\"sports\":\"positive\", \"football\":\"positive\"}"
    topics = cot.split('|')
    top_dict = {}
    for top in topics:
        top = top.replace('(', '')
        top = top.replace(')', '')
        the_top, pref = top.split(',')
        #print(pref)
        if pref == 'yes':
            pref = 'positive'
        elif pref == 'no':
            pref = 'negative'
        else:
            pref = 'unknown'
        top_dict[the_top] = pref
    return top_dict

print(CoT_to_Preference('(technology,yes)|(smartphone features,yes)'))

{'technology': 'positive', 'smartphone features': 'positive'}


In [27]:
# Validate output / shifting (using Amazon dataset I found)
output_file = open('./out_log_ext.txt', 'w')
with open('../topical_chat/Topical-Chat-master/conversations/train.json', 'r') as jsonfile:
    topical_chat_conversations = json.load(jsonfile)
    
    for idx in range(len(topical_chat_conversations.keys())):
        if idx == 10:
            break

        instance = topical_chat_conversations[list(topical_chat_conversations.keys())[idx]]['content']
        for x in instance:
            #print(x['message'], x['agent'])
            if x['agent'] == 'agent_2':
                # pass input into model
                cot_out = generate_cot(x['message'])
                cot_out = cot_out.strip()
                pref = CoT_to_Preference(cot_out)
                
                num_sugg = 3
                inp = pref
                prompt = f"Instruction: Generate only {num_sugg} similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {inp} In the generated answer, generate each of the suggested topics separated by a comma like so: TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.\nSuggested Topics:"
                sugg_topics = generate_recommendation(prompt)
                output_file.write(f"{x['message']}|{cot_out}|{pref}|{sugg_topics}\n\n")
                print(f"{x['message']}|{cot_out}|{pref}|{sugg_topics}")
            else:
                output_file.write(f"TARGET RESPONSE: {x['message']}\n")
                print(f"TARGET RESPONSE: {x['message']}\n")
    output_file.close()
    

TARGET RESPONSE: Are you a fan of Google or Microsoft?

Both are excellent technology they are helpful in many ways. For the security purpose both are super.|(technology,yes)|(smartphone features,yes)|{'technology': 'positive', 'smartphone features': 'positive'}|smartphone features,smartphone apps,smartwatches
TARGET RESPONSE: I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. 

Google provides online related services and products, which includes online ads, search engine and cloud computing.|(technology,yes)|(virtual reality,yes)|{'technology': 'positive', 'virtual reality': 'positive'}|virtual reality,ethics of VR,virtual reality experiences
TARGET RESPONSE: Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. 

Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.|(technology,yes)|(internet interest,yes)|{'technolo

ValueError: not enough values to unpack (expected 2, got 1)