In [7]:
#import neo4j_memory.neo4j_helper as neo4j_helper
import numpy as np
import torch
import json
from transformers import AutoTokenizer, AutoConfig,\
      T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForCausalLM
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Pass input into topic extraction

# Download the pretrained models (fetched from the Hugging Face hub on first use).

# Chain-of-topics generator: DIAL-BART0 architecture whose weights are then
# overwritten with a locally stored checkpoint.
cot_tokenizer = AutoTokenizer.from_pretrained("prakharz/DIAL-BART0")
cot_model = AutoModelForSeq2SeqLM.from_pretrained("prakharz/DIAL-BART0")
# map_location='cpu' lets a checkpoint that was saved from a GPU be deserialized
# on a machine without CUDA; load_state_dict then copies the tensors into the
# model's existing parameters, so the model's device is unchanged.
cot_state_dict = torch.load('../topic_extraction/model/topic_er2.pt', map_location='cpu')
cot_model.load_state_dict(cot_state_dict)

# Sentiment classifier; `config` supplies the id2label mapping used to name classes.
sent_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
sent_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
config = AutoConfig.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Instruction-following model used for topic extraction: the stock DIAL-BART0
# weights (no local checkpoint is applied to this copy).
inst_tokenizer = AutoTokenizer.from_pretrained("prakharz/DIAL-BART0")
inst_model = AutoModelForSeq2SeqLM.from_pretrained("prakharz/DIAL-BART0")

# chain of topics
def extract_topic_sentiment(text_in):
    """Extract a conversation topic and a sentiment label for one piece of text.

    The topic is generated by ``inst_model`` from an instruction-style prompt
    wrapped around ``text_in``. The sentiment comes from ``sent_model``; its
    class scores are printed in descending order as a side effect.

    Args:
        text_in: The text to analyse. Both model inputs are padded/truncated
            to 250 tokens.

    Returns:
        A ``(topic, sentiment)`` tuple: the decoded topic string and the
        ``config.id2label`` entry of the highest-scoring sentiment class.
    """
    # --- topic: prompt the instruction model and decode its first generated sequence
    instruct_input = "Instruction:What is the topic of conversation?\n\nInput:[CONTEXT]{}[ENDOFDIALOGUE][QUESTION]The topic of conversation is".format(text_in)
    topic_inputs = inst_tokenizer(instruct_input, max_length=250, padding='max_length', truncation=True, return_tensors='pt')
    topic_ids = inst_model.generate(**topic_inputs)
    topic = inst_tokenizer.decode(topic_ids[0], skip_special_tokens=True)

    # --- sentiment: plain forward pass; no gradients are needed for inference,
    # so skip building the autograd graph.
    sent_inputs = sent_tokenizer(text_in, max_length=250, padding='max_length', truncation=True, return_tensors='pt')
    with torch.no_grad():
        sent_out = sent_model(**sent_inputs)

    # First output element holds the logits; take the single example in the batch.
    scores = softmax(sent_out[0][0].numpy())

    # Class indices ordered from most to least probable.
    ranking = np.argsort(scores)[::-1]
    for rank, class_idx in enumerate(ranking, start=1):
        # Cast the numpy integer to a plain int before using it as a dict key.
        label = config.id2label[int(class_idx)]
        score = scores[class_idx]
        print(f"{rank}) {label} {np.round(float(score), 4)}")

    return topic, config.id2label[int(ranking[0])]

def generate_cot(text_in):
    """Generate the chain-of-topics (CoT) string for ``text_in`` with ``cot_model``.

    Args:
        text_in: Raw input text to tokenize and feed to the model.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = cot_tokenizer(text_in, return_tensors='pt')
    generated_ids = cot_model.generate(**encoded)
    return cot_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Smoke test: run both extractors on a single example sentence.
in_str = "My favorite football team is the Kansas City Chiefs."

# extract_topic_sentiment prints the ranked sentiment scores before returning.
topic, sent = extract_topic_sentiment(in_str)
dec_out = generate_cot(in_str)

print("CoT:{}, Topic:{}, Sent:{}".format(dec_out, topic, sent))



1) positive 0.9802
2) neutral 0.0167
3) negative 0.0032
CoT:(sports,yes)|(football team,yes)
, Topic:Playing football, Sent:positive


In [None]:
# Load the Topical-Chat training conversations and preview the first one.
# The encoding is given explicitly so the JSON is decoded as UTF-8 regardless of
# the platform's default locale encoding.
with open('./topical_chat/Topical-Chat-master/conversations/train.json', 'r', encoding='utf-8') as jsonfile:
    topical_chat_conversations = json.load(jsonfile)

# The file handle is no longer needed once the JSON is parsed, so the preview
# runs outside the `with` block. Dicts keep insertion order, so this picks the
# first conversation as it appears in the file.
first_conversation_id = next(iter(topical_chat_conversations))
instance = topical_chat_conversations[first_conversation_id]['content']

# Each turn is printed as "<message> <agent>".
for x in instance:
    print(x['message'], x['agent'])

# pass input into model

In [None]:
# Create Nodes for each topic in CoT

In [108]:
# Recommender model: t5-large architecture with weights replaced by a local checkpoint.
recommender_tokenizer = AutoTokenizer.from_pretrained("t5-large")
recommender_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
# map_location='cpu' lets a checkpoint that was saved from a GPU be deserialized
# on a machine without CUDA; load_state_dict then copies the tensors into the
# model's existing parameters.
recommender_state_dict = torch.load('./model/rec_er.pt', map_location='cpu')
recommender_model.load_state_dict(recommender_state_dict)
# Inference only: put the model in evaluation mode.
recommender_model.eval()
print('done')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


done


In [109]:
def generate_recommendation(text_in):
    """Generate a recommendation string for the prompt ``text_in``.

    Args:
        text_in: Prompt text passed to ``recommender_tokenizer``.

    Returns:
        The first sequence produced by ``recommender_model.generate`` (capped at
        32 new tokens), decoded with special tokens removed.
    """
    encoded = recommender_tokenizer(text_in, return_tensors='pt')
    generated_ids = recommender_model.generate(**encoded, max_new_tokens=32)
    return recommender_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Input: CoT, All nodes that are 1 distance from current topic
    # Output: New suggested topic CoT
def generate_rec2(text_in):
    """Run a single forward pass of ``recommender_model`` on ``text_in``.

    The tokenized input is printed for inspection, and the input ids are also
    passed as ``labels``.

    Args:
        text_in: Text passed to ``recommender_tokenizer``.

    Returns:
        The raw object returned by calling ``recommender_model`` (not decoded).
    """
    encoded = recommender_tokenizer(text_in, return_tensors='pt')
    print(encoded)
    input_ids = encoded.input_ids
    return recommender_model(input_ids=input_ids, labels=input_ids)

In [122]:
# Earlier fixed-prompt variant (bracketed-answer format); not passed to the model below.
text_in = (
    'Instruction: Generate only 4 similar topics that could be suggested for new conversation '
    'that takes influence from but are not present in the following user profile: '
    '{"sports":"positive", "football":"positive", "nflteams":"positive"} '
    'In the generated answer, generate the suggested topic within brackets [SUGGESTEDTOPIC]\n'
    'Answer:'
)

# Number of topics requested in the prompt.
num_sugg = 4

# User profile: topic -> sentiment, serialized as a JSON-like string.
# Alternative profiles are kept below for quick manual switching.
#inp = "{\"sports\":\"positive\", \"football\":\"positive\", \"nflteams\":\"positive\"}"
#inp = "{\"food\":\"positive\", \"cheeseburger\":\"positive\", \"fry sauce\":\"positive\", \"mcdonalds\":\"positive\"}"
#inp = "{\"food\":\"positive\", \"cheeseburger\":\"negative\", \"chicken nuggets\":\"positive\", \"mcdonalds\":\"positive\"}"
inp = '{"movies":"positive", "sci-fi":"positive", "star wars":"positive", "darth vader":"positive"}'
#inp = "{\"animals\":\"positive\", \"zoo\":\"positive\", \"pandas\":\"positive\"}"
#inp = "{\"sports\":\"positive\", \"basketball\":\"positive\"}"
#inp = "{\"sports\":\"negative\", \"basketball\":\"negative\", \"music\":\"positive\", \"country\":\"positive\", \"soccer\":\"negative\", \"baseball\":\"negative\"}"
#inp = "{\"education\":\"positive\", \"universities\":\"positive\", \"virginia tech\":\"positive\", \"lifu huang\":\"positive\", \"computer science\":\"positive\"}"

# Prompt actually sent to the recommender: comma-separated answer format.
prompt = (
    f'Instruction: Generate only {num_sugg} similar topics that could be suggested for new conversation '
    f'that takes influence from but are not present in the following user profile: {inp} '
    'In the generated answer, generate each of the suggested topics separated by a comma like so: '
    'TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.\n'
    'Suggested Topics:'
)

# Chat-template wrapping of the same prompt; built here but not used by the call below.
instruction = '<|im_start|>user\n' + prompt + '<|im_end|>\n<|im_start|>assistant\n'

print('gen: ', generate_recommendation(prompt))
#print(generate_rec2("I like things."))

gen:  science fiction,fantasy films,sci-fi literature,star wars characters


In [None]:
# Validate output / shifting (using Amazon dataset I found)