In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TrainingArguments, pipeline
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from trl import SFTTrainer
from datasets import load_dataset
import argparse
import pandas as pd
from scipy import spatial
#import gensim
import gensim.downloader as api
import numpy as np
import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
    PyTorch 2.1.2+cu121 with CUDA 1201 (you have 2.2.2+cu121)
    Python  3.9.18 (you have 3.9.16)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
  torch.utils._pytree._register_pytree_node(


In [2]:
def prompt_instruction_format_recommend(sample, tokenizer, num_sugg=3):
    """Build the chat-formatted inference prompt for the topic recommender.

    Args:
        sample: Mapping-like record (e.g. a DataFrame row) whose 'profile'
            entry holds the user's topic preferences.
        tokenizer: Object exposing `apply_chat_template`, such as a Hugging
            Face tokenizer that ships a chat template.
        num_sugg: Number of topics the model is asked to suggest. Defaults to
            3, the value that used to be hard-coded in the body.

    Returns:
        The value of `tokenizer.apply_chat_template(...)` called with
        `tokenize=False` and `add_generation_prompt=True`. Only the system
        and user turns are supplied; no assistant turn is included.
    """
    messages = [
        {
            "role": "system",
            "content": "Your goal is to recommend new topics of conversation based on a user's preferences towards topics.",
        },
        {
            "role": "user",
            "content": f"Generate only {num_sugg} similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {sample['profile']} In the generated answer, generate each of the suggested topics separated by a comma like so: TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.",
        },
    ]

    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def prompt_instruction_format_xtract(sample, tokenizer):
    """Build the chat-formatted inference prompt for topic extraction.

    Args:
        sample: Mapping-like record whose 'utterance' entry is the
            conversation turn to analyse.
        tokenizer: Object exposing `apply_chat_template`.

    Returns:
        The value of `tokenizer.apply_chat_template(...)` for a system turn
        followed by a user turn, requested with `tokenize=False` and
        `add_generation_prompt=True`.
    """
    system_turn = dict(
        role="system",
        content="Your goal is to extract topics and the speaker's positive preference (yes, unknown, or no) towards the topic from a conversation turn.",
    )
    user_turn = dict(
        role="user",
        content="Generate a list of topics increasing in specificity to define the subject of conversation from this utterance: {}".format(sample['utterance']),
    )
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [3]:
# Locations of the held-out test splits; both files are tab-separated even
# though they carry a .csv extension.
RECOMMEND_TEST_PATH = '../CoT/recommender/recommender_data/recommend_test.csv'
XTRACT_TEST_PATH = '../CoT/topic_extraction/topic_xtract_data/xtract_test.csv'

recomm_test = pd.read_csv(RECOMMEND_TEST_PATH, sep='\t')
xtract_test = pd.read_csv(XTRACT_TEST_PATH, sep='\t')

# Preview the first rows of the extraction test set.
xtract_test.head()

Unnamed: 0.1,Unnamed: 0,utterance,topics
0,4766,"Not really, I just listen to the radio when I'...","(music,no)"
1,5540,"Not really, I'm not that into sports. What abo...","(sports,unknown)"
2,10715,"No, I haven't. What kind of food do they serve?","(food,yes)|(restaurant,yes)|(10th street resta..."
3,2241,"I have heard of that one, but I haven't read i...","(hobby,yes), yes)"
4,12650,Not really. I'm not very good at it.,"(food,no)|(cooking,no)"


In [4]:
# Display the recommender test set loaded above; the recorded output shows
# `profile` and `topics` columns.
recomm_test

Unnamed: 0.1,Unnamed: 0,profile,topics
0,225,"{""travel"":""positive"", ""road trips"":""positive"",...","road trip destinations,travel safety,road trip..."
1,194,"{""gardening"":""positive"", ""vertical gardening"":...","vertical gardens,soil testing,urban gardening,..."
2,3,"{""technology"":""positive"", ""gadgets"":""positive""...","tech innovations,app development"
3,154,"{""art"":""positive"", ""abstract art"":""positive"", ...","contemporary art,art criticism,art accessibili..."
4,187,"{""sports"":""positive"", ""basketball"":""positive"",...","NBA,sports betting,basketball techniques,baske..."
5,153,"{""history"":""positive"", ""Renaissance art"":""posi...","art history,heist stories,art recovery,famous ..."
6,363,"{""horror movies"":""negative"", ""video games"":""ne...","nutrition tips,healthy recipes,fitness challenges"
7,185,"{""music"":""positive"", ""folk music"":""positive"", ...","folklore,cultural issues,folk festivals,folk i..."
8,164,"{""technology"":""positive"", ""blockchain technolo...","blockchain applications,online security,digita..."
9,78,"{""fitness"":""positive"", ""swimming"":""positive"", ...","swimming techniques,pool hygiene"


In [5]:
# Pretrained word-embedding model fetched through gensim's downloader API.
# NOTE: despite the variable name `glove`, the model requested here is
# 'word2vec-google-news-300' (word2vec, not GloVe); the name is kept because
# other cells refer to it.
glove = api.load('word2vec-google-news-300') #choose from multiple models https://github.com/RaRe-Technologies/gensim-data

In [6]:
def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call `func(*args, **kwargs)` and return its result, or None on failure.

    Any `Exception` raised by `func` is suppressed and None is returned.

    NOTE: `handle` is accepted for signature compatibility but is never
    invoked; failures always yield None regardless of the handler passed.
    """
    outcome = None
    try:
        outcome = func(*args, **kwargs)
    except Exception:
        # Best-effort call: failures are deliberately swallowed.
        pass
    return outcome

def preprocess(s):
    """Split `s` on whitespace and return the lower-cased tokens as a list."""
    tokens = []
    for word in s.split():
        tokens.append(word.lower())
    return tokens

def get_vector(s):
    """Embed a phrase as the sum of the vectors of its in-vocabulary words.

    Args:
        s: Phrase to embed. It is lower-cased and whitespace-tokenised by
            `preprocess` before lookup.

    Returns:
        A 1-D numpy array holding the element-wise sum of the word vectors
        found in the module-level `glove` model. When none of the words are
        in the vocabulary, an all-zero vector is returned; note that cosine
        similarity against a zero vector is undefined.
    """
    # Skip out-of-vocabulary words with an explicit membership test instead
    # of swallowing every exception raised during the lookup.
    vecs = [glove[word] for word in preprocess(s) if word in glove]
    if not vecs:
        # Size the fallback from the loaded model, defaulting to the 300
        # dimensions previously hard-coded here.
        return np.zeros(getattr(glove, 'vector_size', 300))
    return np.sum(np.array(vecs), axis=0)

# Eval 7b recommend model

In [7]:
# Load the fine-tuned recommender checkpoint and wrap it in a text-generation
# pipeline.
llm_path = '../CoT/recommender/hf_output/checkpoint-220'

# The tokenizer reuses EOS as its padding token and pads on the right.
llm_tokenizer = AutoTokenizer.from_pretrained(llm_path)
llm_tokenizer.pad_token = llm_tokenizer.eos_token
llm_tokenizer.padding_side = 'right'

# `.to` returns the module itself, so loading and device placement are chained.
llm_model = AutoModelForCausalLM.from_pretrained(llm_path).to('cuda:0')

pipeline_kwargs = dict(
    model=llm_model,
    tokenizer=llm_tokenizer,
    torch_dtype=torch.float32,
    device=0,
)
llm_pipeline = pipeline('text-generation', **pipeline_kwargs)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.61s/it]


In [12]:
# Evaluate the 7b recommender: sample suggestions for every test profile and
# score them against the gold topics with embedding-based soft matching.
#
# NOTE(review): `score` averages, over the gold topics, the best similarity to
# any suggestion, while `score2` averages over the suggestions. Those are
# conventionally called recall and precision respectively, i.e. the opposite
# of the labels used below. The labels are left untouched so numbers stay
# comparable with the other evaluation cells; F1 is symmetric and unaffected.
# NOTE(review): generation samples (do_sample=True) and no seed is set in this
# cell, so scores can differ between runs.
score = 0
score2 = 0
# `with` closes the results file even if generation fails part-way through.
with open('./aa_7b.tsv', 'w') as rec_file:
    # One column name per field written for each row below.
    rec_file.write("topics\ttargets\tresponse\tprecision\trecall\tf1\n")
    for _, inst in tqdm(recomm_test.iterrows(), total=len(recomm_test)):
        prompt = prompt_instruction_format_recommend(inst, llm_tokenizer)
        outputs = llm_pipeline(prompt, max_new_tokens=100, do_sample=True,
                               temperature=0.85, top_k=50, top_p=0.95)
        # Keep only the text after the instruction block, then split into topics.
        response = outputs[0]['generated_text'].split('[/INST]')[-1].strip().split(',')
        targets = inst['topics'].split(',')

        # Embed each phrase once; sims[t][r] is the cosine similarity between
        # gold topic t and suggestion r.
        response_vecs = [get_vector(y_hat) for y_hat in response]
        target_vecs = [get_vector(y) for y in targets]
        sims = [[1 - spatial.distance.cosine(r_vec, t_vec) for r_vec in response_vecs]
                for t_vec in target_vecs]

        avg_pred_trg = np.average([np.max(row) for row in sims])        # best match per gold topic
        avg_pred_trg2 = np.average([np.max(col) for col in zip(*sims)])  # best match per suggestion
        score += avg_pred_trg
        score2 += avg_pred_trg2

        # Guard the harmonic mean against a zero denominator.
        denom = avg_pred_trg + avg_pred_trg2
        f1 = (2 * avg_pred_trg * avg_pred_trg2) / denom if denom else 0.0
        rec_file.write(f"{inst['topics']}\t{targets}\t{response}\t{avg_pred_trg}\t{avg_pred_trg2}\t{f1}\n")

mean_precision = score / len(recomm_test)
mean_recall = score2 / len(recomm_test)
mean_total = mean_precision + mean_recall
print('precision: ', mean_precision)
print('recall: ', mean_recall)
print('f1: ', (2 * mean_precision * mean_recall) / mean_total if mean_total else 0.0)

  dist = 1.0 - uv / math.sqrt(uu * vv)
100%|██████████| 40/40 [00:53<00:00,  1.35s/it]

precision:  0.7241904125213284
recall:  0.7516811736508903
f1:  0.7376797606661606





# Eval 1b recommend model

In [13]:
def generate_recommendation(xtract_prof, tok_in, mod_in, num_sugg=3, max_new_tokens=100):
    """Generate topic suggestions for a user profile with a plain-prompt model.

    Args:
        xtract_prof: User profile; interpolated into the prompt via its string
            form.
        tok_in: Tokenizer. Called on the prompt with `return_tensors='pt'` and
            later used to decode the generated ids.
        mod_in: Model exposing `generate`.
        num_sugg: Number of topics requested in the prompt. Defaults to 3.
        max_new_tokens: Generation budget forwarded to `generate`. Defaults
            to 100.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    prompt = f"Instruction: Generate only {num_sugg} similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {xtract_prof} In the generated answer, generate each of the suggested topics separated by a comma like so: TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.\nSuggested Topics:"
    # Put the inputs on whatever device the model lives on; fall back to the
    # previously hard-coded 'cuda:0' when the model does not expose `.device`.
    device = getattr(mod_in, 'device', 'cuda:0')
    tok_text = tok_in(prompt, return_tensors='pt').to(device)
    # No sampling arguments are passed, so the model's default decoding is used.
    gen_text = mod_in.generate(**tok_text, max_new_tokens=max_new_tokens)
    return tok_in.decode(gen_text[0], skip_special_tokens=True)

In [15]:
# Load the 1b recommender, which is a sequence-to-sequence model. No
# text-generation pipeline is built in this cell.
llm_path = '../CoT/recommender/hf_model_1b/'

# The tokenizer reuses EOS as its padding token and pads on the right.
llm_tokenizer = AutoTokenizer.from_pretrained(llm_path)
llm_tokenizer.pad_token = llm_tokenizer.eos_token
llm_tokenizer.padding_side = 'right'

# `.to` returns the module itself, so loading and device placement are chained.
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_path).to('cuda:0')

In [17]:
# Evaluate the 1b recommender on every test profile and score its suggestions
# against the gold topics with embedding-based soft matching.
#
# NOTE(review): `score` averages, over the gold topics, the best similarity to
# any suggestion, while `score2` averages over the suggestions. Those are
# conventionally called recall and precision respectively, i.e. the opposite
# of the labels used below. The labels are left untouched so numbers stay
# comparable with the other evaluation cells; F1 is symmetric and unaffected.
score = 0
score2 = 0
# `with` closes the results file even if generation fails part-way through.
with open('./aa_1b.tsv', 'w') as rec_file:
    # One column name per field written for each row below.
    rec_file.write("topics\ttargets\tresponse\tprecision\trecall\tf1\n")
    for _, inst in tqdm(recomm_test.iterrows(), total=len(recomm_test)):
        # NOTE(review): the profile is split into a list, so the prompt
        # receives the list's Python repr rather than the raw profile string;
        # confirm this matches the format the model was trained on.
        prof = inst['profile'].split(',')
        response = generate_recommendation(prof, llm_tokenizer, llm_model).strip().split(',')
        targets = inst['topics'].split(',')

        # Embed each phrase once; sims[t][r] is the cosine similarity between
        # gold topic t and suggestion r.
        response_vecs = [get_vector(y_hat) for y_hat in response]
        target_vecs = [get_vector(y) for y in targets]
        sims = [[1 - spatial.distance.cosine(r_vec, t_vec) for r_vec in response_vecs]
                for t_vec in target_vecs]

        avg_pred_trg = np.average([np.max(row) for row in sims])        # best match per gold topic
        avg_pred_trg2 = np.average([np.max(col) for col in zip(*sims)])  # best match per suggestion
        score += avg_pred_trg
        score2 += avg_pred_trg2

        # Guard the harmonic mean against a zero denominator.
        denom = avg_pred_trg + avg_pred_trg2
        f1 = (2 * avg_pred_trg * avg_pred_trg2) / denom if denom else 0.0
        rec_file.write(f"{inst['topics']}\t{targets}\t{response}\t{avg_pred_trg}\t{avg_pred_trg2}\t{f1}\n")

mean_precision = score / len(recomm_test)
mean_recall = score2 / len(recomm_test)
mean_total = mean_precision + mean_recall
print('precision: ', mean_precision)
print('recall: ', mean_recall)
print('f1: ', (2 * mean_precision * mean_recall) / mean_total if mean_total else 0.0)

100%|██████████| 40/40 [00:11<00:00,  3.38it/s]

precision:  0.724823691429133
recall:  0.7834936854952763
f1:  0.7530176261578174





# Eval 7b extract model

In [7]:
# Load the fine-tuned topic-extraction checkpoint and wrap it in a
# text-generation pipeline. This rebinds llm_path, llm_model, llm_tokenizer
# and llm_pipeline.
llm_path = '../CoT/topic_extraction/hf_output/checkpoint-3534'

# The tokenizer reuses EOS as its padding token and pads on the right.
llm_tokenizer = AutoTokenizer.from_pretrained(llm_path)
llm_tokenizer.pad_token = llm_tokenizer.eos_token
llm_tokenizer.padding_side = 'right'

# `.to` returns the module itself, so loading and device placement are chained.
llm_model = AutoModelForCausalLM.from_pretrained(llm_path).to('cuda:0')

pipeline_kwargs = dict(
    model=llm_model,
    tokenizer=llm_tokenizer,
    torch_dtype=torch.float32,
    device=0,
)
llm_pipeline = pipeline('text-generation', **pipeline_kwargs)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]


In [8]:
# Evaluate the 7b topic extractor: sample an extraction for every test
# utterance and score it against the gold topics with embedding-based soft
# matching.
#
# NOTE(review): `score` averages, over the gold topics, the best similarity to
# any predicted topic, while `score2` averages over the predictions. Those are
# conventionally called recall and precision respectively, i.e. the opposite
# of the labels used below. The labels are left untouched so numbers stay
# comparable with the other evaluation cells; F1 is symmetric and unaffected.
# NOTE(review): generation samples (do_sample=True) and no seed is set in this
# cell, so scores can differ between runs.
score = 0
score2 = 0
# Tuple/list punctuation is blanked out so only words reach the embedding
# lookup; this is a single-pass equivalent of replacing each of , ( ) [ ] with
# a space.
punct_to_space = str.maketrans({ch: ' ' for ch in ',()[]'})
# `with` closes the results file even if generation fails part-way through.
with open('./aa_7b_x_1178.tsv', 'w') as rec_file:
    # One column name per field written for each row below.
    rec_file.write("topics\ttargets\tresponse\tprecision\trecall\tf1\n")
    for _, inst in tqdm(xtract_test.iterrows(), total=len(xtract_test)):
        prompt = prompt_instruction_format_xtract(inst, llm_tokenizer)
        outputs = llm_pipeline(prompt, max_new_tokens=100, do_sample=True,
                               temperature=0.85, top_k=50, top_p=0.95)
        # Keep only the text after the instruction block; entries are '|'-separated.
        generated = outputs[0]['generated_text'].split('[/INST]')[-1].strip()
        response = [r.translate(punct_to_space) for r in generated.split('|')]
        targets = [t.translate(punct_to_space) for t in inst['topics'].split('|')]

        # Embed each phrase once; sims[t][r] is the cosine similarity between
        # gold entry t and predicted entry r.
        response_vecs = [get_vector(y_hat) for y_hat in response]
        target_vecs = [get_vector(y) for y in targets]
        sims = [[1 - spatial.distance.cosine(r_vec, t_vec) for r_vec in response_vecs]
                for t_vec in target_vecs]

        avg_pred_trg = np.average([np.max(row) for row in sims])        # best match per gold entry
        avg_pred_trg2 = np.average([np.max(col) for col in zip(*sims)])  # best match per prediction
        score += avg_pred_trg
        score2 += avg_pred_trg2

        # Guard the harmonic mean against a zero denominator.
        denom = avg_pred_trg + avg_pred_trg2
        f1 = (2 * avg_pred_trg * avg_pred_trg2) / denom if denom else 0.0
        rec_file.write(f"{inst['topics']}\t{targets}\t{response}\t{avg_pred_trg}\t{avg_pred_trg2}\t{f1}\n")

mean_precision = score / len(xtract_test)
mean_recall = score2 / len(xtract_test)
mean_total = mean_precision + mean_recall
print('precision: ', mean_precision)
print('recall: ', mean_recall)
print('f1: ', (2 * mean_precision * mean_recall) / mean_total if mean_total else 0.0)

100%|██████████| 1335/1335 [17:06<00:00,  1.30it/s]

precision:  0.7607818071688531
recall:  0.7660989008489488
f1:  0.7634310960868332





# Eval 1b extract model

In [None]:
def generate_cot(text_in, tok_in, mod_in):
    """Generate the topic list for one utterance and extract it from the output.

    Args:
        text_in: Utterance to analyse; interpolated into the prompt.
        tok_in: Tokenizer. Called on the prompt with `return_tensors='pt'` and
            later used to decode the generated ids.
        mod_in: Model exposing `generate`.

    Returns:
        The text found between an opening ``` fence and a closing ``` fence on
        the following line of the decoded output. When the output contains no
        such fenced span, the text following the answer prefix (or the whole
        decoded text if the prefix is absent), stripped of surrounding
        whitespace, is returned instead of raising.
    """
    answer_prefix = "The topics defining the input are:"
    instruction = "Instruction: Generate a list of topics increasing in specificity to define the subject of conversation.\n"
    instruction += f"Input:{text_in}"
    formatted_prompt = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{answer_prefix}"
    # Put the inputs on whatever device the model lives on; fall back to the
    # previously hard-coded 'cuda:0' when the model does not expose `.device`.
    device = getattr(mod_in, 'device', 'cuda:0')
    tok_text = tok_in(formatted_prompt, return_tensors='pt').to(device)
    gen_text = mod_in.generate(**tok_text, max_new_tokens=60)
    dec_text = tok_in.decode(gen_text[0], skip_special_tokens=True)

    fenced = re.search('```.*\n```', dec_text)
    if fenced is None:
        # A missing fenced block must not abort a whole evaluation run with an
        # AttributeError on None; fall back to the raw answer text.
        return dec_text.split(answer_prefix)[-1].strip()
    # Drop the leading "```" and the trailing "\n```".
    return fenced.group()[3:-4]

In [None]:
# Load the 1b topic-extraction model and its tokenizer from the same
# directory, rebinding llm_tokenizer and llm_model.
llm_tokenizer = AutoTokenizer.from_pretrained("../CoT/topic_extraction/hf_model_1b/")
llm_model = AutoModelForCausalLM.from_pretrained("../CoT/topic_extraction/hf_model_1b/")
# Move the model to the first visible GPU; as the cell's final expression this
# also displays the value returned by `.to`.
llm_model.to('cuda:0')

In [None]:
# Evaluate the 1b topic extractor on every test utterance and score its
# output against the gold topics with embedding-based soft matching.
#
# NOTE(review): `score` averages, over the gold topics, the best similarity to
# any predicted topic, while `score2` averages over the predictions. Those are
# conventionally called recall and precision respectively, i.e. the opposite
# of the labels used below. The labels are left untouched so numbers stay
# comparable with the other evaluation cells; F1 is symmetric and unaffected.
score = 0
score2 = 0
# Tuple/list punctuation is blanked out so only words reach the embedding
# lookup; this is a single-pass equivalent of replacing each of , ( ) [ ] with
# a space.
punct_to_space = str.maketrans({ch: ' ' for ch in ',()[]'})
# `with` closes the results file even if generation fails part-way through.
with open('./aa_1b_x.tsv', 'w') as rec_file:
    # One column name per field written for each row below; the first field
    # in this cell is the input utterance.
    rec_file.write("utterance\ttargets\tresponse\tprecision\trecall\tf1\n")
    for _, inst in tqdm(xtract_test.iterrows(), total=len(xtract_test)):
        generated = generate_cot(inst['utterance'], llm_tokenizer, llm_model).strip()
        response = [r.translate(punct_to_space) for r in generated.split('|')]
        targets = [t.translate(punct_to_space) for t in inst['topics'].split('|')]

        # Embed each phrase once; sims[t][r] is the cosine similarity between
        # gold entry t and predicted entry r.
        response_vecs = [get_vector(y_hat) for y_hat in response]
        target_vecs = [get_vector(y) for y in targets]
        sims = [[1 - spatial.distance.cosine(r_vec, t_vec) for r_vec in response_vecs]
                for t_vec in target_vecs]

        avg_pred_trg = np.average([np.max(row) for row in sims])        # best match per gold entry
        avg_pred_trg2 = np.average([np.max(col) for col in zip(*sims)])  # best match per prediction
        score += avg_pred_trg
        score2 += avg_pred_trg2

        # Guard the harmonic mean against a zero denominator.
        denom = avg_pred_trg + avg_pred_trg2
        f1 = (2 * avg_pred_trg * avg_pred_trg2) / denom if denom else 0.0
        rec_file.write(f"{inst['utterance']}\t{targets}\t{response}\t{avg_pred_trg}\t{avg_pred_trg2}\t{f1}\n")

mean_precision = score / len(xtract_test)
mean_recall = score2 / len(xtract_test)
mean_total = mean_precision + mean_recall
print('precision: ', mean_precision)
print('recall: ', mean_recall)
print('f1: ', (2 * mean_precision * mean_recall) / mean_total if mean_total else 0.0)