# finetune llama2

In [16]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3"
import json
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import random
from tqdm import tqdm

In [3]:
dat = json.loads(open('../V2/evaluation/topical_chat/Topical-Chat/conversations/train.json', 'r').read())

In [4]:
def generate_cot(text_in, tok_in, mod_in):
    instruction = "Instruction: Generate a list of topics increasing in specificity to define the subject of conversation.\n"
    instruction += f"Input:{text_in}"
    formatted_prompt = (f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\nThe topics defining the input are:")
    tok_text = tok_in(formatted_prompt, return_tensors='pt').to('cuda:0')
    gen_text = mod_in.generate(**tok_text, max_new_tokens=60)
    dec_text = tok_in.decode(gen_text[0], skip_special_tokens=True)
    #print(dec_text)
    dec_text = re.search('```.*\n```', dec_text).group()[3:-4]

    return dec_text

In [6]:
cot_tokenizer = AutoTokenizer.from_pretrained("../V2/CoT/topic_extraction/hf_model_1b/")
cot_model = AutoModelForCausalLM.from_pretrained("../V2/CoT/topic_extraction/hf_model_1b/")
cot_model.to('cuda:0')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32003, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [18]:
print(dat['t_bde29ce2-4153-4056-9eb7-f4ad710505fe']['content'])
print(len(dat['t_bde29ce2-4153-4056-9eb7-f4ad710505fe']['content']))

training_lines = []

for key in tqdm(dat):
    curr_lines = []

    for i in range(len(dat[key]['content'])-1):
        inst = dat[key]['content'][i]
        next = dat[key]['content'][i+1]
        curr_lines.append(f"{inst['agent']}:{inst['message']}")

        # generate guideline
        # grab a topic from the next target (next) message
        target_xtract = generate_cot(next['message'], cot_tokenizer, cot_model).strip().split('|')[0].replace('(', '').replace(')', '').split(',')
        # generate 2 topics from random indices in the conversation
        placeholder_xtract = '|'.join([generate_cot(curr_lines[random.randint(0, len(curr_lines)-1)].split(':')[-1], cot_tokenizer, cot_model).strip() for i in range(2)])

        if target_xtract[1] == 'yes':
            if next['agent'] == 'agent_1':
                # tpref = 'person2 likes'
                tpref = 'agent_2 likes'
            else:
                # tpref = 'person1 likes'
                tpref = 'agent_1 likes'
        elif target_xtract[1] == 'no':
            if next['agent'] == 'agent_1':
                # tpref = 'person2 dislikes'
                tpref = 'agent_2 dislikes'
            else:
                # tpref = 'person1 dislikes'
                tpref = 'agent_1 dislikes'
        else:
            if next['agent'] == 'agent_1':
                # tpref = 'It is unclear if the person 1 likes or dislikes'
                tpref = 'It is unclear if the agent_1 likes or dislikes'
            else:
                # tpref = 'It is unclear if the person 2 likes or dislikes'
                tpref = 'It is unclear if the agent_2 likes or dislikes'

        topic_recs = []
        topic_recs.append(target_xtract[0])
        # print(placeholder_xtract)
        for inst in placeholder_xtract.split('|'):
            inst = inst.replace('(', '').replace(')', '').split(',')
            #for subinst in inst:
            if inst[1] == 'yes' and inst[0] not in topic_recs:
                topic_recs.append(inst[0])
            
            if len(topic_recs) == 3:
                break
        
        if len(topic_recs) < 3:
            topic_recs.append(placeholder_xtract.split('|')[0].replace('(', '').replace(')', '').split(',')[0])
        
        if len(topic_recs) < 3:
            topic_recs.append(placeholder_xtract.split('|')[0].replace('(', '').replace(')', '').split(',')[1])

        guideline = f'{tpref} {target_xtract[0]}. {next["agent"]}\'s response should fall into one of the following 3 topics: {topic_recs}.'
        # print(guideline)
        training_lines.append(f"{' '.join(curr_lines)}\t\t{next['agent']}:{next['message']}\t\t{guideline}")

    # write lines to file
    df = pd.read_csv(StringIO('\n'.join(training_lines)), sep='\t\t', header=None)
    train, test = train_test_split(df, test_size=0.2)
    train, _ = train_test_split(train, test_size=0.9)
    test, _ = train_test_split(test, test_size=0.9)
    test = test.dropna()
    train = train.dropna()
    test.to_csv('./lora_ft_test2.csv', sep='\t')
    train.to_csv('./lora_ft_train2.csv', sep='\t')

[{'message': 'Are you a fan of Google or Microsoft?', 'agent': 'agent_1', 'sentiment': 'Curious to dive deeper', 'knowledge_source': ['FS1'], 'turn_rating': 'Good'}, {'message': 'Both are excellent technology they are helpful in many ways. For the security purpose both are super.', 'agent': 'agent_2', 'sentiment': 'Curious to dive deeper', 'knowledge_source': ['FS1'], 'turn_rating': 'Excellent'}, {'message': "I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. ", 'agent': 'agent_1', 'sentiment': 'Curious to dive deeper', 'knowledge_source': ['FS1'], 'turn_rating': 'Good'}, {'message': 'Google provides online related services and products, which includes online ads, search engine and cloud computing.', 'agent': 'agent_2', 'sentiment': 'Curious to dive deeper', 'knowledge_source': ['FS1'], 'turn_rating': 'Excellent'}, {'message': "Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. "

  0%|          | 9/8628 [10:13<163:04:45, 68.12s/it]


KeyboardInterrupt: 

In [43]:
print(len(training_lines))
# print(training_lines)
df = pd.read_csv(StringIO('\n'.join(training_lines)), sep='\t\t', header=None)

179750


  df = pd.read_csv(StringIO('\n'.join(training_lines)), sep='\t\t', header=None)


In [44]:
df.head()

Unnamed: 0,0,1
0,agent_1:Are you a fan of Google or Microsoft?,agent_2:Both are excellent technology they are...
1,agent_1:Are you a fan of Google or Microsoft? ...,"agent_1:I'm not a huge fan of Google, but I u..."
2,agent_1:Are you a fan of Google or Microsoft? ...,agent_2:Google provides online related service...
3,agent_1:Are you a fan of Google or Microsoft? ...,"agent_1:Yeah, their services are good. I'm jus..."
4,agent_1:Are you a fan of Google or Microsoft? ...,agent_2:Google is leading the alphabet subsidi...


In [53]:
train, test = train_test_split(df, test_size=0.2)
train, _ = train_test_split(train, test_size=0.9)
test, _ = train_test_split(test, test_size=0.9)
test = test.dropna()
train = train.dropna()
test.to_csv('./lora_ft_test2.csv', sep='\t')
train.to_csv('./lora_ft_train2.csv', sep='\t')

In [51]:
test.head()

Unnamed: 0,0,1
113133,"agent_1:Wow, really!! I can see that happening...",agent_1:You too! Have a good night!
185291,agent_1:Hello. Did you know the NFL doesn't ha...,agent_2:Maybe there are some videos of Bruce L...
96462,"agent_1:Hi, what is your favorite car? agent_2...",agent_2:i have a honda and love it although it...
65270,"agent_1:Hi there, do you use your phone much? ...",agent_1:Hahah I guess that speaks to all of us...
25468,"agent_1:Do you like basketball, swimming or ho...",agent_1:Nice. Do you like to watch the olympics?


# finetune vicuna

In [1]:
import json
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split

In [2]:
dat = json.loads(open('../V2/evaluation/topical_chat/Topical-Chat/conversations/train.json', 'r').read())

In [3]:
training_lines = []

for key in dat:
    curr_lines = []

    for i in range(len(dat[key]['content'])-1):
        inst = dat[key]['content'][i]
        next = dat[key]['content'][i+1]
        curr_lines.append(f"{inst['agent']}:{inst['message']}")
        if len(curr_lines) == 3:
            training_lines.append(f"{' '.join(curr_lines)}\t\t{next['agent']}:{next['message']}")
            curr_lines = curr_lines[1:]

In [5]:
print(len(training_lines))
df = pd.read_csv(StringIO('\n'.join(training_lines)), sep='\t\t', header=None)

162494


  df = pd.read_csv(StringIO('\n'.join(training_lines)), sep='\t\t', header=None)


In [6]:
df.head()

Unnamed: 0,0,1
0,agent_1:Are you a fan of Google or Microsoft? ...,agent_2:Google provides online related service...
1,agent_2:Both are excellent technology they are...,"agent_1:Yeah, their services are good. I'm jus..."
2,"agent_1:I'm not a huge fan of Google, but I u...",agent_2:Google is leading the alphabet subsidi...
3,agent_2:Google provides online related service...,agent_1:Did you know Google had hundreds of li...
4,"agent_1:Yeah, their services are good. I'm jus...",


In [7]:
train, test = train_test_split(df, test_size=0.2)
train, _ = train_test_split(train, test_size=0.9)
test, _ = train_test_split(test, test_size=0.9)
test = test.dropna()
train = train.dropna()
test.to_csv('./lora_ft_test_vicuna.csv', sep='\t')
train.to_csv('./lora_ft_train_vicuna.csv', sep='\t')